From bb914f6a95b50d9e0b23a7d669e31998ecd33f79 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Wed, 3 Apr 2024 13:52:52 +1100 Subject: [PATCH 01/10] Querytee: Add metric to measure relative backend latency This metric gives us a measurement of how individual queries compare between two backends in terms of latency (or duration). --- tools/querytee/proxy_endpoint.go | 2 + tools/querytee/proxy_endpoint_test.go | 132 ++++++++++++++++++++++++++ tools/querytee/proxy_metrics.go | 7 ++ 3 files changed, 141 insertions(+) diff --git a/tools/querytee/proxy_endpoint.go b/tools/querytee/proxy_endpoint.go index e81a893b99f..4d89ad56fb8 100644 --- a/tools/querytee/proxy_endpoint.go +++ b/tools/querytee/proxy_endpoint.go @@ -224,6 +224,8 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba ) } + relativeDuration := expectedResponse.elapsedTime - actualResponse.elapsedTime + p.metrics.relativeDuration.WithLabelValues(req.Method, p.routeName).Observe(relativeDuration.Seconds()) p.metrics.responsesComparedTotal.WithLabelValues(p.routeName, string(result)).Inc() } } diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index 0b549f41b9e..cd5aac61412 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -446,6 +446,138 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { } } +func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { + scenarios := map[string]struct { + preferredResponseLatency time.Duration + secondaryResponseLatency time.Duration + expectedMetric string + }{ + "secondary backend is 2 seconds faster than preferred": { + preferredResponseLatency: 3 * time.Second, + secondaryResponseLatency: 1 * time.Second, + expectedMetric: (` + # HELP cortex_querytee_backend_request_relative_duration_seconds Time (in seconds) of preferred backend less secondary backend. 
+ # TYPE cortex_querytee_backend_request_relative_duration_seconds histogram + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-100"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-50"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-25"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-10"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-5"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-4"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-3"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-2"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1.5"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.75"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.5"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.25"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.1"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.05"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.025"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.01"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.005"} 0 + 
cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.005"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.01"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.025"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.05"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.1"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.25"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.5"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.75"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1.5"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="2"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="3"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="4"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="10"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="25"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="50"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="100"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="+Inf"} 1 + 
cortex_querytee_backend_request_relative_duration_seconds_sum{method="GET",route="test"} 2 + cortex_querytee_backend_request_relative_duration_seconds_count{method="GET",route="test"} 1 +`), + }, + "preferred backend is 5 seconds faster than secondary": { + preferredResponseLatency: 2 * time.Second, + secondaryResponseLatency: 7 * time.Second, + expectedMetric: (` + # HELP cortex_querytee_backend_request_relative_duration_seconds Time (in seconds) of preferred backend less secondary backend. + # TYPE cortex_querytee_backend_request_relative_duration_seconds histogram + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-100"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-50"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-25"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-10"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-4"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-3"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-2"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1.5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.75"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.25"} 1 + 
cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.1"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.05"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.025"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.01"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.005"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.005"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.01"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.025"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.05"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.1"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.25"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.75"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1.5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="2"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="3"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="4"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="5"} 
1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="10"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="25"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="50"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="100"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="+Inf"} 1 + cortex_querytee_backend_request_relative_duration_seconds_sum{method="GET",route="test"} -5 + cortex_querytee_backend_request_relative_duration_seconds_count{method="GET",route="test"} 1 +`), + }, + } + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + backends := []ProxyBackendInterface{ + newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency), + newMockProxyBackend("secondary-backend", time.Second, false, scenario.secondaryResponseLatency), + } + + logger := newMockLogger() + reg := prometheus.NewPedanticRegistry() + comparator := &mockComparator{ + comparisonResult: ComparisonSuccess, + } + + endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(reg), logger, comparator, 0) + + resp := httptest.NewRecorder() + req, err := http.NewRequest("GET", "http://test/api/v1/test", nil) + require.NoError(t, err) + endpoint.ServeHTTP(resp, req) + + // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. + // Wait for the response comparison to complete before checking the logged messages. 
+ waitForResponseComparisonMetric(t, reg, ComparisonSuccess) + + require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(scenario.expectedMetric), "cortex_querytee_backend_request_relative_duration_seconds")) + + }) + } +} + func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) { started := time.Now() timeoutAt := started.Add(2 * time.Second) diff --git a/tools/querytee/proxy_metrics.go b/tools/querytee/proxy_metrics.go index 019c9dea1d0..de2c53262e8 100644 --- a/tools/querytee/proxy_metrics.go +++ b/tools/querytee/proxy_metrics.go @@ -23,6 +23,7 @@ type ProxyMetrics struct { requestDuration *prometheus.HistogramVec responsesTotal *prometheus.CounterVec responsesComparedTotal *prometheus.CounterVec + relativeDuration *prometheus.HistogramVec } func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { @@ -43,6 +44,12 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { Name: "responses_compared_total", Help: "Total number of responses compared per route name by result.", }, []string{"route", "result"}), + relativeDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ + Namespace: queryTeeMetricsNamespace, + Name: "backend_request_relative_duration_seconds", + Help: "Time (in seconds) of preferred backend less secondary backend.", + Buckets: []float64{-100, -50, -25, -10, -5, -4, -3, -2, -1.5, -1, -.75, -.5, -.25, -.1, -.05, -.025, -.01, -.005, .005, .01, .025, .05, .1, .25, .5, 0.75, 1, 1.5, 2, 3, 4, 5, 10, 25, 50, 100}, + }, []string{"method", "route"}), } return m From c0c91f8237aa90a045940662c9c53451d96daa33 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Thu, 4 Apr 2024 17:34:15 +1100 Subject: [PATCH 02/10] Add proportional metric, and change to native histograms --- tools/querytee/proxy_endpoint.go | 4 +- tools/querytee/proxy_endpoint_test.go | 111 ++++++-------------------- tools/querytee/proxy_metrics.go | 17 ++-- 3 files changed, 
38 insertions(+), 94 deletions(-) diff --git a/tools/querytee/proxy_endpoint.go b/tools/querytee/proxy_endpoint.go index 4d89ad56fb8..62615f3457b 100644 --- a/tools/querytee/proxy_endpoint.go +++ b/tools/querytee/proxy_endpoint.go @@ -225,7 +225,9 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba } relativeDuration := expectedResponse.elapsedTime - actualResponse.elapsedTime - p.metrics.relativeDuration.WithLabelValues(req.Method, p.routeName).Observe(relativeDuration.Seconds()) + proportionalDuration := expectedResponse.elapsedTime.Seconds() / actualResponse.elapsedTime.Seconds() + p.metrics.relativeDuration.WithLabelValues(p.routeName).Observe(relativeDuration.Seconds()) + p.metrics.proportionalDuration.WithLabelValues(p.routeName).Observe(proportionalDuration) p.metrics.responsesComparedTotal.WithLabelValues(p.routeName, string(result)).Inc() } } diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index cd5aac61412..b78a90689c8 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -21,6 +21,7 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/atomic" @@ -455,96 +456,12 @@ func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { "secondary backend is 2 seconds faster than preferred": { preferredResponseLatency: 3 * time.Second, secondaryResponseLatency: 1 * time.Second, - expectedMetric: (` - # HELP cortex_querytee_backend_request_relative_duration_seconds Time (in seconds) of preferred backend less secondary backend. 
- # TYPE cortex_querytee_backend_request_relative_duration_seconds histogram - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-100"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-50"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-25"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-10"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-5"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-4"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-3"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-2"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1.5"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.75"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.5"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.25"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.1"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.05"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.025"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.01"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.005"} 0 - 
cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.005"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.01"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.025"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.05"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.1"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.25"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.5"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.75"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1.5"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="2"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="3"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="4"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="10"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="25"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="50"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="100"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="+Inf"} 1 - 
cortex_querytee_backend_request_relative_duration_seconds_sum{method="GET",route="test"} 2 - cortex_querytee_backend_request_relative_duration_seconds_count{method="GET",route="test"} 1 -`), + expectedMetric: (`todo`), }, "preferred backend is 5 seconds faster than secondary": { preferredResponseLatency: 2 * time.Second, secondaryResponseLatency: 7 * time.Second, - expectedMetric: (` - # HELP cortex_querytee_backend_request_relative_duration_seconds Time (in seconds) of preferred backend less secondary backend. - # TYPE cortex_querytee_backend_request_relative_duration_seconds histogram - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-100"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-50"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-25"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-10"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-4"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-3"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-2"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1.5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.75"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.25"} 1 - 
cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.1"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.05"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.025"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.01"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.005"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.005"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.01"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.025"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.05"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.1"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.25"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.75"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1.5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="2"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="3"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="4"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="5"} 
1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="10"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="25"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="50"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="100"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="+Inf"} 1 - cortex_querytee_backend_request_relative_duration_seconds_sum{method="GET",route="test"} -5 - cortex_querytee_backend_request_relative_duration_seconds_count{method="GET",route="test"} 1 -`), + expectedMetric: (`todo`), }, } @@ -572,12 +489,30 @@ func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { // Wait for the response comparison to complete before checking the logged messages. waitForResponseComparisonMetric(t, reg, ComparisonSuccess) - require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(scenario.expectedMetric), "cortex_querytee_backend_request_relative_duration_seconds")) - + //require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(scenario.expectedMetric), "cortex_querytee_backend_request_relative_duration_seconds")) + got, done, err := prometheus.ToTransactionalGatherer(reg).Gather() + defer done() + require.NoError(t, err, "Failed to gather metrics from registry") + got = filterMetrics(got, []string{"cortex_querytee_backend_request_relative_duration_seconds"}) + require.Equal(t, 1, len(got), "Expect only one metric after filtering") + require.Equal(t, uint64(1), got[0].Metric[0].Histogram.GetSampleCount(), "Expect only 1 sample") }) } } +func filterMetrics(metrics []*dto.MetricFamily, names []string) []*dto.MetricFamily { + var filtered []*dto.MetricFamily + for _, m := range metrics { + for _, name := range names { + if m.GetName() == name { + filtered = append(filtered, m) + break + } + } + 
} + return filtered +} + func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) { started := time.Now() timeoutAt := started.Add(2 * time.Second) diff --git a/tools/querytee/proxy_metrics.go b/tools/querytee/proxy_metrics.go index de2c53262e8..47b1b7afba0 100644 --- a/tools/querytee/proxy_metrics.go +++ b/tools/querytee/proxy_metrics.go @@ -24,6 +24,7 @@ type ProxyMetrics struct { responsesTotal *prometheus.CounterVec responsesComparedTotal *prometheus.CounterVec relativeDuration *prometheus.HistogramVec + proportionalDuration *prometheus.HistogramVec } func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { @@ -45,11 +46,17 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { Help: "Total number of responses compared per route name by result.", }, []string{"route", "result"}), relativeDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ - Namespace: queryTeeMetricsNamespace, - Name: "backend_request_relative_duration_seconds", - Help: "Time (in seconds) of preferred backend less secondary backend.", - Buckets: []float64{-100, -50, -25, -10, -5, -4, -3, -2, -1.5, -1, -.75, -.5, -.25, -.1, -.05, -.025, -.01, -.005, .005, .01, .025, .05, .1, .25, .5, 0.75, 1, 1.5, 2, 3, 4, 5, 10, 25, 50, 100}, - }, []string{"method", "route"}), + Namespace: queryTeeMetricsNamespace, + Name: "backend_request_relative_duration_seconds", + Help: "Time (in seconds) of preferred backend less secondary backend.", + NativeHistogramBucketFactor: 2, + }, []string{"route"}), + proportionalDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ + Namespace: queryTeeMetricsNamespace, + Name: "backend_request_relative_duration_proportional", + Help: "Proportional time (%) of preferred backend vs secondary backend.", + NativeHistogramBucketFactor: 2, + }, []string{"route"}), } return m From 79fd1e9ea4d28d8690e3bd623bb8035b7f0b241f Mon Sep 17 00:00:00 2001 From: 
Joshua Hesketh Date: Thu, 4 Apr 2024 20:25:04 +1100 Subject: [PATCH 03/10] Improve histogram unit tests Also test the proportional metric --- tools/querytee/proxy_endpoint_test.go | 181 +++++++++++++++++--------- 1 file changed, 121 insertions(+), 60 deletions(-) diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index b78a90689c8..20ad46239dc 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -335,7 +335,7 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. // Wait for the response comparison to complete before checking the logged messages. - waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult) + waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult, 1) switch scenario.expectedComparisonResult { case ComparisonSuccess: @@ -354,67 +354,81 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { scenarios := map[string]struct { slowResponseThreshold time.Duration - preferredResponseLatency time.Duration - secondaryResponseLatency time.Duration + latencyPairs []latencyPair expectLatencyExceedsThreshold bool fastestBackend string slowestBackend string }{ "responses are below threshold": { - slowResponseThreshold: 100 * time.Millisecond, - preferredResponseLatency: 1 * time.Millisecond, - secondaryResponseLatency: 1 * time.Millisecond, + slowResponseThreshold: 100 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 1 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, + }}, expectLatencyExceedsThreshold: false, }, "one response above threshold": { - slowResponseThreshold: 50 * time.Millisecond, - preferredResponseLatency: 1 * time.Millisecond, - secondaryResponseLatency: 70 * 
time.Millisecond, + slowResponseThreshold: 50 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 1 * time.Millisecond, + secondaryResponseLatency: 70 * time.Millisecond, + }}, expectLatencyExceedsThreshold: true, fastestBackend: "preferred-backend", slowestBackend: "secondary-backend", }, "responses are both above threshold, but lower than threshold between themselves": { - slowResponseThreshold: 50 * time.Millisecond, - preferredResponseLatency: 51 * time.Millisecond, - secondaryResponseLatency: 62 * time.Millisecond, + slowResponseThreshold: 50 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 51 * time.Millisecond, + secondaryResponseLatency: 62 * time.Millisecond, + }}, expectLatencyExceedsThreshold: false, }, "responses are both above threshold, and above threshold between themselves": { - slowResponseThreshold: 10 * time.Millisecond, - preferredResponseLatency: 11 * time.Millisecond, - secondaryResponseLatency: 52 * time.Millisecond, + slowResponseThreshold: 10 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 11 * time.Millisecond, + secondaryResponseLatency: 52 * time.Millisecond, + }}, expectLatencyExceedsThreshold: true, fastestBackend: "preferred-backend", slowestBackend: "secondary-backend", }, "secondary latency is faster than primary, and difference is below threshold": { - slowResponseThreshold: 50 * time.Millisecond, - preferredResponseLatency: 10 * time.Millisecond, - secondaryResponseLatency: 1 * time.Millisecond, + slowResponseThreshold: 50 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 10 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, + }}, expectLatencyExceedsThreshold: false, }, "secondary latency is faster than primary, and difference is above threshold": { - slowResponseThreshold: 50 * time.Millisecond, - preferredResponseLatency: 71 * time.Millisecond, - secondaryResponseLatency: 1 * 
time.Millisecond, + slowResponseThreshold: 50 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 71 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, + }}, expectLatencyExceedsThreshold: true, fastestBackend: "secondary-backend", slowestBackend: "preferred-backend", }, "slowest response threshold is disabled (0)": { - slowResponseThreshold: 0 * time.Millisecond, - preferredResponseLatency: 200 * time.Millisecond, - secondaryResponseLatency: 100 * time.Millisecond, + slowResponseThreshold: 0 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 200 * time.Millisecond, + secondaryResponseLatency: 100 * time.Millisecond, + }}, expectLatencyExceedsThreshold: false, }, } for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { + preferredLatencies, secondaryLatencies := splitLatencyPairs(scenario.latencyPairs) backends := []ProxyBackendInterface{ - newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency), - newMockProxyBackend("secondary-backend", time.Second, false, scenario.secondaryResponseLatency), + newMockProxyBackend("preferred-backend", time.Second, true, preferredLatencies), + newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies), } logger := newMockLogger() @@ -432,7 +446,7 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. // Wait for the response comparison to complete before checking the logged messages. 
- waitForResponseComparisonMetric(t, reg, ComparisonSuccess) + waitForResponseComparisonMetric(t, reg, ComparisonSuccess, uint64(len(scenario.latencyPairs))) if scenario.expectLatencyExceedsThreshold { requireLogKeyValues(t, logger.messages, map[string]string{ @@ -449,27 +463,42 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { scenarios := map[string]struct { - preferredResponseLatency time.Duration - secondaryResponseLatency time.Duration - expectedMetric string + latencyPairs []latencyPair + expectedSampleCount uint64 + expectedDurationSampleSum float64 + expectedProportionalSampleSum float64 }{ - "secondary backend is 2 seconds faster than preferred": { - preferredResponseLatency: 3 * time.Second, - secondaryResponseLatency: 1 * time.Second, - expectedMetric: (`todo`), + "secondary backend is faster than preferred": { + latencyPairs: []latencyPair{{ + preferredResponseLatency: 3 * time.Second, + secondaryResponseLatency: 1 * time.Second, + }, { + preferredResponseLatency: 5 * time.Second, + secondaryResponseLatency: 2 * time.Second, + }, + }, + expectedSampleCount: 2, + expectedDurationSampleSum: 5, + expectedProportionalSampleSum: (3.0/1 + 5.0/2), }, "preferred backend is 5 seconds faster than secondary": { - preferredResponseLatency: 2 * time.Second, - secondaryResponseLatency: 7 * time.Second, - expectedMetric: (`todo`), + latencyPairs: []latencyPair{{ + preferredResponseLatency: 2 * time.Second, + secondaryResponseLatency: 7 * time.Second, + }, + }, + expectedSampleCount: 1, + expectedDurationSampleSum: -5, + expectedProportionalSampleSum: (2.0 / 7), }, } for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { + preferredLatencies, secondaryLatencies := splitLatencyPairs(scenario.latencyPairs) backends := []ProxyBackendInterface{ - newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency), - newMockProxyBackend("secondary-backend", 
time.Second, false, scenario.secondaryResponseLatency), + newMockProxyBackend("preferred-backend", time.Second, true, preferredLatencies), + newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies), } logger := newMockLogger() @@ -483,19 +512,28 @@ func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { resp := httptest.NewRecorder() req, err := http.NewRequest("GET", "http://test/api/v1/test", nil) require.NoError(t, err) - endpoint.ServeHTTP(resp, req) + for range scenario.latencyPairs { + // This is just done in serial to keep the tests simple + endpoint.ServeHTTP(resp, req) + } // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. - // Wait for the response comparison to complete before checking the logged messages. - waitForResponseComparisonMetric(t, reg, ComparisonSuccess) + // Wait for the response comparison to complete for the number of requests we have + waitForResponseComparisonMetric(t, reg, ComparisonSuccess, scenario.expectedSampleCount) - //require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(scenario.expectedMetric), "cortex_querytee_backend_request_relative_duration_seconds")) got, done, err := prometheus.ToTransactionalGatherer(reg).Gather() defer done() require.NoError(t, err, "Failed to gather metrics from registry") + + gotDuration := filterMetrics(got, []string{"cortex_querytee_backend_request_relative_duration_seconds"}) + require.Equal(t, 1, len(gotDuration), "Expect only one metric after filtering") + require.Equal(t, scenario.expectedSampleCount, gotDuration[0].Metric[0].Histogram.GetSampleCount()) + require.Equal(t, 
scenario.expectedDurationSampleSum, gotDuration[0].Metric[0].Histogram.GetSampleSum()) + + gotProportional := filterMetrics(got, []string{"cortex_querytee_backend_request_relative_duration_proportional"}) + require.Equal(t, 1, len(gotProportional), "Expect only one metric after filtering") + require.Equal(t, scenario.expectedSampleCount, gotProportional[0].Metric[0].Histogram.GetSampleCount()) + require.Equal(t, scenario.expectedProportionalSampleSum, gotProportional[0].Metric[0].Histogram.GetSampleSum()) }) } } @@ -513,7 +551,7 @@ func filterMetrics(metrics []*dto.MetricFamily, names []string) []*dto.MetricFam return filtered } -func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) { +func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult, expectedCount uint64) { started := time.Now() timeoutAt := started.Add(2 * time.Second) @@ -521,8 +559,8 @@ func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expect expected := fmt.Sprintf(` # HELP cortex_querytee_responses_compared_total Total number of responses compared per route name by result. 
# TYPE cortex_querytee_responses_compared_total counter - cortex_querytee_responses_compared_total{result="%v",route="test"} 1 -`, expectedResult) + cortex_querytee_responses_compared_total{result="%v",route="test"} %d +`, expectedResult, expectedCount) err := testutil.GatherAndCompare(g, bytes.NewBufferString(expected), "cortex_querytee_responses_compared_total") if err == nil { @@ -699,19 +737,33 @@ func (m *mockLogger) Log(keyvals ...interface{}) error { return nil } +type latencyPair struct { + preferredResponseLatency time.Duration + secondaryResponseLatency time.Duration +} + +func splitLatencyPairs(latencyPairs []latencyPair) (preferredLatencies []time.Duration, secondaryLatencies []time.Duration) { + for _, pair := range latencyPairs { + preferredLatencies = append(preferredLatencies, pair.preferredResponseLatency) + secondaryLatencies = append(secondaryLatencies, pair.secondaryResponseLatency) + } + return +} + type mockProxyBackend struct { - name string - timeout time.Duration - preferred bool - fakeResponseLatency time.Duration + name string + timeout time.Duration + preferred bool + fakeResponseLatencies []time.Duration + responseIndex int } -func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatency time.Duration) ProxyBackendInterface { +func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatencies []time.Duration) ProxyBackendInterface { return &mockProxyBackend{ - name: name, - timeout: timeout, - preferred: preferred, - fakeResponseLatency: fakeResponseLatency, + name: name, + timeout: timeout, + preferred: preferred, + fakeResponseLatencies: fakeResponseLatencies, } } @@ -728,11 +780,20 @@ func (b *mockProxyBackend) Preferred() bool { } func (b *mockProxyBackend) ForwardRequest(_ *http.Request, _ io.ReadCloser) (time.Duration, int, []byte, *http.Response, error) { + statusCode := 200 + if b.responseIndex >= len(b.fakeResponseLatencies) { + statusCode = 500 + } resp := 
&http.Response{ - StatusCode: 200, + StatusCode: statusCode, Header: make(http.Header), Body: io.NopCloser(bytes.NewBufferString(`{}`)), } resp.Header.Set("Content-Type", "application/json") - return time.Duration(b.fakeResponseLatency), 200, []byte("{}"), resp, nil + if b.responseIndex >= len(b.fakeResponseLatencies) { + return 0, 500, []byte("{}"), resp, errors.New("no more latencies available") + } + latency := b.fakeResponseLatencies[b.responseIndex] + b.responseIndex++ + return latency, 200, []byte("{}"), resp, nil } From 9230e43d3b4acd70a021403f28f534a97d36a397 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Tue, 9 Apr 2024 14:56:19 +1000 Subject: [PATCH 04/10] Update tools/querytee/proxy_metrics.go Co-authored-by: Charles Korn --- tools/querytee/proxy_metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/querytee/proxy_metrics.go b/tools/querytee/proxy_metrics.go index 47b1b7afba0..5bc297d5d14 100644 --- a/tools/querytee/proxy_metrics.go +++ b/tools/querytee/proxy_metrics.go @@ -54,7 +54,7 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { proportionalDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ Namespace: queryTeeMetricsNamespace, Name: "backend_request_relative_duration_proportional", - Help: "Proportional time (%) of preferred backend vs secondary backend.", + Help: "Response time of preferred backend, as a proportion of secondary backend response time.", NativeHistogramBucketFactor: 2, }, []string{"route"}), } From 5b437ac0e4bfe8270812ee1bdd629f52450f1da6 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Tue, 9 Apr 2024 14:58:45 +1000 Subject: [PATCH 05/10] Tidy up test --- tools/querytee/proxy_endpoint_test.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index 20ad46239dc..47514676f82 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ 
b/tools/querytee/proxy_endpoint_test.go @@ -780,17 +780,14 @@ func (b *mockProxyBackend) Preferred() bool { } func (b *mockProxyBackend) ForwardRequest(_ *http.Request, _ io.ReadCloser) (time.Duration, int, []byte, *http.Response, error) { - statusCode := 200 - if b.responseIndex >= len(b.fakeResponseLatencies) { - statusCode = 500 - } resp := &http.Response{ - StatusCode: statusCode, + StatusCode: 200, Header: make(http.Header), Body: io.NopCloser(bytes.NewBufferString(`{}`)), } resp.Header.Set("Content-Type", "application/json") if b.responseIndex >= len(b.fakeResponseLatencies) { + resp.StatusCode = 500 return 0, 500, []byte("{}"), resp, errors.New("no more latencies available") } latency := b.fakeResponseLatencies[b.responseIndex] From acf0cc0ff9e89c865e33caeedbf92d9e0f2f5b4f Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Tue, 9 Apr 2024 15:14:41 +1000 Subject: [PATCH 06/10] Add changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7539640db68..337f13b68f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,9 @@ ### Query-tee * [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. 
#7346 +* [ENHANCEMENT] Add two new metrics for measuring the relative duration between backends: #7782 + * `cortex_querytee_backend_request_relative_duration_seconds` + * `cortex_querytee_backend_request_relative_duration_proportional` ### Documentation From 2b8da7063e8c5d7b2bef22ec6d7998411179ff39 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Tue, 9 Apr 2024 15:14:49 +1000 Subject: [PATCH 07/10] Add docs --- docs/sources/mimir/manage/tools/query-tee.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index 8ca1bb2e5b6..33dbd94adb9 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -163,6 +163,11 @@ cortex_querytee_responses_total{backend="",method="",route="}}) you can use the query-tee to compare rule evaluations too. From 55353dcd52088eb737d8974206f36250eee34e5f Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Sat, 20 Apr 2024 20:42:08 +0200 Subject: [PATCH 08/10] Generate doc --- docs/sources/mimir/manage/tools/query-tee.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index 33dbd94adb9..565082ef50a 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -165,8 +165,8 @@ cortex_querytee_responses_compared_total{route="",result="" Additionally, if backend results comparison is configured, two native-histograms are available: -* `cortex_querytee_backend_request_relative_duration_seconds`: Time (in seconds) of preferred backend less secondary backend. -* `cortex_querytee_backend_request_relative_duration_proportional`: Response time of preferred backend, as a proportion of secondary backend response time. +- `cortex_querytee_backend_request_relative_duration_seconds`: Time (in seconds) of preferred backend less secondary backend. 
+- `cortex_querytee_backend_request_relative_duration_proportional`: Response time of preferred backend, as a proportion of secondary backend response time. ### Ruler remote operational mode test From 36962be88b11b8ab2941663b3264dee227983e22 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Sat, 20 Apr 2024 21:21:39 +0200 Subject: [PATCH 09/10] Rename metric to be "response" --- CHANGELOG.md | 4 ++-- docs/sources/mimir/manage/tools/query-tee.md | 6 +++--- tools/querytee/proxy_endpoint_test.go | 4 ++-- tools/querytee/proxy_metrics.go | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 337f13b68f6..852fd4038f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,8 +44,8 @@ * [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. #7346 * [ENHANCEMENT] Add two new metrics for measuring the relative duration between backends: #7782 - * `cortex_querytee_backend_request_relative_duration_seconds` - * `cortex_querytee_backend_request_relative_duration_proportional` + * `cortex_querytee_backend_response_relative_duration_seconds` + * `cortex_querytee_backend_response_relative_duration_proportional` ### Documentation diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index 565082ef50a..471576c94b3 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -163,10 +163,10 @@ cortex_querytee_responses_total{backend="",method="",route=" Date: Sat, 20 Apr 2024 21:22:01 +0200 Subject: [PATCH 10/10] Don't use latencyPair in Test_ProxyEndpoint_LogSlowQueries --- tools/querytee/proxy_endpoint_test.go | 66 +++++++++++---------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index ea97d245cff..0ce1af5b379 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ 
b/tools/querytee/proxy_endpoint_test.go @@ -354,81 +354,67 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { scenarios := map[string]struct { slowResponseThreshold time.Duration - latencyPairs []latencyPair + preferredResponseLatency time.Duration + secondaryResponseLatency time.Duration expectLatencyExceedsThreshold bool fastestBackend string slowestBackend string }{ "responses are below threshold": { - slowResponseThreshold: 100 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 1 * time.Millisecond, - secondaryResponseLatency: 1 * time.Millisecond, - }}, + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 1 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, expectLatencyExceedsThreshold: false, }, "one response above threshold": { - slowResponseThreshold: 50 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 1 * time.Millisecond, - secondaryResponseLatency: 70 * time.Millisecond, - }}, + slowResponseThreshold: 50 * time.Millisecond, + preferredResponseLatency: 1 * time.Millisecond, + secondaryResponseLatency: 70 * time.Millisecond, expectLatencyExceedsThreshold: true, fastestBackend: "preferred-backend", slowestBackend: "secondary-backend", }, "responses are both above threshold, but lower than threshold between themselves": { - slowResponseThreshold: 50 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 51 * time.Millisecond, - secondaryResponseLatency: 62 * time.Millisecond, - }}, + slowResponseThreshold: 50 * time.Millisecond, + preferredResponseLatency: 51 * time.Millisecond, + secondaryResponseLatency: 62 * time.Millisecond, expectLatencyExceedsThreshold: false, }, "responses are both above threshold, and above threshold between themselves": { - slowResponseThreshold: 10 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 11 * 
time.Millisecond, - secondaryResponseLatency: 52 * time.Millisecond, - }}, + slowResponseThreshold: 10 * time.Millisecond, + preferredResponseLatency: 11 * time.Millisecond, + secondaryResponseLatency: 52 * time.Millisecond, expectLatencyExceedsThreshold: true, fastestBackend: "preferred-backend", slowestBackend: "secondary-backend", }, "secondary latency is faster than primary, and difference is below threshold": { - slowResponseThreshold: 50 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 10 * time.Millisecond, - secondaryResponseLatency: 1 * time.Millisecond, - }}, + slowResponseThreshold: 50 * time.Millisecond, + preferredResponseLatency: 10 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, expectLatencyExceedsThreshold: false, }, "secondary latency is faster than primary, and difference is above threshold": { - slowResponseThreshold: 50 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 71 * time.Millisecond, - secondaryResponseLatency: 1 * time.Millisecond, - }}, + slowResponseThreshold: 50 * time.Millisecond, + preferredResponseLatency: 71 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, expectLatencyExceedsThreshold: true, fastestBackend: "secondary-backend", slowestBackend: "preferred-backend", }, "slowest response threshold is disabled (0)": { - slowResponseThreshold: 0 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 200 * time.Millisecond, - secondaryResponseLatency: 100 * time.Millisecond, - }}, + slowResponseThreshold: 0 * time.Millisecond, + preferredResponseLatency: 200 * time.Millisecond, + secondaryResponseLatency: 100 * time.Millisecond, expectLatencyExceedsThreshold: false, }, } for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { - preferredLatencies, secondaryLatencies := splitLatencyPairs(scenario.latencyPairs) backends := []ProxyBackendInterface{ - newMockProxyBackend("preferred-backend", 
time.Second, true, preferredLatencies), - newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies), + newMockProxyBackend("preferred-backend", time.Second, true, []time.Duration{scenario.preferredResponseLatency}), + newMockProxyBackend("secondary-backend", time.Second, false, []time.Duration{scenario.secondaryResponseLatency}), } logger := newMockLogger() @@ -446,7 +432,7 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. // Wait for the response comparison to complete before checking the logged messages. - waitForResponseComparisonMetric(t, reg, ComparisonSuccess, uint64(len(scenario.latencyPairs))) + waitForResponseComparisonMetric(t, reg, ComparisonSuccess, 1) if scenario.expectLatencyExceedsThreshold { requireLogKeyValues(t, logger.messages, map[string]string{