Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Querytee: Add metric to measure relative backend latency #7782

Merged
merged 10 commits into from
Apr 29, 2024
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@
### Query-tee

* [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. #7346
* [ENHANCEMENT] Add two new metrics for measuring the relative duration between backends: #7782
* `cortex_querytee_backend_response_relative_duration_seconds`
* `cortex_querytee_backend_response_relative_duration_proportional`

### Documentation

Expand Down
5 changes: 5 additions & 0 deletions docs/sources/mimir/manage/tools/query-tee.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ cortex_querytee_responses_total{backend="<hostname>",method="<method>",route="<r
cortex_querytee_responses_compared_total{route="<route>",result="<success|fail>"}
```

Additionally, if backend results comparison is configured, two native histograms are available:

- `cortex_querytee_backend_response_relative_duration_seconds`: Difference in response time (in seconds) between the preferred and the secondary backend, computed as preferred minus secondary (negative when the preferred backend is faster).
- `cortex_querytee_backend_response_relative_duration_proportional`: Response time of preferred backend, as a proportion of secondary backend response time.

### Ruler remote operational mode test

When the ruler is configured with the [remote evaluation mode]({{< relref "../../references/architecture/components/ruler" >}}) you can use the query-tee to compare rule evaluations too.
Expand Down
4 changes: 4 additions & 0 deletions tools/querytee/proxy_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,10 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba
)
}

relativeDuration := expectedResponse.elapsedTime - actualResponse.elapsedTime
proportionalDuration := expectedResponse.elapsedTime.Seconds() / actualResponse.elapsedTime.Seconds()
p.metrics.relativeDuration.WithLabelValues(p.routeName).Observe(relativeDuration.Seconds())
p.metrics.proportionalDuration.WithLabelValues(p.routeName).Observe(proportionalDuration)
p.metrics.responsesComparedTotal.WithLabelValues(p.routeName, string(result)).Inc()
}
}
Expand Down
145 changes: 128 additions & 17 deletions tools/querytee/proxy_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/atomic"
Expand Down Expand Up @@ -334,7 +335,7 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) {

// The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed.
// Wait for the response comparison to complete before checking the logged messages.
waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult)
waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult, 1)

switch scenario.expectedComparisonResult {
case ComparisonSuccess:
Expand Down Expand Up @@ -412,8 +413,8 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) {
for name, scenario := range scenarios {
t.Run(name, func(t *testing.T) {
backends := []ProxyBackendInterface{
newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency),
newMockProxyBackend("secondary-backend", time.Second, false, scenario.secondaryResponseLatency),
newMockProxyBackend("preferred-backend", time.Second, true, []time.Duration{scenario.preferredResponseLatency}),
newMockProxyBackend("secondary-backend", time.Second, false, []time.Duration{scenario.secondaryResponseLatency}),
}

logger := newMockLogger()
Expand All @@ -431,7 +432,7 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) {

// The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed.
// Wait for the response comparison to complete before checking the logged messages.
waitForResponseComparisonMetric(t, reg, ComparisonSuccess)
waitForResponseComparisonMetric(t, reg, ComparisonSuccess, 1)

if scenario.expectLatencyExceedsThreshold {
requireLogKeyValues(t, logger.messages, map[string]string{
Expand All @@ -446,16 +447,106 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) {
}
}

func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) {
// Test_ProxyEndpoint_RelativeDurationMetric verifies that, after responses from both
// backends have been compared, the two relative-duration native histograms
// (cortex_querytee_backend_response_relative_duration_seconds and
// cortex_querytee_backend_response_relative_duration_proportional) hold the expected
// sample count and sample sums for the simulated request latencies.
func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) {
	scenarios := map[string]struct {
		// latencyPairs holds one preferred/secondary fake latency pair per simulated request.
		latencyPairs []latencyPair
		// expectedSampleCount is the number of observations each histogram should record.
		expectedSampleCount uint64
		// expectedDurationSampleSum is the expected sum of (preferred - secondary) in seconds.
		expectedDurationSampleSum float64
		// expectedProportionalSampleSum is the expected sum of (preferred / secondary) ratios.
		expectedProportionalSampleSum float64
	}{
		"secondary backend is faster than preferred": {
			latencyPairs: []latencyPair{{
				preferredResponseLatency: 3 * time.Second,
				secondaryResponseLatency: 1 * time.Second,
			}, {
				preferredResponseLatency: 5 * time.Second,
				secondaryResponseLatency: 2 * time.Second,
			},
			},
			// Sum of differences: (3-1) + (5-2) = 5 seconds.
			expectedSampleCount:           2,
			expectedDurationSampleSum:     5,
			expectedProportionalSampleSum: (3.0/1 + 5.0/2),
		},
		"preferred backend is 5 seconds faster than secondary": {
			latencyPairs: []latencyPair{{
				preferredResponseLatency: 2 * time.Second,
				secondaryResponseLatency: 7 * time.Second,
			},
			},
			// 2 - 7 = -5: the seconds histogram must be able to observe negative values.
			expectedSampleCount:           1,
			expectedDurationSampleSum:     -5,
			expectedProportionalSampleSum: (2.0 / 7),
		},
	}

	for name, scenario := range scenarios {
		t.Run(name, func(t *testing.T) {
			preferredLatencies, secondaryLatencies := splitLatencyPairs(scenario.latencyPairs)
			backends := []ProxyBackendInterface{
				newMockProxyBackend("preferred-backend", time.Second, true, preferredLatencies),
				newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies),
			}

			logger := newMockLogger()
			reg := prometheus.NewPedanticRegistry()
			comparator := &mockComparator{
				comparisonResult: ComparisonSuccess,
			}

			endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(reg), logger, comparator, 0)

			resp := httptest.NewRecorder()
			req, err := http.NewRequest("GET", "http://test/api/v1/test", nil)
			require.NoError(t, err)
			for range scenario.latencyPairs {
				// Requests are issued serially to keep the test simple; each request
				// consumes the next fake latency pair from the mock backends.
				endpoint.ServeHTTP(resp, req)
			}

			// The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed.
			// Wait until as many comparisons as requests have been recorded before reading the histograms.
			waitForResponseComparisonMetric(t, reg, ComparisonSuccess, scenario.expectedSampleCount)

			got, done, err := prometheus.ToTransactionalGatherer(reg).Gather()
			defer done()
			require.NoError(t, err, "Failed to gather metrics from registry")

			gotDuration := filterMetrics(got, []string{"cortex_querytee_backend_response_relative_duration_seconds"})
			require.Equal(t, 1, len(gotDuration), "Expect only one metric after filtering")
			require.Equal(t, scenario.expectedSampleCount, gotDuration[0].Metric[0].Histogram.GetSampleCount())
			require.Equal(t, scenario.expectedDurationSampleSum, gotDuration[0].Metric[0].Histogram.GetSampleSum())

			gotProportional := filterMetrics(got, []string{"cortex_querytee_backend_response_relative_duration_proportional"})
			require.Equal(t, 1, len(gotProportional), "Expect only one metric after filtering")
			require.Equal(t, scenario.expectedSampleCount, gotProportional[0].Metric[0].Histogram.GetSampleCount())
			require.Equal(t, scenario.expectedProportionalSampleSum, gotProportional[0].Metric[0].Histogram.GetSampleSum())
		})
	}
}

func filterMetrics(metrics []*dto.MetricFamily, names []string) []*dto.MetricFamily {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] This method is only ever used with one value for name - might be able to simplify this and the test that calls it.

var filtered []*dto.MetricFamily
for _, m := range metrics {
for _, name := range names {
if m.GetName() == name {
filtered = append(filtered, m)
break
}
}
}
return filtered
}

func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult, expectedCount uint64) {
started := time.Now()
timeoutAt := started.Add(2 * time.Second)

for {
expected := fmt.Sprintf(`
# HELP cortex_querytee_responses_compared_total Total number of responses compared per route name by result.
# TYPE cortex_querytee_responses_compared_total counter
cortex_querytee_responses_compared_total{result="%v",route="test"} 1
`, expectedResult)
cortex_querytee_responses_compared_total{result="%v",route="test"} %d
`, expectedResult, expectedCount)
err := testutil.GatherAndCompare(g, bytes.NewBufferString(expected), "cortex_querytee_responses_compared_total")

if err == nil {
Expand Down Expand Up @@ -632,19 +723,33 @@ func (m *mockLogger) Log(keyvals ...interface{}) error {
return nil
}

type latencyPair struct {
preferredResponseLatency time.Duration
secondaryResponseLatency time.Duration
}

func splitLatencyPairs(latencyPairs []latencyPair) (preferredLatencies []time.Duration, secondaryLatencies []time.Duration) {
for _, pair := range latencyPairs {
preferredLatencies = append(preferredLatencies, pair.preferredResponseLatency)
secondaryLatencies = append(secondaryLatencies, pair.secondaryResponseLatency)
}
return
}

type mockProxyBackend struct {
name string
timeout time.Duration
preferred bool
fakeResponseLatency time.Duration
name string
timeout time.Duration
preferred bool
fakeResponseLatencies []time.Duration
responseIndex int
}

func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatency time.Duration) ProxyBackendInterface {
func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatencies []time.Duration) ProxyBackendInterface {
return &mockProxyBackend{
name: name,
timeout: timeout,
preferred: preferred,
fakeResponseLatency: fakeResponseLatency,
name: name,
timeout: timeout,
preferred: preferred,
fakeResponseLatencies: fakeResponseLatencies,
}
}

Expand All @@ -667,5 +772,11 @@ func (b *mockProxyBackend) ForwardRequest(_ *http.Request, _ io.ReadCloser) (tim
Body: io.NopCloser(bytes.NewBufferString(`{}`)),
}
resp.Header.Set("Content-Type", "application/json")
return time.Duration(b.fakeResponseLatency), 200, []byte("{}"), resp, nil
if b.responseIndex >= len(b.fakeResponseLatencies) {
resp.StatusCode = 500
return 0, 500, []byte("{}"), resp, errors.New("no more latencies available")
}
latency := b.fakeResponseLatencies[b.responseIndex]
b.responseIndex++
return latency, 200, []byte("{}"), resp, nil
}
14 changes: 14 additions & 0 deletions tools/querytee/proxy_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ type ProxyMetrics struct {
requestDuration *prometheus.HistogramVec
responsesTotal *prometheus.CounterVec
responsesComparedTotal *prometheus.CounterVec
relativeDuration *prometheus.HistogramVec
proportionalDuration *prometheus.HistogramVec
}

func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics {
Expand All @@ -43,6 +45,18 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics {
Name: "responses_compared_total",
Help: "Total number of responses compared per route name by result.",
}, []string{"route", "result"}),
relativeDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
Namespace: queryTeeMetricsNamespace,
Name: "backend_response_relative_duration_seconds",
Help: "Time (in seconds) of preferred backend less secondary backend.",
NativeHistogramBucketFactor: 2,
}, []string{"route"}),
proportionalDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
Namespace: queryTeeMetricsNamespace,
Name: "backend_response_relative_duration_proportional",
Help: "Response time of preferred backend, as a proportion of secondary backend response time.",
NativeHistogramBucketFactor: 2,
}, []string{"route"}),
}

return m
Expand Down
Loading