Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Querytee: Add metric to measure relative backend latency #7782

Merged
merged 10 commits into from
Apr 29, 2024
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@
### Query-tee

* [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. #7346
* [ENHANCEMENT] Add two new metrics for measuring the relative duration between backends: #7782
* `cortex_querytee_backend_response_relative_duration_seconds`
* `cortex_querytee_backend_response_relative_duration_proportional`

### Documentation

Expand Down
5 changes: 5 additions & 0 deletions docs/sources/mimir/manage/tools/query-tee.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ cortex_querytee_responses_total{backend="<hostname>",method="<method>",route="<r
cortex_querytee_responses_compared_total{route="<route>",result="<success|fail>"}
```

Additionally, if backend results comparison is configured, two native histograms are available:

- `cortex_querytee_backend_response_relative_duration_seconds`: Difference in response time (in seconds) between the preferred and the secondary backend, computed as preferred minus secondary (negative when the preferred backend is faster).
- `cortex_querytee_backend_response_relative_duration_proportional`: Response time of preferred backend, as a proportion of secondary backend response time.

### Ruler remote operational mode test

When the ruler is configured with the [remote evaluation mode]({{< relref "../../references/architecture/components/ruler" >}}) you can use the query-tee to compare rule evaluations too.
Expand Down
4 changes: 4 additions & 0 deletions tools/querytee/proxy_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,10 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba
)
}

relativeDuration := expectedResponse.elapsedTime - actualResponse.elapsedTime
proportionalDuration := expectedResponse.elapsedTime.Seconds() / actualResponse.elapsedTime.Seconds()
p.metrics.relativeDuration.WithLabelValues(p.routeName).Observe(relativeDuration.Seconds())
p.metrics.proportionalDuration.WithLabelValues(p.routeName).Observe(proportionalDuration)
p.metrics.responsesComparedTotal.WithLabelValues(p.routeName, string(result)).Inc()
}
}
Expand Down
145 changes: 128 additions & 17 deletions tools/querytee/proxy_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/atomic"
Expand Down Expand Up @@ -334,7 +335,7 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) {

// The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed.
// Wait for the response comparison to complete before checking the logged messages.
waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult)
waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult, 1)

switch scenario.expectedComparisonResult {
case ComparisonSuccess:
Expand Down Expand Up @@ -412,8 +413,8 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) {
for name, scenario := range scenarios {
t.Run(name, func(t *testing.T) {
backends := []ProxyBackendInterface{
newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency),
newMockProxyBackend("secondary-backend", time.Second, false, scenario.secondaryResponseLatency),
newMockProxyBackend("preferred-backend", time.Second, true, []time.Duration{scenario.preferredResponseLatency}),
newMockProxyBackend("secondary-backend", time.Second, false, []time.Duration{scenario.secondaryResponseLatency}),
}

logger := newMockLogger()
Expand All @@ -431,7 +432,7 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) {

// The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed.
// Wait for the response comparison to complete before checking the logged messages.
waitForResponseComparisonMetric(t, reg, ComparisonSuccess)
waitForResponseComparisonMetric(t, reg, ComparisonSuccess, 1)

if scenario.expectLatencyExceedsThreshold {
requireLogKeyValues(t, logger.messages, map[string]string{
Expand All @@ -446,16 +447,106 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) {
}
}

func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) {
// Test_ProxyEndpoint_RelativeDurationMetric verifies that, after responses from both
// backends have been compared, the two relative-duration native histograms
// (cortex_querytee_backend_response_relative_duration_seconds and
// cortex_querytee_backend_response_relative_duration_proportional) hold the expected
// sample count and sample sums for the simulated request latencies.
func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) {
	scenarios := map[string]struct {
		// latencyPairs holds one preferred/secondary fake latency pair per simulated request.
		latencyPairs []latencyPair
		// expectedSampleCount is the number of observations each histogram should record.
		expectedSampleCount uint64
		// expectedDurationSampleSum is the expected sum of (preferred - secondary) in seconds.
		expectedDurationSampleSum float64
		// expectedProportionalSampleSum is the expected sum of (preferred / secondary) ratios.
		expectedProportionalSampleSum float64
	}{
		"secondary backend is faster than preferred": {
			latencyPairs: []latencyPair{{
				preferredResponseLatency: 3 * time.Second,
				secondaryResponseLatency: 1 * time.Second,
			}, {
				preferredResponseLatency: 5 * time.Second,
				secondaryResponseLatency: 2 * time.Second,
			},
			},
			// Sum of differences: (3-1) + (5-2) = 5 seconds.
			expectedSampleCount:           2,
			expectedDurationSampleSum:     5,
			expectedProportionalSampleSum: (3.0/1 + 5.0/2),
		},
		"preferred backend is 5 seconds faster than secondary": {
			latencyPairs: []latencyPair{{
				preferredResponseLatency: 2 * time.Second,
				secondaryResponseLatency: 7 * time.Second,
			},
			},
			// 2 - 7 = -5: the seconds histogram must be able to observe negative values.
			expectedSampleCount:           1,
			expectedDurationSampleSum:     -5,
			expectedProportionalSampleSum: (2.0 / 7),
		},
	}

	for name, scenario := range scenarios {
		t.Run(name, func(t *testing.T) {
			preferredLatencies, secondaryLatencies := splitLatencyPairs(scenario.latencyPairs)
			backends := []ProxyBackendInterface{
				newMockProxyBackend("preferred-backend", time.Second, true, preferredLatencies),
				newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies),
			}

			logger := newMockLogger()
			reg := prometheus.NewPedanticRegistry()
			comparator := &mockComparator{
				comparisonResult: ComparisonSuccess,
			}

			endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(reg), logger, comparator, 0)

			resp := httptest.NewRecorder()
			req, err := http.NewRequest("GET", "http://test/api/v1/test", nil)
			require.NoError(t, err)
			for range scenario.latencyPairs {
				// Requests are issued serially to keep the test simple; each request
				// consumes the next fake latency pair from the mock backends.
				endpoint.ServeHTTP(resp, req)
			}

			// The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed.
			// Wait until as many comparisons as requests have been recorded before reading the histograms.
			waitForResponseComparisonMetric(t, reg, ComparisonSuccess, scenario.expectedSampleCount)

			got, done, err := prometheus.ToTransactionalGatherer(reg).Gather()
			defer done()
			require.NoError(t, err, "Failed to gather metrics from registry")

			gotDuration := filterMetrics(got, []string{"cortex_querytee_backend_response_relative_duration_seconds"})
			require.Equal(t, 1, len(gotDuration), "Expect only one metric after filtering")
			require.Equal(t, scenario.expectedSampleCount, gotDuration[0].Metric[0].Histogram.GetSampleCount())
			require.Equal(t, scenario.expectedDurationSampleSum, gotDuration[0].Metric[0].Histogram.GetSampleSum())

			gotProportional := filterMetrics(got, []string{"cortex_querytee_backend_response_relative_duration_proportional"})
			require.Equal(t, 1, len(gotProportional), "Expect only one metric after filtering")
			require.Equal(t, scenario.expectedSampleCount, gotProportional[0].Metric[0].Histogram.GetSampleCount())
			require.Equal(t, scenario.expectedProportionalSampleSum, gotProportional[0].Metric[0].Histogram.GetSampleSum())
		})
	}
}

func filterMetrics(metrics []*dto.MetricFamily, names []string) []*dto.MetricFamily {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] This method is only ever used with one value for name - might be able to simplify this and the test that calls it.

var filtered []*dto.MetricFamily
for _, m := range metrics {
for _, name := range names {
if m.GetName() == name {
filtered = append(filtered, m)
break
}
}
}
return filtered
}

func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult, expectedCount uint64) {
started := time.Now()
timeoutAt := started.Add(2 * time.Second)

for {
expected := fmt.Sprintf(`
# HELP cortex_querytee_responses_compared_total Total number of responses compared per route name by result.
# TYPE cortex_querytee_responses_compared_total counter
cortex_querytee_responses_compared_total{result="%v",route="test"} 1
`, expectedResult)
cortex_querytee_responses_compared_total{result="%v",route="test"} %d
`, expectedResult, expectedCount)
err := testutil.GatherAndCompare(g, bytes.NewBufferString(expected), "cortex_querytee_responses_compared_total")

if err == nil {
Expand Down Expand Up @@ -632,19 +723,33 @@ func (m *mockLogger) Log(keyvals ...interface{}) error {
return nil
}

type latencyPair struct {
preferredResponseLatency time.Duration
secondaryResponseLatency time.Duration
}

func splitLatencyPairs(latencyPairs []latencyPair) (preferredLatencies []time.Duration, secondaryLatencies []time.Duration) {
for _, pair := range latencyPairs {
preferredLatencies = append(preferredLatencies, pair.preferredResponseLatency)
secondaryLatencies = append(secondaryLatencies, pair.secondaryResponseLatency)
}
return
}

type mockProxyBackend struct {
name string
timeout time.Duration
preferred bool
fakeResponseLatency time.Duration
name string
timeout time.Duration
preferred bool
fakeResponseLatencies []time.Duration
responseIndex int
}

func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatency time.Duration) ProxyBackendInterface {
func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatencies []time.Duration) ProxyBackendInterface {
return &mockProxyBackend{
name: name,
timeout: timeout,
preferred: preferred,
fakeResponseLatency: fakeResponseLatency,
name: name,
timeout: timeout,
preferred: preferred,
fakeResponseLatencies: fakeResponseLatencies,
}
}

Expand All @@ -667,5 +772,11 @@ func (b *mockProxyBackend) ForwardRequest(_ *http.Request, _ io.ReadCloser) (tim
Body: io.NopCloser(bytes.NewBufferString(`{}`)),
}
resp.Header.Set("Content-Type", "application/json")
return time.Duration(b.fakeResponseLatency), 200, []byte("{}"), resp, nil
if b.responseIndex >= len(b.fakeResponseLatencies) {
resp.StatusCode = 500
return 0, 500, []byte("{}"), resp, errors.New("no more latencies available")
}
latency := b.fakeResponseLatencies[b.responseIndex]
b.responseIndex++
return latency, 200, []byte("{}"), resp, nil
}
14 changes: 14 additions & 0 deletions tools/querytee/proxy_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ type ProxyMetrics struct {
requestDuration *prometheus.HistogramVec
responsesTotal *prometheus.CounterVec
responsesComparedTotal *prometheus.CounterVec
relativeDuration *prometheus.HistogramVec
proportionalDuration *prometheus.HistogramVec
}

func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics {
Expand All @@ -43,6 +45,18 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics {
Name: "responses_compared_total",
Help: "Total number of responses compared per route name by result.",
}, []string{"route", "result"}),
relativeDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
Namespace: queryTeeMetricsNamespace,
Name: "backend_response_relative_duration_seconds",
Help: "Time (in seconds) of preferred backend less secondary backend.",
NativeHistogramBucketFactor: 2,
}, []string{"route"}),
proportionalDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
Namespace: queryTeeMetricsNamespace,
Name: "backend_response_relative_duration_proportional",
Help: "Response time of preferred backend, as a proportion of secondary backend response time.",
NativeHistogramBucketFactor: 2,
}, []string{"route"}),
}

return m
Expand Down
Loading