Skip to content

Commit

Permalink
Querytee: Add metric to measure relative backend latency (#7782)
Browse files Browse the repository at this point in the history
* Querytee: Add metric to measure relative backend latency

This metric gives us a measurement of how individual queries compare
between two backends in terms of latency (or duration).

* Add proportional metric, and change to native histograms

* Improve histogram unit tests

Also test the proportional metric

* Update tools/querytee/proxy_metrics.go

Co-authored-by: Charles Korn <[email protected]>

* Tidy up test

* Add changelog

* Add docs

* Generate doc

* Rename metric to be "response"

* Don't use latencyPair in Test_ProxyEndpoint_LogSlowQueries

---------

Co-authored-by: Charles Korn <[email protected]>
  • Loading branch information
jhesketh and charleskorn authored Apr 29, 2024
1 parent 748a726 commit 5660319
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 17 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@
### Query-tee

* [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. #7346
* [ENHANCEMENT] Add two new metrics for measuring the relative duration between backends: #7782
* `cortex_querytee_backend_response_relative_duration_seconds`
* `cortex_querytee_backend_response_relative_duration_proportional`

### Documentation

Expand Down
5 changes: 5 additions & 0 deletions docs/sources/mimir/manage/tools/query-tee.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ cortex_querytee_responses_total{backend="<hostname>",method="<method>",route="<r
cortex_querytee_responses_compared_total{route="<route>",result="<success|fail>"}
```

Additionally, if backend results comparison is configured, two native histograms are available:

- `cortex_querytee_backend_response_relative_duration_seconds`: The preferred backend's response time minus the secondary backend's response time, in seconds. Negative values mean the preferred backend responded faster.
- `cortex_querytee_backend_response_relative_duration_proportional`: Response time of preferred backend, as a proportion of secondary backend response time.

### Ruler remote operational mode test

When the ruler is configured with the [remote evaluation mode]({{< relref "../../references/architecture/components/ruler" >}}) you can use the query-tee to compare rule evaluations too.
Expand Down
4 changes: 4 additions & 0 deletions tools/querytee/proxy_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,10 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba
)
}

relativeDuration := expectedResponse.elapsedTime - actualResponse.elapsedTime
proportionalDuration := expectedResponse.elapsedTime.Seconds() / actualResponse.elapsedTime.Seconds()
p.metrics.relativeDuration.WithLabelValues(p.routeName).Observe(relativeDuration.Seconds())
p.metrics.proportionalDuration.WithLabelValues(p.routeName).Observe(proportionalDuration)
p.metrics.responsesComparedTotal.WithLabelValues(p.routeName, string(result)).Inc()
}
}
Expand Down
145 changes: 128 additions & 17 deletions tools/querytee/proxy_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/atomic"
Expand Down Expand Up @@ -334,7 +335,7 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) {

// The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed.
// Wait for the response comparison to complete before checking the logged messages.
waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult)
waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult, 1)

switch scenario.expectedComparisonResult {
case ComparisonSuccess:
Expand Down Expand Up @@ -412,8 +413,8 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) {
for name, scenario := range scenarios {
t.Run(name, func(t *testing.T) {
backends := []ProxyBackendInterface{
newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency),
newMockProxyBackend("secondary-backend", time.Second, false, scenario.secondaryResponseLatency),
newMockProxyBackend("preferred-backend", time.Second, true, []time.Duration{scenario.preferredResponseLatency}),
newMockProxyBackend("secondary-backend", time.Second, false, []time.Duration{scenario.secondaryResponseLatency}),
}

logger := newMockLogger()
Expand All @@ -431,7 +432,7 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) {

// The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed.
// Wait for the response comparison to complete before checking the logged messages.
waitForResponseComparisonMetric(t, reg, ComparisonSuccess)
waitForResponseComparisonMetric(t, reg, ComparisonSuccess, 1)

if scenario.expectLatencyExceedsThreshold {
requireLogKeyValues(t, logger.messages, map[string]string{
Expand All @@ -446,16 +447,106 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) {
}
}

func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) {
// Test_ProxyEndpoint_RelativeDurationMetric verifies that the relative and
// proportional duration native histograms record the expected sample count and
// sum for a sequence of preferred/secondary backend latency pairs.
func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) {
	testCases := map[string]struct {
		latencyPairs                  []latencyPair
		expectedSampleCount           uint64
		expectedDurationSampleSum     float64
		expectedProportionalSampleSum float64
	}{
		"secondary backend is faster than preferred": {
			latencyPairs: []latencyPair{
				{preferredResponseLatency: 3 * time.Second, secondaryResponseLatency: 1 * time.Second},
				{preferredResponseLatency: 5 * time.Second, secondaryResponseLatency: 2 * time.Second},
			},
			expectedSampleCount:           2,
			expectedDurationSampleSum:     5,
			expectedProportionalSampleSum: (3.0/1 + 5.0/2),
		},
		"preferred backend is 5 seconds faster than secondary": {
			latencyPairs: []latencyPair{
				{preferredResponseLatency: 2 * time.Second, secondaryResponseLatency: 7 * time.Second},
			},
			expectedSampleCount:           1,
			expectedDurationSampleSum:     -5,
			expectedProportionalSampleSum: (2.0 / 7),
		},
	}

	for testName, tc := range testCases {
		t.Run(testName, func(t *testing.T) {
			preferredLatencies, secondaryLatencies := splitLatencyPairs(tc.latencyPairs)
			backends := []ProxyBackendInterface{
				newMockProxyBackend("preferred-backend", time.Second, true, preferredLatencies),
				newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies),
			}

			comparator := &mockComparator{
				comparisonResult: ComparisonSuccess,
			}
			reg := prometheus.NewPedanticRegistry()
			logger := newMockLogger()

			endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(reg), logger, comparator, 0)

			req, err := http.NewRequest("GET", "http://test/api/v1/test", nil)
			require.NoError(t, err)
			resp := httptest.NewRecorder()

			// Issue one request per latency pair; requests are made serially to
			// keep the test simple.
			for range tc.latencyPairs {
				endpoint.ServeHTTP(resp, req)
			}

			// ServeHTTP returns as soon as the primary response is received, which
			// doesn't guarantee the response comparison has completed. Wait until
			// every comparison has been recorded before inspecting the histograms.
			waitForResponseComparisonMetric(t, reg, ComparisonSuccess, tc.expectedSampleCount)

			got, done, err := prometheus.ToTransactionalGatherer(reg).Gather()
			defer done()
			require.NoError(t, err, "Failed to gather metrics from registry")

			gotDuration := filterMetrics(got, []string{"cortex_querytee_backend_response_relative_duration_seconds"})
			require.Equal(t, 1, len(gotDuration), "Expect only one metric after filtering")
			require.Equal(t, tc.expectedSampleCount, gotDuration[0].Metric[0].Histogram.GetSampleCount())
			require.Equal(t, tc.expectedDurationSampleSum, gotDuration[0].Metric[0].Histogram.GetSampleSum())

			gotProportional := filterMetrics(got, []string{"cortex_querytee_backend_response_relative_duration_proportional"})
			require.Equal(t, 1, len(gotProportional), "Expect only one metric after filtering")
			require.Equal(t, tc.expectedSampleCount, gotProportional[0].Metric[0].Histogram.GetSampleCount())
			require.Equal(t, tc.expectedProportionalSampleSum, gotProportional[0].Metric[0].Histogram.GetSampleSum())
		})
	}
}

// filterMetrics returns the metric families from metrics whose name appears in
// names, preserving their original order.
func filterMetrics(metrics []*dto.MetricFamily, names []string) []*dto.MetricFamily {
	// Build a lookup set once so each family needs only a single map probe.
	wanted := make(map[string]struct{}, len(names))
	for _, name := range names {
		wanted[name] = struct{}{}
	}

	var filtered []*dto.MetricFamily
	for _, mf := range metrics {
		if _, ok := wanted[mf.GetName()]; ok {
			filtered = append(filtered, mf)
		}
	}
	return filtered
}

func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult, expectedCount uint64) {
started := time.Now()
timeoutAt := started.Add(2 * time.Second)

for {
expected := fmt.Sprintf(`
# HELP cortex_querytee_responses_compared_total Total number of responses compared per route name by result.
# TYPE cortex_querytee_responses_compared_total counter
cortex_querytee_responses_compared_total{result="%v",route="test"} 1
`, expectedResult)
cortex_querytee_responses_compared_total{result="%v",route="test"} %d
`, expectedResult, expectedCount)
err := testutil.GatherAndCompare(g, bytes.NewBufferString(expected), "cortex_querytee_responses_compared_total")

if err == nil {
Expand Down Expand Up @@ -632,19 +723,33 @@ func (m *mockLogger) Log(keyvals ...interface{}) error {
return nil
}

type latencyPair struct {
preferredResponseLatency time.Duration
secondaryResponseLatency time.Duration
}

func splitLatencyPairs(latencyPairs []latencyPair) (preferredLatencies []time.Duration, secondaryLatencies []time.Duration) {
for _, pair := range latencyPairs {
preferredLatencies = append(preferredLatencies, pair.preferredResponseLatency)
secondaryLatencies = append(secondaryLatencies, pair.secondaryResponseLatency)
}
return
}

// mockProxyBackend is a test double for a proxy backend. Rather than issuing
// real HTTP requests, it replays a pre-configured sequence of fake response
// latencies, consuming one entry per ForwardRequest call.
type mockProxyBackend struct {
	name      string        // backend name reported to the proxy
	timeout   time.Duration
	preferred bool          // whether this backend is the preferred one
	// fakeResponseLatencies is the ordered list of latencies to report;
	// responseIndex tracks the next entry to consume.
	fakeResponseLatencies []time.Duration
	responseIndex         int
}

func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatency time.Duration) ProxyBackendInterface {
func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatencies []time.Duration) ProxyBackendInterface {
return &mockProxyBackend{
name: name,
timeout: timeout,
preferred: preferred,
fakeResponseLatency: fakeResponseLatency,
name: name,
timeout: timeout,
preferred: preferred,
fakeResponseLatencies: fakeResponseLatencies,
}
}

Expand All @@ -667,5 +772,11 @@ func (b *mockProxyBackend) ForwardRequest(_ *http.Request, _ io.ReadCloser) (tim
Body: io.NopCloser(bytes.NewBufferString(`{}`)),
}
resp.Header.Set("Content-Type", "application/json")
return time.Duration(b.fakeResponseLatency), 200, []byte("{}"), resp, nil
if b.responseIndex >= len(b.fakeResponseLatencies) {
resp.StatusCode = 500
return 0, 500, []byte("{}"), resp, errors.New("no more latencies available")
}
latency := b.fakeResponseLatencies[b.responseIndex]
b.responseIndex++
return latency, 200, []byte("{}"), resp, nil
}
14 changes: 14 additions & 0 deletions tools/querytee/proxy_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ type ProxyMetrics struct {
requestDuration *prometheus.HistogramVec
responsesTotal *prometheus.CounterVec
responsesComparedTotal *prometheus.CounterVec
relativeDuration *prometheus.HistogramVec
proportionalDuration *prometheus.HistogramVec
}

func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics {
Expand All @@ -43,6 +45,18 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics {
Name: "responses_compared_total",
Help: "Total number of responses compared per route name by result.",
}, []string{"route", "result"}),
relativeDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
Namespace: queryTeeMetricsNamespace,
Name: "backend_response_relative_duration_seconds",
Help: "Time (in seconds) of preferred backend less secondary backend.",
NativeHistogramBucketFactor: 2,
}, []string{"route"}),
proportionalDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
Namespace: queryTeeMetricsNamespace,
Name: "backend_response_relative_duration_proportional",
Help: "Response time of preferred backend, as a proportion of secondary backend response time.",
NativeHistogramBucketFactor: 2,
}, []string{"route"}),
}

return m
Expand Down

0 comments on commit 5660319

Please sign in to comment.