Skip to content

Commit

Permalink
stop retrying chunk pool exhaustion at query frontend, retry at queri…
Browse files Browse the repository at this point in the history
…er level

Signed-off-by: Ben Ye <[email protected]>
  • Loading branch information
yeya24 committed Sep 14, 2023
1 parent 97effe9 commit 44e5531
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 3 deletions.
6 changes: 6 additions & 0 deletions pkg/querier/blocks_store_queryable.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package querier
import (
"context"
"fmt"
"github.com/thanos-io/thanos/pkg/pool"
"io"
"sort"
"strings"
Expand Down Expand Up @@ -1113,6 +1114,8 @@ func countSamplesAndChunks(series ...*storepb.Series) (samplesCount, chunksCount

// only retry connection issues
func isRetryableError(err error) bool {
s := status.Convert(err)
s.Err()
switch status.Code(err) {
case codes.Unavailable:
return true
Expand All @@ -1122,6 +1125,9 @@ func isRetryableError(err error) bool {
// https://github.com/grpc/grpc-go/blob/03172006f5d168fc646d87928d85cb9c4a480291/clientconn.go#L67
case codes.Canceled:
return strings.Contains(err.Error(), "grpc: the client connection is closing")
case codes.Unknown:
// Catch chunks pool exhaustion error only.
return strings.Contains(err.Error(), pool.ErrPoolExhausted.Error())
default:
return false
}
Expand Down
30 changes: 30 additions & 0 deletions pkg/querier/blocks_store_queryable_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/thanos-io/promql-engine/engine"
"github.com/thanos-io/promql-engine/logicalplan"
"github.com/thanos-io/thanos/pkg/pool"
"github.com/thanos-io/thanos/pkg/store/hintspb"
"github.com/thanos-io/thanos/pkg/store/labelpb"
"github.com/thanos-io/thanos/pkg/store/storepb"
Expand Down Expand Up @@ -668,6 +669,35 @@ func TestBlocksStoreQuerier_Select(t *testing.T) {
},
},
},
"multiple store-gateways has the block, but one of them fails to return due to chunk pool exhaustion": {
finderResult: bucketindex.Blocks{
{ID: block1},
},
storeSetResponses: []interface{}{
map[BlocksStoreClient][]ulid.ULID{
&storeGatewayClientMock{
remoteAddr: "1.1.1.1",
mockedSeriesErr: status.Error(codes.Unknown, pool.ErrPoolExhausted.Error()),
}: {block1},
},
map[BlocksStoreClient][]ulid.ULID{
&storeGatewayClientMock{remoteAddr: "2.2.2.2", mockedSeriesResponses: []*storepb.SeriesResponse{
mockSeriesResponse(labels.Labels{metricNameLabel, series1Label}, minT, 2),
mockHintsResponse(block1),
}}: {block1},
},
},
limits: &blocksStoreLimitsMock{},
queryLimiter: noOpQueryLimiter,
expectedSeries: []seriesResult{
{
lbls: labels.New(metricNameLabel, series1Label),
values: []valueResult{
{t: minT, v: 2},
},
},
},
},
"all store-gateways return PermissionDenied": {
finderResult: bucketindex.Blocks{
{ID: block1},
Expand Down
24 changes: 22 additions & 2 deletions pkg/querier/tripperware/queryrange/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@ package queryrange
import (
"context"
"errors"
"strings"
"unsafe"

"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/thanos-io/thanos/pkg/pool"
"github.com/weaveworks/common/httpgrpc"

"github.com/cortexproject/cortex/pkg/querier/tripperware"
Expand Down Expand Up @@ -72,9 +75,9 @@ func (r retry) Do(ctx context.Context, req tripperware.Request) (tripperware.Res
return nil, err
}

// Retry if we get a HTTP 500 or a non-HTTP error.
// Retry if we get an HTTP 500 or a non-HTTP error.
httpResp, ok := httpgrpc.HTTPResponseFromError(err)
if !ok || httpResp.Code/100 == 5 {
if !ok || isRetryableResponse(httpResp) {
lastErr = err
level.Error(util_log.WithContext(ctx, r.log)).Log("msg", "error processing request", "try", tries, "err", err)
continue
Expand All @@ -84,3 +87,20 @@ func (r retry) Do(ctx context.Context, req tripperware.Request) (tripperware.Res
}
return nil, lastErr
}

func isRetryableResponse(resp *httpgrpc.HTTPResponse) bool {
if resp.Code/100 != 5 {
return false
}

// If pool exhausted, retry at query frontend might make things worse.
// Rely on retries at querier level only.
if strings.Contains(yoloString(resp.Body), pool.ErrPoolExhausted.Error()) {
return false
}
return true
}

func yoloString(buf []byte) string {
return *((*string)(unsafe.Pointer(&buf)))
}
19 changes: 18 additions & 1 deletion pkg/querier/tripperware/queryrange/retry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/go-kit/log"
"github.com/stretchr/testify/require"
"github.com/thanos-io/thanos/pkg/pool"
"github.com/weaveworks/common/httpgrpc"
"go.uber.org/atomic"

Expand Down Expand Up @@ -38,16 +39,32 @@ func TestRetry(t *testing.T) {
{
name: "don't retry 400s",
handler: tripperware.HandlerFunc(func(_ context.Context, req tripperware.Request) (tripperware.Response, error) {
if try.Inc() == 5 {
return &PrometheusResponse{Status: "Hello World"}, nil
}
return nil, httpgrpc.Errorf(http.StatusBadRequest, "Bad Request")
}),
err: httpgrpc.Errorf(http.StatusBadRequest, "Bad Request"),
},
{
name: "retry 500s",
handler: tripperware.HandlerFunc(func(_ context.Context, req tripperware.Request) (tripperware.Response, error) {
if try.Inc() == 5 {
return &PrometheusResponse{Status: "Hello World"}, nil
}
return nil, httpgrpc.Errorf(http.StatusInternalServerError, "Internal Server Error")
}),
err: httpgrpc.Errorf(http.StatusInternalServerError, "Internal Server Error"),
resp: &PrometheusResponse{Status: "Hello World"},
},
{
name: "don't retry chunk pool exhaustion",
handler: tripperware.HandlerFunc(func(_ context.Context, req tripperware.Request) (tripperware.Response, error) {
if try.Inc() == 5 {
return &PrometheusResponse{Status: "Hello World"}, nil
}
return nil, httpgrpc.Errorf(http.StatusInternalServerError, pool.ErrPoolExhausted.Error())
}),
err: httpgrpc.Errorf(http.StatusInternalServerError, pool.ErrPoolExhausted.Error()),
},
{
name: "last error",
Expand Down

0 comments on commit 44e5531

Please sign in to comment.