Reject series/write requests when max pending request limit is hit #114
Changes to the receive write handler:
@@ -36,7 +36,6 @@ import (
     "github.com/prometheus/prometheus/tsdb"
     "go.opentelemetry.io/otel/attribute"
     "go.opentelemetry.io/otel/trace"
-    "go.uber.org/atomic"
     "golang.org/x/exp/slices"
     "google.golang.org/grpc"
     "google.golang.org/grpc/codes"
@@ -142,9 +141,6 @@ type Handler struct { | |
writeTimeseriesTotal *prometheus.HistogramVec | ||
writeE2eLatency *prometheus.HistogramVec | ||
|
||
pendingWriteRequests prometheus.Gauge | ||
pendingWriteRequestsCounter atomic.Int32 | ||
|
||
Limiter *Limiter | ||
} | ||
|
||
|
@@ -235,12 +231,6 @@ func NewHandler(logger log.Logger, o *Options) *Handler {
             Buckets: []float64{1, 5, 10, 20, 30, 40, 50, 60, 90, 120, 300, 600, 900, 1200, 1800, 3600},
         }, []string{"code", "tenant", "rollup"},
         ),
-        pendingWriteRequests: promauto.With(registerer).NewGauge(
-            prometheus.GaugeOpts{
-                Name: "thanos_receive_pending_write_requests",
-                Help: "The number of pending write requests.",
-            },
-        ),
     }

     h.forwardRequests.WithLabelValues(labelSuccess)
@@ -1083,12 +1073,15 @@ func quorumReached(successes []int, successThreshold int) bool {

 // RemoteWrite implements the gRPC remote write handler for storepb.WriteableStore.
 func (h *Handler) RemoteWrite(ctx context.Context, r *storepb.WriteRequest) (*storepb.WriteResponse, error) {
+    if h.Limiter.ShouldRejectNewRequest() {
+        return nil, status.Error(codes.ResourceExhausted, "too many pending write requests")
+    }
+    // NB: ShouldRejectNewRequest() increments the number of pending requests only when it returns false.
+    defer h.Limiter.DecrementPendingRequests()
+
     span, ctx := tracing.StartSpan(ctx, "receive_grpc")
     defer span.Finish()

-    h.pendingWriteRequests.Set(float64(h.pendingWriteRequestsCounter.Add(1)))
-    defer h.pendingWriteRequestsCounter.Add(-1)
-
     _, err := h.handleRequest(ctx, uint64(r.Replica), r.Tenant, &prompb.WriteRequest{Timeseries: r.Timeseries})
     if err != nil {
         level.Debug(h.logger).Log("msg", "failed to handle request", "err", err)

Reviewer: Nice, let's add a unit test for this behavior?
Author: I can do that in a follow-up PR while testing it in Dev. It's quite tricky to write a unit test for such a feature, and I want to see how useful it is in Dev before spending time on it.
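The Limiter methods used in RemoteWrite above, ShouldRejectNewRequest() and DecrementPendingRequests(), are not shown in this excerpt. As a minimal sketch of how such a gate could work, assuming an atomic pending-request counter that is incremented only when a request is admitted (the package name, type name, fields, and gauge wiring below are illustrative assumptions, not the PR's actual implementation):

// Illustrative sketch only; not the PR's actual Limiter code.
package limiterexample

import (
    "github.com/prometheus/client_golang/prometheus"
    "go.uber.org/atomic"
)

type pendingRequestLimiter struct {
    maxPendingRequests   int32 // 0 disables the limit.
    pendingRequests      atomic.Int32
    pendingRequestsGauge prometheus.Gauge
}

// ShouldRejectNewRequest reports whether the pending-request limit has been hit.
// It increments the pending counter only when the request is admitted, so callers
// must pair every admitted request with a deferred DecrementPendingRequests call.
func (l *pendingRequestLimiter) ShouldRejectNewRequest() bool {
    if l.maxPendingRequests > 0 && l.pendingRequests.Load() >= l.maxPendingRequests {
        return true
    }
    l.pendingRequestsGauge.Set(float64(l.pendingRequests.Add(1)))
    return false
}

// DecrementPendingRequests releases the slot taken by an admitted request.
func (l *pendingRequestLimiter) DecrementPendingRequests() {
    l.pendingRequestsGauge.Set(float64(l.pendingRequests.Add(-1)))
}

The store-side Series handler further down in this PR applies the same check-then-increment pattern inline.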
Changes to the store-side series limiter:
@@ -11,6 +11,8 @@ import (
     "github.com/prometheus/client_golang/prometheus"
     "github.com/prometheus/client_golang/prometheus/promauto"
     "go.uber.org/atomic"
+    "google.golang.org/grpc/codes"
+    "google.golang.org/grpc/status"

     "github.com/thanos-io/thanos/pkg/extkingpin"
     "github.com/thanos-io/thanos/pkg/store/storepb"
@@ -108,11 +110,14 @@ func NewBytesLimiterFactory(limit units.Base2Bytes) BytesLimiterFactory {
 type SeriesSelectLimits struct {
     SeriesPerRequest  uint64
     SamplesPerRequest uint64
+    PendingRequests   int32
 }

 func (l *SeriesSelectLimits) RegisterFlags(cmd extkingpin.FlagClause) {
     cmd.Flag("store.limits.request-series", "The maximum series allowed for a single Series request. The Series call fails if this limit is exceeded. 0 means no limit.").Default("0").Uint64Var(&l.SeriesPerRequest)
     cmd.Flag("store.limits.request-samples", "The maximum samples allowed for a single Series request, The Series call fails if this limit is exceeded. 0 means no limit. NOTE: For efficiency the limit is internally implemented as 'chunks limit' considering each chunk contains a maximum of 120 samples.").Default("0").Uint64Var(&l.SamplesPerRequest)
+    cmd.Flag("store.limits.pending-requests", "Reject gRPC series requests right away when this number of requests are pending. Value 0 disables this feature.").
+        Default("0").Int32Var(&l.PendingRequests)
 }

 var _ storepb.StoreServer = &limitedStoreServer{}
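For illustration with a hypothetical value: a component that registers these limits through SeriesSelectLimits.RegisterFlags could be started with --store.limits.pending-requests=100, shedding new Series calls once 100 requests are already in flight, while the default of 0 keeps the existing behavior.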
@@ -123,6 +128,13 @@ type limitedStoreServer struct {
     newSeriesLimiter      SeriesLimiterFactory
     newSamplesLimiter     ChunksLimiterFactory
     failedRequestsCounter *prometheus.CounterVec
+
+    // This is a read-only field once it's set.
+    // Value 0 disables the feature.
+    maxPendingRequests        int32
+    pendingRequests           atomic.Int32
+    maxPendingRequestLimitHit prometheus.Counter
+    pendingRequestsGauge      prometheus.Gauge
 }

 // NewLimitedStoreServer creates a new limitedStoreServer.

Reviewer: LGTM. Shall we consider adding an alert around this metric?
Author: Definitely, once the counter is there.
@@ -135,10 +147,25 @@ func NewLimitedStoreServer(store storepb.StoreServer, reg prometheus.Registerer,
             Name: "thanos_store_selects_dropped_total",
             Help: "Number of select queries that were dropped due to configured limits.",
         }, []string{"reason"}),
+        pendingRequestsGauge: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
+            Name: "thanos_store_server_pending_series_requests",
+            Help: "Number of pending series requests",
+        }),
+        maxPendingRequestLimitHit: promauto.With(reg).NewCounter(prometheus.CounterOpts{
+            Name: "thanos_store_server_hit_max_pending_series_requests_total",
+            Help: "Number of pending series requests that hit the max pending request limit",
+        }),
+        maxPendingRequests: selectLimits.PendingRequests,
     }
 }

 func (s *limitedStoreServer) Series(req *storepb.SeriesRequest, srv storepb.Store_SeriesServer) error {
+    if s.maxPendingRequests > 0 && s.pendingRequests.Load() >= s.maxPendingRequests {
+        return status.Error(codes.ResourceExhausted, "too many pending series requests")
+    }
+    s.pendingRequestsGauge.Set(float64(s.pendingRequests.Add(1)))
+    defer s.pendingRequests.Add(-1)
+
     seriesLimiter := s.newSeriesLimiter(s.failedRequestsCounter.WithLabelValues("series"))
     chunksLimiter := s.newSamplesLimiter(s.failedRequestsCounter.WithLabelValues("chunks"))
     limitedSrv := newLimitedServer(srv, seriesLimiter, chunksLimiter)
Reviewer: Consider reusing writeLimitsConfig so there are fewer interface changes?
Author: I agree that it would be ideal to have this config field in writeLimitsConfig, but DB pods don't load writeLimitsConfig at all; Pantheon-writer pods load that config, so I'll have to keep it this way. The interface is not changed: receive.NewLimiter() stays the same, and I added another function, receive.NewLimiterWithOptions().
Reviewer: Got it, that's fair. Do you want to add some unit tests for the limiter to test the load-shedding behavior?
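A test along the lines of the reviewer's last question could look roughly like the sketch below. It exercises the illustrative pendingRequestLimiter sketched earlier, not the PR's actual receive.NewLimiterWithOptions(), whose signature is not shown here, and it only covers the sequential case: because the limit check and the increment are separate atomic operations, a burst of concurrent requests can briefly overshoot the limit by a small margin, which is usually acceptable for coarse load shedding.

// Illustrative sketch only; tests the pendingRequestLimiter sketched earlier,
// not the PR's actual Limiter API.
package limiterexample

import (
    "testing"

    "github.com/prometheus/client_golang/prometheus"
)

func TestPendingRequestLoadShedding(t *testing.T) {
    l := &pendingRequestLimiter{
        maxPendingRequests:   1,
        pendingRequestsGauge: prometheus.NewGauge(prometheus.GaugeOpts{Name: "pending_requests"}),
    }

    // The first request is admitted and occupies the only slot.
    if l.ShouldRejectNewRequest() {
        t.Fatal("first request should be admitted")
    }
    // A second request arriving while the first is still pending is shed.
    if !l.ShouldRejectNewRequest() {
        t.Fatal("second request should be rejected while one is pending")
    }
    // Once the first request finishes, new requests are admitted again.
    l.DecrementPendingRequests()
    if l.ShouldRejectNewRequest() {
        t.Fatal("request should be admitted after the pending one completed")
    }
}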