From 39161ca8b2ea9e066a1169a13eaf2c97a5ac13ce Mon Sep 17 00:00:00 2001 From: Anna Tran Date: Fri, 29 Nov 2024 14:33:15 -0800 Subject: [PATCH 1/2] Return 503 on hitting distributor client inflight request limit Signed-off-by: Anna Tran --- pkg/distributor/distributor.go | 3 +-- pkg/distributor/distributor_test.go | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index d702703590..80c76a6340 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -55,7 +55,6 @@ var ( // Distributor instance limits errors. errTooManyInflightPushRequests = errors.New("too many inflight push requests in distributor") errMaxSamplesPushRateLimitReached = errors.New("distributor's samples push rate limit reached") - errTooManyInflightClientRequests = errors.New("too many inflight ingester client requests in distributor") ) const ( @@ -680,7 +679,7 @@ func (d *Distributor) Push(ctx context.Context, req *cortexpb.WriteRequest) (*co // only reject requests at this stage to allow distributor to finish sending the current batch request to all ingesters // even if we've exceeded the MaxInflightClientRequests in the `doBatch` if d.cfg.InstanceLimits.MaxInflightClientRequests > 0 && d.inflightClientRequests.Load() > int64(d.cfg.InstanceLimits.MaxInflightClientRequests) { - return nil, errTooManyInflightClientRequests + return nil, httpgrpc.Errorf(http.StatusServiceUnavailable, "too many inflight ingester client requests in distributor") } removeReplica := false diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 68829cc2f8..2e0beac7e6 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -863,7 +863,8 @@ func TestDistributor_PushInstanceLimits(t *testing.T) { preInflightClient: 103, inflightClientLimit: 101, pushes: []testPush{ - {samples: 100, expectedError: errTooManyInflightClientRequests}, + {samples: 100, expectedError: httpgrpc.Errorf(http.StatusServiceUnavailable, + "too many inflight ingester client requests in distributor")}, }, }, "below ingestion rate limit": { From ac14d6474db1cee345a3cb8dc44d18c0157d1804 Mon Sep 17 00:00:00 2001 From: Anna Tran Date: Fri, 29 Nov 2024 15:10:02 -0800 Subject: [PATCH 2/2] Return HTTP 503 on hitting distributor instance limits Signed-off-by: Anna Tran --- CHANGELOG.md | 1 + pkg/distributor/distributor.go | 8 ++------ pkg/distributor/distributor_test.go | 12 ++++++------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c70c1b14af..6b02b1d76f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ * [ENHANCEMENT] StoreGateway: Add new `cortex_bucket_store_chunk_pool_inuse_bytes` metric to track the usage in chunk pool. #6310 * [ENHANCEMENT] Distributor: Add new `cortex_distributor_inflight_client_requests` metric to track number of ingester client inflight requests. #6358 * [ENHANCEMENT] Distributor: Expose `cortex_label_size_bytes` native histogram metric. #6372 +* [ENHANCEMENT] Distributor: Return HTTP 5XX instead of HTTP 4XX when instance limits are hit. #6358 * [BUGFIX] Runtime-config: Handle absolute file paths when working directory is not / #6224 * [BUGFIX] Ruler: Allow rule evaluation to complete during shutdown. #6326 * [BUGFIX] Ring: update ring with new ip address when instance is lost, rejoins, but heartbeat is disabled #6271 diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 80c76a6340..6e7e283238 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -51,10 +51,6 @@ var ( // Validation errors. errInvalidShardingStrategy = errors.New("invalid sharding strategy") errInvalidTenantShardSize = errors.New("invalid tenant shard size. The value must be greater than or equal to 0") - - // Distributor instance limits errors. - errTooManyInflightPushRequests = errors.New("too many inflight push requests in distributor") - errMaxSamplesPushRateLimitReached = errors.New("distributor's samples push rate limit reached") ) const ( @@ -667,12 +663,12 @@ func (d *Distributor) Push(ctx context.Context, req *cortexpb.WriteRequest) (*co d.incomingMetadata.WithLabelValues(userID).Add(float64(len(req.Metadata))) if d.cfg.InstanceLimits.MaxInflightPushRequests > 0 && inflight > int64(d.cfg.InstanceLimits.MaxInflightPushRequests) { - return nil, errTooManyInflightPushRequests + return nil, httpgrpc.Errorf(http.StatusServiceUnavailable, "too many inflight push requests in distributor") } if d.cfg.InstanceLimits.MaxIngestionRate > 0 { if rate := d.ingestionRate.Rate(); rate >= d.cfg.InstanceLimits.MaxIngestionRate { - return nil, errMaxSamplesPushRateLimitReached + return nil, httpgrpc.Errorf(http.StatusServiceUnavailable, "distributor's samples push rate limit reached") } } diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 2e0beac7e6..2c95f82788 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -840,7 +840,7 @@ func TestDistributor_PushInstanceLimits(t *testing.T) { preInflight: 101, inflightLimit: 101, pushes: []testPush{ - {samples: 100, expectedError: errTooManyInflightPushRequests}, + {samples: 100, expectedError: httpgrpc.Errorf(http.StatusServiceUnavailable, "too many inflight push requests in distributor")}, }, }, "below inflight client limit": { @@ -893,7 +893,7 @@ func TestDistributor_PushInstanceLimits(t *testing.T) { ingestionRateLimit: 1000, pushes: []testPush{ - {samples: 100, expectedError: errMaxSamplesPushRateLimitReached}, + {samples: 100, expectedError: httpgrpc.Errorf(http.StatusServiceUnavailable, "distributor's samples push rate limit reached")}, {samples: 100, expectedError: nil}, }, }, @@ -903,10 +903,10 @@ func TestDistributor_PushInstanceLimits(t *testing.T) { ingestionRateLimit: 1000, pushes: []testPush{ - {samples: 5000, expectedError: nil}, // after push, rate = 500 + 0.2*(5000-500) = 1400 - {samples: 5000, expectedError: errMaxSamplesPushRateLimitReached}, // after push, rate = 1400 + 0.2*(0 - 1400) = 1120 - {samples: 5000, expectedError: errMaxSamplesPushRateLimitReached}, // after push, rate = 1120 + 0.2*(0 - 1120) = 896 - {samples: 5000, expectedError: nil}, // 896 is below 1000, so this push succeeds, new rate = 896 + 0.2*(5000-896) = 1716.8 + {samples: 5000, expectedError: nil}, // after push, rate = 500 + 0.2*(5000-500) = 1400 + {samples: 5000, expectedError: httpgrpc.Errorf(http.StatusServiceUnavailable, "distributor's samples push rate limit reached")}, // after push, rate = 1400 + 0.2*(0 - 1400) = 1120 + {samples: 5000, expectedError: httpgrpc.Errorf(http.StatusServiceUnavailable, "distributor's samples push rate limit reached")}, // after push, rate = 1120 + 0.2*(0 - 1120) = 896 + {samples: 5000, expectedError: nil}, // 896 is below 1000, so this push succeeds, new rate = 896 + 0.2*(5000-896) = 1716.8 }, }, }