From a1eccc8cd83538e960fa526cbd2c730594fb241c Mon Sep 17 00:00:00 2001
From: Douglas Camata <159076+douglascamata@users.noreply.github.com>
Date: Wed, 17 Jan 2024 14:31:47 +0100
Subject: [PATCH] Receive: refactor handler for improved readability and organization (#6898)

* [wip] First checkpoint
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* [wip] Second checkpoint
All tests passing, unit and e2e.
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Small random refactors
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Add some useful trace tags
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Concurrent and traced local writes
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Improve variable names in remote writes
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Rename `newFanoutForward` function
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* More refactors
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Fix linting issue
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Add a quorum test with sloppy quorum
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* [wip] Try to make retries work
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* [wip] Checkpoint: wait group still hanging
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Some refactors
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Add some commented code so I don't lose it
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Adapt tests
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove sloppy quorum code
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Move some code around
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove even more leftover of sloppy quorum
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Extract a type to hold function params
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove unused struct field
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove useless variable
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove type that wasn't used enough
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Delete function to tighten up max buffered responses
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Add comments to some functions
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Fix peer up check
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Fix size of replication tracking slices
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Rename context
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Don't do local writes concurrently
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove extra error logging
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Fix syntax after merge
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Add missing methods to peersContainer
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Fix handler test
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Reset peers state on hashring changes
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Handle PR comment regarding waitgroup
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Set span tags to help debug
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Fix concurrency issue
We close the request as soon as quorum is reached and leave a few Go routines
running to finish replication and do cleanups. This means that the context from
the HTTP request is cancelled... which ends up also cancelling the pending
replication requests. (A standalone sketch of this fix appears after the patch.)
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Fix request ID middleware
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Fix `distributeTimeseriesToReplicas` comment
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Extract var with 1-indexed replication index
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Rename methods in peersContainer interface
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Make peerGroup `getConnection` check if peers are up
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove yet one more not useful log
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove logger from `h.sendWrites`
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

---------

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>
---
 pkg/receive/handler.go                   | 581 +++++++++++++----------
 pkg/receive/handler_test.go              |  45 +-
 pkg/server/http/middleware/request_id.go |   4 +-
 3 files changed, 365 insertions(+), 265 deletions(-)

diff --git a/pkg/receive/handler.go b/pkg/receive/handler.go
index c0c0fea21f..ba4b9b94ce 100644
--- a/pkg/receive/handler.go
+++ b/pkg/receive/handler.go
@@ -107,8 +107,6 @@ type Handler struct {
 	mtx          sync.RWMutex
 	hashring     Hashring
 	peers        peersContainer
-	expBackoff   backoff.Backoff
-	peerStates   map[string]*retryState
 	receiverMode ReceiverMode
 
 	forwardRequests *prometheus.CounterVec
@@ -132,19 +130,20 @@ func NewHandler(logger log.Logger, o *Options) *Handler {
 	}
 
 	h := &Handler{
-		logger:       logger,
-		writer:       o.Writer,
-		router:       route.New(),
-		options:      o,
-		peers:        newPeerGroup(o.DialOpts...),
+		logger:  logger,
+		writer:  o.Writer,
+		router:  route.New(),
+		options: o,
+		peers: newPeerGroup(
+			backoff.Backoff{
+				Factor: 2,
+				Min:    100 * time.Millisecond,
+				Max:    o.MaxBackoff,
+				Jitter: true,
+			},
+			o.DialOpts...),
 		receiverMode: o.ReceiverMode,
-		expBackoff: backoff.Backoff{
-			Factor: 2,
-			Min:    100 * time.Millisecond,
-			Max:    o.MaxBackoff,
-			Jitter: true,
-		},
-		Limiter: o.Limiter,
+		Limiter:      o.Limiter,
 		forwardRequests: promauto.With(registerer).NewCounterVec(
 			prometheus.CounterOpts{
 				Name: "thanos_receive_forward_requests_total",
@@ -266,8 +265,7 @@ func (h *Handler) Hashring(hashring Hashring) {
 	}
 
 	h.hashring = hashring
-	h.expBackoff.Reset()
-	h.peerStates = make(map[string]*retryState)
+	h.peers.reset()
 }
 
 // getSortedStringSliceDiff returns items which are in slice1 but not in slice2.
@@ -434,41 +432,10 @@ func newWriteResponse(seriesIDs []int, err error) writeResponse { } } -func (h *Handler) handleRequest(ctx context.Context, rep uint64, tenant string, wreq *prompb.WriteRequest) error { - tLogger := log.With(h.logger, "tenant", tenant) - - // This replica value is used to detect cycles in cyclic topologies. - // A non-zero value indicates that the request has already been replicated by a previous receive instance. - // For almost all users, this is only used in fully connected topologies of IngestorRouter instances. - // For acyclic topologies that use RouterOnly and IngestorOnly instances, this causes issues when replicating data. - // See discussion in: https://github.com/thanos-io/thanos/issues/4359. - if h.receiverMode == RouterOnly || h.receiverMode == IngestorOnly { - rep = 0 - } - - // The replica value in the header is one-indexed, thus we need >. - if rep > h.options.ReplicationFactor { - level.Error(tLogger).Log("err", errBadReplica, "msg", "write request rejected", - "request_replica", rep, "replication_factor", h.options.ReplicationFactor) - return errBadReplica - } - - r := replica{n: rep, replicated: rep != 0} - - // On the wire, format is 1-indexed and in-code is 0-indexed, so we decrement the value if it was already replicated. - if r.replicated { - r.n-- - } - - // Forward any time series as necessary. All time series - // destined for the local node will be written to the receiver. - // Time series will be replicated as necessary. - return h.forward(ctx, tenant, r, wreq) -} - func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { var err error span, ctx := tracing.StartSpan(r.Context(), "receive_http") + span.SetTag("receiver.mode", string(h.receiverMode)) defer span.Finish() tenant, err := tenancy.GetTenantFromHTTP(r, h.options.TenantHeader, h.options.DefaultTenantID, h.options.TenantField) @@ -479,6 +446,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { } tLogger := log.With(h.logger, "tenant", tenant) + span.SetTag("tenant", tenant) writeGate := h.Limiter.WriteGate() tracing.DoInSpan(r.Context(), "receive_write_gate_ismyturn", func(ctx context.Context) { @@ -585,8 +553,8 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { } responseStatusCode := http.StatusOK - if err = h.handleRequest(ctx, rep, tenant, &wreq); err != nil { - level.Debug(tLogger).Log("msg", "failed to handle request", "err", err) + if err := h.handleRequest(ctx, rep, tenant, &wreq); err != nil { + level.Debug(tLogger).Log("msg", "failed to handle request", "err", err.Error()) switch errors.Cause(err) { case errNotReady: responseStatusCode = http.StatusServiceUnavailable @@ -606,6 +574,38 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { h.writeSamplesTotal.WithLabelValues(strconv.Itoa(responseStatusCode), tenant).Observe(float64(totalSamples)) } +func (h *Handler) handleRequest(ctx context.Context, rep uint64, tenant string, wreq *prompb.WriteRequest) error { + tLogger := log.With(h.logger, "tenant", tenant) + + // This replica value is used to detect cycles in cyclic topologies. + // A non-zero value indicates that the request has already been replicated by a previous receive instance. + // For almost all users, this is only used in fully connected topologies of IngestorRouter instances. + // For acyclic topologies that use RouterOnly and IngestorOnly instances, this causes issues when replicating data. + // See discussion in: https://github.com/thanos-io/thanos/issues/4359. 
+ if h.receiverMode == RouterOnly || h.receiverMode == IngestorOnly { + rep = 0 + } + + // The replica value in the header is one-indexed, thus we need >. + if rep > h.options.ReplicationFactor { + level.Error(tLogger).Log("err", errBadReplica, "msg", "write request rejected", + "request_replica", rep, "replication_factor", h.options.ReplicationFactor) + return errBadReplica + } + + r := replica{n: rep, replicated: rep != 0} + + // On the wire, format is 1-indexed and in-code is 0-indexed, so we decrement the value if it was already replicated. + if r.replicated { + r.n-- + } + + // Forward any time series as necessary. All time series + // destined for the local node will be written to the receiver. + // Time series will be replicated as necessary. + return h.forward(ctx, tenant, r, wreq) +} + // forward accepts a write request, batches its time series by // corresponding endpoint, and forwards them in parallel to the // correct endpoint. Requests destined for the local node are written @@ -635,55 +635,29 @@ func (h *Handler) forward(ctx context.Context, tenant string, r replica, wreq *p } } - wreqs := make(map[endpointReplica]trackedSeries) - for tsID, ts := range wreq.Timeseries { - for _, rn := range replicas { - endpoint, err := h.hashring.GetN(tenant, &ts, rn) - if err != nil { - h.mtx.RUnlock() - return err - } - key := endpointReplica{endpoint: endpoint, replica: rn} - writeTarget, ok := wreqs[key] - if !ok { - writeTarget = trackedSeries{ - seriesIDs: make([]int, 0), - timeSeries: make([]prompb.TimeSeries, 0), - } - } - writeTarget.timeSeries = append(wreqs[key].timeSeries, ts) - writeTarget.seriesIDs = append(wreqs[key].seriesIDs, tsID) - wreqs[key] = writeTarget - } + params := remoteWriteParams{ + tenant: tenant, + writeRequest: wreq, + replicas: replicas, + alreadyReplicated: r.replicated, } - h.mtx.RUnlock() - - return h.fanoutForward(ctx, tenant, wreqs, len(wreq.Timeseries), r.replicated) -} -// writeQuorum returns minimum number of replicas that has to confirm write success before claiming replication success. -func (h *Handler) writeQuorum() int { - return int((h.options.ReplicationFactor / 2) + 1) + return h.fanoutForward(ctx, params) } -func quorumReached(successes []int, successThreshold int) bool { - for _, success := range successes { - if success < successThreshold { - return false - } - } - - return true +type remoteWriteParams struct { + tenant string + writeRequest *prompb.WriteRequest + replicas []uint64 + alreadyReplicated bool } -// fanoutForward fans out concurrently given set of write requests. It returns status immediately when quorum of -// requests succeeds or fails or if context is canceled. -func (h *Handler) fanoutForward(pctx context.Context, tenant string, wreqs map[endpointReplica]trackedSeries, numSeries int, seriesReplicated bool) error { - var errs writeErrors +func (h *Handler) fanoutForward(ctx context.Context, params remoteWriteParams) error { + ctx, cancel := context.WithTimeout(tracing.CopyTraceContext(context.Background(), ctx), h.options.ForwardTimeout) - fctx, cancel := context.WithTimeout(tracing.CopyTraceContext(context.Background(), pctx), h.options.ForwardTimeout) + var writeErrors writeErrors defer func() { - if errs.ErrOrNil() != nil { + if writeErrors.ErrOrNil() != nil { // NOTICE: The cancel function is not used on all paths intentionally, // if there is no error when quorum is reached, // let forward requests to optimistically run until timeout. 
@@ -691,140 +665,25 @@ func (h *Handler) fanoutForward(pctx context.Context, tenant string, wreqs map[e } }() - var tLogger log.Logger - { - logTags := []interface{}{"tenant", tenant} - if id, ok := middleware.RequestIDFromContext(pctx); ok { - logTags = append(logTags, "request-id", id) - } - tLogger = log.With(h.logger, logTags...) - } - - // NOTE(GiedriusS): First write locally because inside of the function we check if the local TSDB has cached strings. - // If not then it copies those strings. This is so that the memory allocated for the - // protobuf (except for the labels) can be deallocated. - // This causes a write to the labels field. When fanning out this request to other Receivers, the code calls - // Size() which reads those same fields. We would like to avoid adding locks around each string - // hence we need to write locally first. - var maxBufferedResponses = 0 - for writeTarget := range wreqs { - if writeTarget.endpoint != h.options.Endpoint { - continue - } - maxBufferedResponses++ + logTags := []interface{}{"tenant", params.tenant} + if id, ok := middleware.RequestIDFromContext(ctx); ok { + logTags = append(logTags, "request-id", id) } + requestLogger := log.With(h.logger, logTags...) - responses := make(chan writeResponse, maxBufferedResponses) - - var wg sync.WaitGroup - - for writeTarget := range wreqs { - if writeTarget.endpoint != h.options.Endpoint { - continue - } - // If the endpoint for the write request is the - // local node, then don't make a request but store locally. - // By handing replication to the local node in the same - // function as replication to other nodes, we can treat - // a failure to write locally as just another error that - // can be ignored if the replication factor is met. - var err error - - tracing.DoInSpan(fctx, "receive_tsdb_write", func(_ context.Context) { - err = h.writer.Write(fctx, tenant, &prompb.WriteRequest{ - Timeseries: wreqs[writeTarget].timeSeries, - }) - }) - if err != nil { - level.Debug(tLogger).Log("msg", "local tsdb write failed", "err", err.Error()) - responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, errors.Wrapf(err, "store locally for endpoint %v", writeTarget.endpoint)) - continue - } - - responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, nil) + localWrites, remoteWrites, err := h.distributeTimeseriesToReplicas(params.tenant, params.replicas, params.writeRequest.Timeseries) + if err != nil { + level.Error(requestLogger).Log("msg", "failed to distribute timeseries to replicas", "err", err) + return err } - for writeTarget := range wreqs { - if writeTarget.endpoint == h.options.Endpoint { - continue - } - wg.Add(1) - // Make a request to the specified endpoint. - go func(writeTarget endpointReplica) { - defer wg.Done() - - var ( - err error - cl storepb.WriteableStoreClient - ) - defer func() { - // This is an actual remote forward request so report metric here. 
- if err != nil { - h.forwardRequests.WithLabelValues(labelError).Inc() - if !seriesReplicated { - h.replications.WithLabelValues(labelError).Inc() - } - return - } - h.forwardRequests.WithLabelValues(labelSuccess).Inc() - if !seriesReplicated { - h.replications.WithLabelValues(labelSuccess).Inc() - } - }() - cl, err = h.peers.get(fctx, writeTarget.endpoint) - if err != nil { - responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, errors.Wrapf(err, "get peer connection for endpoint %v", writeTarget.endpoint)) - return - } - - h.mtx.RLock() - b, ok := h.peerStates[writeTarget.endpoint] - if ok { - if time.Now().Before(b.nextAllowed) { - h.mtx.RUnlock() - responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, errors.Wrapf(errUnavailable, "backing off forward request for endpoint %v", writeTarget.endpoint)) - return - } - } - h.mtx.RUnlock() - - // Create a span to track the request made to another receive node. - tracing.DoInSpan(fctx, "receive_forward", func(ctx context.Context) { - // Actually make the request against the endpoint we determined should handle these time series. - _, err = cl.RemoteWrite(ctx, &storepb.WriteRequest{ - Timeseries: wreqs[writeTarget].timeSeries, - Tenant: tenant, - // Increment replica since on-the-wire format is 1-indexed and 0 indicates un-replicated. - Replica: int64(writeTarget.replica + 1), - }) - }) - if err != nil { - // Check if peer connection is unavailable, don't attempt to send requests constantly. - if st, ok := status.FromError(err); ok { - if st.Code() == codes.Unavailable { - h.mtx.Lock() - if b, ok := h.peerStates[writeTarget.endpoint]; ok { - b.attempt++ - dur := h.expBackoff.ForAttempt(b.attempt) - b.nextAllowed = time.Now().Add(dur) - level.Debug(tLogger).Log("msg", "target unavailable backing off", "for", dur) - } else { - h.peerStates[writeTarget.endpoint] = &retryState{nextAllowed: time.Now().Add(h.expBackoff.ForAttempt(0))} - } - h.mtx.Unlock() - } - } - werr := errors.Wrapf(err, "forwarding request to endpoint %v", writeTarget.endpoint) - responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, werr) - return - } - h.mtx.Lock() - delete(h.peerStates, writeTarget.endpoint) - h.mtx.Unlock() + // Prepare a buffered channel to receive the responses from the local and remote writes. Remote writes will all go + // asynchronously and with this capacity we will never block on writing to the channel. + maxBufferedResponses := len(localWrites) + len(remoteWrites) + responses := make(chan writeResponse, maxBufferedResponses) + wg := sync.WaitGroup{} - responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, nil) - }(writeTarget) - } + h.sendWrites(ctx, &wg, params, localWrites, remoteWrites, responses) go func() { wg.Wait() @@ -835,40 +694,42 @@ func (h *Handler) fanoutForward(pctx context.Context, tenant string, wreqs map[e // This is needed if context is canceled or if we reached success of fail quorum faster. 
defer func() { go func() { - for wresp := range responses { - if wresp.err != nil { - level.Debug(tLogger).Log("msg", "request failed, but not needed to achieve quorum", "err", wresp.err) + for resp := range responses { + if resp.err != nil { + level.Debug(requestLogger).Log("msg", "request failed, but not needed to achieve quorum", "err", resp.err) } } }() }() quorum := h.writeQuorum() - if seriesReplicated { + if params.alreadyReplicated { quorum = 1 } - successes := make([]int, numSeries) - seriesErrs := newReplicationErrors(quorum, numSeries) + successes := make([]int, len(params.writeRequest.Timeseries)) + seriesErrs := newReplicationErrors(quorum, len(params.writeRequest.Timeseries)) for { select { - case <-fctx.Done(): - return fctx.Err() - case wresp, more := <-responses: - if !more { - for _, rerr := range seriesErrs { - errs.Add(rerr) + case <-ctx.Done(): + return ctx.Err() + case resp, hasMore := <-responses: + if !hasMore { + for _, seriesErr := range seriesErrs { + writeErrors.Add(seriesErr) } - return errs.ErrOrNil() + return writeErrors.ErrOrNil() } - if wresp.err != nil { - for _, tsID := range wresp.seriesIDs { - seriesErrs[tsID].Add(wresp.err) + if resp.err != nil { + // Track errors and successes on a per-series basis. + for _, seriesID := range resp.seriesIDs { + seriesErrs[seriesID].Add(resp.err) } continue } - for _, tsID := range wresp.seriesIDs { - successes[tsID]++ + // At the end, aggregate all errors if there are any and return them. + for _, seriesID := range resp.seriesIDs { + successes[seriesID]++ } if quorumReached(successes, quorum) { return nil @@ -877,6 +738,170 @@ func (h *Handler) fanoutForward(pctx context.Context, tenant string, wreqs map[e } } +// distributeTimeseriesToReplicas distributes the given timeseries from the tenant to different endpoints in a manner +// that achieves the replication factor indicated by replicas. +// The first return value are the series that should be written to the local node. The second return value are the +// series that should be written to remote nodes. +func (h *Handler) distributeTimeseriesToReplicas( + tenant string, + replicas []uint64, + timeseries []prompb.TimeSeries, +) (map[endpointReplica]trackedSeries, map[endpointReplica]trackedSeries, error) { + h.mtx.RLock() + defer h.mtx.RUnlock() + remoteWrites := make(map[endpointReplica]trackedSeries) + localWrites := make(map[endpointReplica]trackedSeries) + for tsIndex, ts := range timeseries { + for _, rn := range replicas { + endpoint, err := h.hashring.GetN(tenant, &ts, rn) + if err != nil { + return nil, nil, err + } + endpointReplica := endpointReplica{endpoint: endpoint, replica: rn} + var writeDestination = remoteWrites + if endpoint == h.options.Endpoint { + writeDestination = localWrites + } + writeableSeries, ok := writeDestination[endpointReplica] + if !ok { + writeableSeries = trackedSeries{ + seriesIDs: make([]int, 0), + timeSeries: make([]prompb.TimeSeries, 0), + } + } + writeableSeries.timeSeries = append(writeDestination[endpointReplica].timeSeries, ts) + writeableSeries.seriesIDs = append(writeDestination[endpointReplica].seriesIDs, tsIndex) + writeDestination[endpointReplica] = writeableSeries + } + } + return localWrites, remoteWrites, nil +} + +// sendWrites sends the local and remote writes to execute concurrently, controlling them through the provided sync.WaitGroup. +// The responses from the writes are sent to the responses channel. 
+func (h *Handler) sendWrites( + ctx context.Context, + wg *sync.WaitGroup, + params remoteWriteParams, + localWrites map[endpointReplica]trackedSeries, + remoteWrites map[endpointReplica]trackedSeries, + responses chan<- writeResponse, +) { + // Do the writes to the local node first. This should be easy and fast. + for writeDestination := range localWrites { + func(writeDestination endpointReplica) { + h.sendLocalWrite(ctx, writeDestination, params.tenant, localWrites[writeDestination], responses) + }(writeDestination) + } + + // Do the writes to remote nodes. Run them all in parallel. + for writeDestination := range remoteWrites { + wg.Add(1) + go func(writeDestination endpointReplica) { + defer wg.Done() + h.sendRemoteWrite(ctx, params.tenant, writeDestination, remoteWrites[writeDestination], params.alreadyReplicated, responses) + }(writeDestination) + } +} + +// sendLocalWrite sends a write request to the local node. +// The responses are sent to the responses channel. +func (h *Handler) sendLocalWrite( + ctx context.Context, + writeDestination endpointReplica, + tenant string, + trackedSeries trackedSeries, + responses chan<- writeResponse, +) { + span, tracingCtx := tracing.StartSpan(ctx, "receive_local_tsdb_write") + defer span.Finish() + span.SetTag("endpoint", writeDestination.endpoint) + span.SetTag("replica", writeDestination.replica) + err := h.writer.Write(tracingCtx, tenant, &prompb.WriteRequest{ + Timeseries: trackedSeries.timeSeries, + }) + if err != nil { + span.SetTag("error", true) + span.SetTag("error.msg", err.Error()) + responses <- newWriteResponse(trackedSeries.seriesIDs, err) + return + } + responses <- newWriteResponse(trackedSeries.seriesIDs, nil) +} + +// sendRemoteWrite sends a write request to the remote node. It takes care of checking wether the endpoint is up or not +// in the peerGroup, correctly marking them as up or down when appropriate. +// The responses are sent to the responses channel. +func (h *Handler) sendRemoteWrite( + ctx context.Context, + tenant string, + endpointReplica endpointReplica, + trackedSeries trackedSeries, + alreadyReplicated bool, + responses chan<- writeResponse, +) { + endpoint := endpointReplica.endpoint + cl, err := h.peers.getConnection(ctx, endpoint) + if err != nil { + if errors.Is(err, errUnavailable) { + err = errors.Wrapf(errUnavailable, "backing off forward request for endpoint %v", endpointReplica) + } + responses <- newWriteResponse(trackedSeries.seriesIDs, err) + return + } + + span, spanCtx := tracing.StartSpan(ctx, "receive_forward") + // This is called "real" because it's 1-indexed. + realReplicationIndex := int64(endpointReplica.replica + 1) + span.SetTag("endpoint", endpointReplica.endpoint) + span.SetTag("replica", realReplicationIndex) + // Actually make the request against the endpoint we determined should handle these time series. + _, err = cl.RemoteWrite(spanCtx, &storepb.WriteRequest{ + Timeseries: trackedSeries.timeSeries, + Tenant: tenant, + // Increment replica since on-the-wire format is 1-indexed and 0 indicates un-replicated. + Replica: realReplicationIndex, + }) + if err != nil { + span.SetTag("error", true) + span.SetTag("error.msg", err.Error()) + // Check if peer connection is unavailable, update the peer state to avoid spamming that peer. 
+ if st, ok := status.FromError(err); ok { + if st.Code() == codes.Unavailable { + h.peers.markPeerUnavailable(endpoint) + } + } + h.forwardRequests.WithLabelValues(labelError).Inc() + if !alreadyReplicated { + h.replications.WithLabelValues(labelError).Inc() + } + responses <- newWriteResponse(trackedSeries.seriesIDs, err) + return + } + span.Finish() + h.forwardRequests.WithLabelValues(labelSuccess).Inc() + if !alreadyReplicated { + h.replications.WithLabelValues(labelSuccess).Inc() + } + responses <- newWriteResponse(trackedSeries.seriesIDs, nil) + h.peers.markPeerAvailable(endpoint) +} + +// writeQuorum returns minimum number of replicas that has to confirm write success before claiming replication success. +func (h *Handler) writeQuorum() int { + return int((h.options.ReplicationFactor / 2) + 1) +} + +func quorumReached(successes []int, successThreshold int) bool { + for _, success := range successes { + if success < successThreshold { + return false + } + } + + return true +} + // RemoteWrite implements the gRPC remote write handler for storepb.WriteableStore. func (h *Handler) RemoteWrite(ctx context.Context, r *storepb.WriteRequest) (*storepb.WriteResponse, error) { span, ctx := tracing.StartSpan(ctx, "receive_grpc") @@ -1161,24 +1186,32 @@ func newReplicationErrors(threshold, numErrors int) []*replicationErrors { return errs } -func newPeerGroup(dialOpts ...grpc.DialOption) peersContainer { +func newPeerGroup(backoff backoff.Backoff, dialOpts ...grpc.DialOption) peersContainer { return &peerGroup{ - dialOpts: dialOpts, - cache: map[string]*grpc.ClientConn{}, - m: sync.RWMutex{}, - dialer: grpc.DialContext, + dialOpts: dialOpts, + connections: map[string]*grpc.ClientConn{}, + m: sync.RWMutex{}, + dialer: grpc.DialContext, + peerStates: make(map[string]*retryState), + expBackoff: backoff, } } type peersContainer interface { close(string) error - get(context.Context, string) (storepb.WriteableStoreClient, error) + getConnection(context.Context, string) (storepb.WriteableStoreClient, error) + markPeerUnavailable(string) + markPeerAvailable(string) + reset() } type peerGroup struct { - dialOpts []grpc.DialOption - cache map[string]*grpc.ClientConn - m sync.RWMutex + dialOpts []grpc.DialOption + connections map[string]*grpc.ClientConn + peerStates map[string]*retryState + expBackoff backoff.Backoff + + m sync.RWMutex // dialer is used for testing. dialer func(ctx context.Context, target string, opts ...grpc.DialOption) (conn *grpc.ClientConn, err error) @@ -1188,14 +1221,14 @@ func (p *peerGroup) close(addr string) error { p.m.Lock() defer p.m.Unlock() - c, ok := p.cache[addr] + c, ok := p.connections[addr] if !ok { // NOTE(GiedriusS): this could be valid case when the connection // was never established. return nil } - delete(p.cache, addr) + delete(p.connections, addr) if err := c.Close(); err != nil { return fmt.Errorf("closing connection for %s", addr) } @@ -1203,10 +1236,14 @@ func (p *peerGroup) close(addr string) error { return nil } -func (p *peerGroup) get(ctx context.Context, addr string) (storepb.WriteableStoreClient, error) { +func (p *peerGroup) getConnection(ctx context.Context, addr string) (storepb.WriteableStoreClient, error) { + if !p.isPeerUp(addr) { + return nil, errUnavailable + } + // use a RLock first to prevent blocking if we don't need to. 
p.m.RLock() - c, ok := p.cache[addr] + c, ok := p.connections[addr] p.m.RUnlock() if ok { return storepb.NewWriteableStoreClient(c), nil @@ -1215,15 +1252,51 @@ func (p *peerGroup) get(ctx context.Context, addr string) (storepb.WriteableStor p.m.Lock() defer p.m.Unlock() // Make sure that another caller hasn't created the connection since obtaining the write lock. - c, ok = p.cache[addr] + c, ok = p.connections[addr] if ok { return storepb.NewWriteableStoreClient(c), nil } conn, err := p.dialer(ctx, addr, p.dialOpts...) if err != nil { - return nil, errors.Wrap(err, "failed to dial peer") + p.markPeerUnavailable(addr) + dialError := errors.Wrap(err, "failed to dial peer") + return nil, errors.Wrap(dialError, errUnavailable.Error()) } - p.cache[addr] = conn + p.connections[addr] = conn return storepb.NewWriteableStoreClient(conn), nil } + +func (p *peerGroup) markPeerUnavailable(addr string) { + p.m.Lock() + defer p.m.Unlock() + + state, ok := p.peerStates[addr] + if !ok { + state = &retryState{attempt: -1} + } + state.attempt++ + state.nextAllowed = time.Now().Add(p.expBackoff.ForAttempt(state.attempt)) + p.peerStates[addr] = state +} + +func (p *peerGroup) markPeerAvailable(addr string) { + p.m.Lock() + defer p.m.Unlock() + delete(p.peerStates, addr) +} + +func (p *peerGroup) isPeerUp(addr string) bool { + p.m.RLock() + defer p.m.RUnlock() + state, ok := p.peerStates[addr] + if !ok { + return true + } + return time.Now().After(state.nextAllowed) +} + +func (p *peerGroup) reset() { + p.expBackoff.Reset() + p.peerStates = make(map[string]*retryState) +} diff --git a/pkg/receive/handler_test.go b/pkg/receive/handler_test.go index 7eadfee930..a693843aff 100644 --- a/pkg/receive/handler_test.go +++ b/pkg/receive/handler_test.go @@ -44,6 +44,7 @@ import ( "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/extkingpin" + "github.com/thanos-io/thanos/pkg/logging" "github.com/thanos-io/thanos/pkg/runutil" "github.com/thanos-io/thanos/pkg/store/labelpb" "github.com/thanos-io/thanos/pkg/store/storepb" @@ -172,6 +173,15 @@ type fakePeersGroup struct { closeCalled map[string]bool } +func (g *fakePeersGroup) markPeerUnavailable(s string) { +} + +func (g *fakePeersGroup) markPeerAvailable(s string) { +} + +func (g *fakePeersGroup) reset() { +} + func (g *fakePeersGroup) close(addr string) error { if g.closeCalled == nil { g.closeCalled = map[string]bool{} @@ -180,7 +190,7 @@ func (g *fakePeersGroup) close(addr string) error { return nil } -func (g *fakePeersGroup) get(_ context.Context, addr string) (storepb.WriteableStoreClient, error) { +func (g *fakePeersGroup) getConnection(_ context.Context, addr string) (storepb.WriteableStoreClient, error) { c, ok := g.clients[addr] if !ok { return nil, fmt.Errorf("client %s not found", addr) @@ -202,8 +212,9 @@ func newTestHandlerHashring(appendables []*fakeAppendable, replicationFactor uin ag := addrGen{} limiter, _ := NewLimiter(NewNopConfig(), nil, RouterIngestor, log.NewNopLogger(), 1*time.Second) + logger := logging.NewLogger("debug", "logfmt", "receive_test") for i := range appendables { - h := NewHandler(nil, &Options{ + h := NewHandler(logger, &Options{ TenantHeader: tenancy.DefaultTenantHeader, ReplicaHeader: DefaultReplicaHeader, ReplicationFactor: replicationFactor, @@ -248,6 +259,7 @@ func testReceiveQuorum(t *testing.T, hashringAlgo HashringAlgorithm, withConsist replicationFactor uint64 wreq *prompb.WriteRequest appendables []*fakeAppendable + randomNode bool }{ { name: "size 1 success", @@ -605,17 +617,29 @@ func 
testReceiveQuorum(t *testing.T, hashringAlgo HashringAlgorithm, withConsist t.Fatalf("unable to create test handler: %v", err) } tenant := "test" - // Test from the point of view of every node - // so that we know status code does not depend - // on which node is erroring and which node is receiving. - for i, handler := range handlers { - // Test that the correct status is returned. + + if tc.randomNode { + handler := handlers[0] rec, err := makeRequest(handler, tenant, tc.wreq) if err != nil { - t.Fatalf("handler %d: unexpectedly failed making HTTP request: %v", i+1, err) + t.Fatalf("handler: unexpectedly failed making HTTP request: %v", err) } if rec.Code != tc.status { - t.Errorf("handler %d: got unexpected HTTP status code: expected %d, got %d; body: %s", i+1, tc.status, rec.Code, rec.Body.String()) + t.Errorf("handler: got unexpected HTTP status code: expected %d, got %d; body: %s", tc.status, rec.Code, rec.Body.String()) + } + } else { + // Test from the point of view of every node + // so that we know status code does not depend + // on which node is erroring and which node is receiving. + for i, handler := range handlers { + // Test that the correct status is returned. + rec, err := makeRequest(handler, tenant, tc.wreq) + if err != nil { + t.Fatalf("handler %d: unexpectedly failed making HTTP request: %v", i+1, err) + } + if rec.Code != tc.status { + t.Errorf("handler %d: got unexpected HTTP status code: expected %d, got %d; body: %s", i+1, tc.status, rec.Code, rec.Body.String()) + } } } @@ -654,6 +678,9 @@ func testReceiveQuorum(t *testing.T, hashringAlgo HashringAlgorithm, withConsist // We have len(handlers) copies of each sample because the test case // is run once for each handler and they all use the same appender. expectedMin = int((tc.replicationFactor/2)+1) * len(ts.Samples) + if tc.randomNode { + expectedMin = len(ts.Samples) + } } if uint64(expectedMin) > got { t.Errorf("handler: %d, labels %q: expected minimum of %d samples, got %d", j, lset.String(), expectedMin, got) diff --git a/pkg/server/http/middleware/request_id.go b/pkg/server/http/middleware/request_id.go index f09496b71c..fddeabf932 100644 --- a/pkg/server/http/middleware/request_id.go +++ b/pkg/server/http/middleware/request_id.go @@ -33,8 +33,8 @@ func RequestID(h http.Handler) http.HandlerFunc { reqID := r.Header.Get("X-Request-ID") if reqID == "" { entropy := ulid.Monotonic(rand.New(rand.NewSource(time.Now().UnixNano())), 0) - reqID := ulid.MustNew(ulid.Timestamp(time.Now()), entropy) - r.Header.Set("X-Request-ID", reqID.String()) + reqID = ulid.MustNew(ulid.Timestamp(time.Now()), entropy).String() + r.Header.Set("X-Request-ID", reqID) } ctx := newContextWithRequestID(r.Context(), reqID) h.ServeHTTP(w, r.WithContext(ctx))
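The rewritten `fanoutForward` above no longer derives its working context from the incoming request: it builds one from `context.Background()` plus the copied trace context, with its own `ForwardTimeout`, so replication still in flight when quorum is reached is not cancelled together with the HTTP request. A minimal standalone sketch of that pattern, with Thanos types and tracing omitted and purely illustrative durations:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// replicate simulates one remote replication request. It fails if ctx is
// cancelled before the (fake) network round trip finishes.
func replicate(ctx context.Context, d time.Duration) error {
	select {
	case <-time.After(d):
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	// reqCtx stands in for the HTTP request context: it is cancelled as soon
	// as the handler writes its response, i.e. as soon as quorum is reached.
	reqCtx, cancelReq := context.WithCancel(context.Background())

	// The fix: derive the forwarding context from context.Background(), not
	// from reqCtx, and give it its own forward timeout. (The real handler also
	// copies the trace context across with tracing.CopyTraceContext.)
	fwdCtx, cancelFwd := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancelFwd()

	slow := make(chan error, 1)
	go func() { slow <- replicate(fwdCtx, 500*time.Millisecond) }() // still replicating

	_ = replicate(fwdCtx, 50*time.Millisecond) // fast replicas already gave us quorum
	cancelReq()                                // handler returns, request context dies
	<-reqCtx.Done()

	fmt.Println("slow replica error:", <-slow) // <nil>: replication was not aborted
}
```

The slow replica completes even though the request context was cancelled first, which is exactly the behaviour the "Fix concurrency issue" commit is after.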
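The fan-out sizes the `responses` channel to `len(localWrites) + len(remoteWrites)`, so every writer can deliver its result without blocking even after the reader has already returned on quorum, and a detached goroutine drains whatever arrives late. A simplified sketch of that shape, with fake writes and without the per-series bookkeeping:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	writes := []time.Duration{10 * time.Millisecond, 20 * time.Millisecond, 300 * time.Millisecond}
	quorum := 2

	// One buffer slot per write: a writer never blocks, even if nobody reads.
	responses := make(chan error, len(writes))
	var wg sync.WaitGroup
	for _, d := range writes {
		wg.Add(1)
		go func(d time.Duration) {
			defer wg.Done()
			time.Sleep(d) // pretend remote write
			responses <- nil
		}(d)
	}
	// Close the channel once every writer has reported, so readers know
	// nothing else is coming.
	go func() {
		wg.Wait()
		close(responses)
	}()

	successes := 0
	for err := range responses {
		if err == nil {
			successes++
		}
		if successes >= quorum {
			fmt.Println("quorum reached, responding to the client now")
			break // the remaining write keeps running in the background
		}
	}
	// Drain the leftovers in a detached goroutine, as the handler does.
	go func() {
		for range responses {
		}
	}()
	time.Sleep(400 * time.Millisecond) // let the slow write finish for the demo
	fmt.Println("done")
}
```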
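The quorum rules are untouched by the refactor: `writeQuorum` is `floor(RF/2) + 1`, `quorumReached` requires every series in the request to reach that threshold individually, and an already-replicated request only needs a quorum of 1. The two helpers as they appear in the patch, plus a small worked example:

```go
package main

import "fmt"

// writeQuorum mirrors the handler: with replication factor N, a write is
// acknowledged once floor(N/2)+1 replicas have confirmed it.
func writeQuorum(replicationFactor uint64) int {
	return int((replicationFactor / 2) + 1)
}

// quorumReached is the per-series check used in fanoutForward: every series in
// the request must individually hit the success threshold.
func quorumReached(successes []int, successThreshold int) bool {
	for _, success := range successes {
		if success < successThreshold {
			return false
		}
	}
	return true
}

func main() {
	for _, rf := range []uint64{1, 2, 3, 5} {
		fmt.Printf("replication factor %d -> quorum %d\n", rf, writeQuorum(rf))
	}
	// Three series, replication factor 3 (quorum 2): the request only succeeds
	// once each series has been written by at least two replicas.
	successes := []int{2, 2, 1}
	fmt.Println("quorum reached?", quorumReached(successes, writeQuorum(3))) // false
	successes[2]++
	fmt.Println("quorum reached?", quorumReached(successes, writeQuorum(3))) // true
}
```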
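`distributeTimeseriesToReplicas` assigns every (series, replica) pair to a hashring endpoint and splits the result into one map of local writes and one of remote writes, both keyed by `endpointReplica`. The sketch below mirrors that grouping with heavily simplified stand-ins: the hash-mod `getN` is a placeholder for the real hashring implementations and plain strings replace `prompb.TimeSeries`:

```go
package main

import (
	"fmt"
	"hash/fnv"
)

type endpointReplica struct {
	endpoint string
	replica  uint64
}

type trackedSeries struct {
	seriesIDs  []int
	timeSeries []string // stands in for []prompb.TimeSeries
}

// getN is a toy hashring: pick an endpoint per series and replica number.
func getN(endpoints []string, series string, n uint64) string {
	h := fnv.New32a()
	h.Write([]byte(series))
	return endpoints[(uint64(h.Sum32())+n)%uint64(len(endpoints))]
}

// distribute mirrors distributeTimeseriesToReplicas: each series goes to one
// endpoint per requested replica, split into local and remote destinations.
func distribute(local string, endpoints []string, replicas []uint64, series []string) (localW, remoteW map[endpointReplica]trackedSeries) {
	localW = map[endpointReplica]trackedSeries{}
	remoteW = map[endpointReplica]trackedSeries{}
	for i, s := range series {
		for _, rn := range replicas {
			key := endpointReplica{endpoint: getN(endpoints, s, rn), replica: rn}
			dest := remoteW
			if key.endpoint == local {
				dest = localW
			}
			t := dest[key]
			t.seriesIDs = append(t.seriesIDs, i)
			t.timeSeries = append(t.timeSeries, s)
			dest[key] = t
		}
	}
	return localW, remoteW
}

func main() {
	endpoints := []string{"receive-0:10901", "receive-1:10901", "receive-2:10901"}
	localW, remoteW := distribute("receive-0:10901", endpoints, []uint64{0, 1}, []string{`up{job="a"}`, `up{job="b"}`})
	fmt.Println("local: ", localW)
	fmt.Println("remote:", remoteW)
}
```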
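Per-peer backoff moves from the handler into `peerGroup`: `markPeerUnavailable` bumps the address's attempt counter and schedules the next allowed try from the exponential backoff, `markPeerAvailable` clears the state after a successful write, and `isPeerUp` gates `getConnection`. The sketch below lifts that state machine out so it runs standalone; it assumes the `github.com/jpillora/backoff` package that provides the `backoff.Backoff` literal used in the patch:

```go
package main

import (
	"fmt"
	"sync"
	"time"

	"github.com/jpillora/backoff"
)

// retryState mirrors the per-peer state kept by peerGroup: how many times the
// peer failed and when we may try it again.
type retryState struct {
	attempt     float64
	nextAllowed time.Time
}

type peerTracker struct {
	mu         sync.RWMutex
	expBackoff backoff.Backoff
	peerStates map[string]*retryState
}

func (p *peerTracker) markPeerUnavailable(addr string) {
	p.mu.Lock()
	defer p.mu.Unlock()
	state, ok := p.peerStates[addr]
	if !ok {
		state = &retryState{attempt: -1}
	}
	state.attempt++
	state.nextAllowed = time.Now().Add(p.expBackoff.ForAttempt(state.attempt))
	p.peerStates[addr] = state
}

func (p *peerTracker) markPeerAvailable(addr string) {
	p.mu.Lock()
	defer p.mu.Unlock()
	delete(p.peerStates, addr)
}

func (p *peerTracker) isPeerUp(addr string) bool {
	p.mu.RLock()
	defer p.mu.RUnlock()
	state, ok := p.peerStates[addr]
	if !ok {
		return true // never failed: always worth trying
	}
	return time.Now().After(state.nextAllowed)
}

func main() {
	p := &peerTracker{
		expBackoff: backoff.Backoff{Factor: 2, Min: 100 * time.Millisecond, Max: 5 * time.Second},
		peerStates: map[string]*retryState{},
	}
	addr := "receive-1:10901"
	for i := 0; i < 3; i++ {
		p.markPeerUnavailable(addr) // e.g. gRPC returned codes.Unavailable
		fmt.Println("up?", p.isPeerUp(addr), "next retry in", p.expBackoff.ForAttempt(float64(i)))
	}
	p.markPeerAvailable(addr) // a successful RemoteWrite clears the state
	fmt.Println("up?", p.isPeerUp(addr))
}
```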
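`getConnection` keeps the renamed `connections` cache behind the usual two-step locking: a cheap `RLock` fast path, then a write lock with a re-check before dialing so concurrent callers do not dial the same address twice. A generic illustration of that pattern with a pretend dialer instead of gRPC:

```go
package main

import (
	"fmt"
	"sync"
)

// connCache shows the read-lock-then-write-lock pattern used by
// peerGroup.getConnection: most calls only take the cheap RLock; the expensive
// dial happens under the write lock, and the cache is re-checked there in case
// another goroutine dialed the same address while we waited for the lock.
type connCache struct {
	mu    sync.RWMutex
	conns map[string]string // address -> pretend connection
	dials int
}

func (c *connCache) get(addr string) string {
	c.mu.RLock()
	conn, ok := c.conns[addr]
	c.mu.RUnlock()
	if ok {
		return conn
	}

	c.mu.Lock()
	defer c.mu.Unlock()
	// Re-check: someone else may have created the connection already.
	if conn, ok := c.conns[addr]; ok {
		return conn
	}
	c.dials++
	conn = fmt.Sprintf("conn-to-%s", addr)
	c.conns[addr] = conn
	return conn
}

func main() {
	c := &connCache{conns: map[string]string{}}
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func() { defer wg.Done(); c.get("receive-1:10901") }()
	}
	wg.Wait()
	fmt.Println("connections:", len(c.conns), "dials:", c.dials) // 1 connection, 1 dial
}
```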
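Finally, the `request_id.go` hunk fixes a `:=` shadowing bug: the freshly generated ULID was assigned to a new inner `reqID`, so the outer variable that flows into the request context stayed empty. A tiny reproduction of the bug and the fix; the ID string is just a placeholder:

```go
package main

import "fmt"

func generateID() string { return "01HMB2ZV5N6T3PLACEHOLDER" } // stands in for a fresh ULID

func main() {
	// Buggy version: ':=' inside the if declares a new, shadowed reqID, so the
	// outer variable that later flows into the request context stays empty.
	reqID := "" // e.g. the X-Request-ID header was not set
	if reqID == "" {
		reqID := generateID() // shadows the outer reqID
		_ = reqID
	}
	fmt.Printf("buggy: context gets %q\n", reqID) // ""

	// Fixed version (what the patch does): assign to the existing variable.
	reqID2 := ""
	if reqID2 == "" {
		reqID2 = generateID()
	}
	fmt.Printf("fixed: context gets %q\n", reqID2)
}
```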