Skip to content

Commit

Permalink
memberlist WriteTo: Track dropped packets properly (#611)
Browse files Browse the repository at this point in the history
- Do not increment `packets_sent_errors_total` for dropped packets; instead, add a new `packets_dropped_total` counter so these drops can be tracked separately.
- Log these messages at debug level instead of warn. They are all nearly identical and do not need to be at "warn" level.
  • Loading branch information
julienduchesne authored Oct 15, 2024
1 parent d3f80b0 commit 21f60cf
Showing 1 changed file with 11 additions and 2 deletions.
13 changes: 11 additions & 2 deletions kv/memberlist/tcp_transport.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ type TCPTransport struct {
sentPackets prometheus.Counter
sentPacketsBytes prometheus.Counter
sentPacketsErrors prometheus.Counter
droppedPackets prometheus.Counter
unknownConnections prometheus.Counter
}

Expand Down Expand Up @@ -463,8 +464,9 @@ func (t *TCPTransport) WriteTo(b []byte, addr string) (time.Time, error) {
// If acquiring a writer blocks for longer than the configured timeout, give up, log it, and drop the message.
select {
case <-time.After(t.cfg.AcquireWriterTimeout):
level.Warn(t.logger).Log("msg", "WriteTo failed to acquire a writer. Dropping message", "timeout", t.cfg.AcquireWriterTimeout, "addr", addr)
t.sentPacketsErrors.Inc()
// Dropped packets are not an issue, the memberlist protocol will retry later.
level.Debug(t.logger).Log("msg", "WriteTo failed to acquire a writer. Dropping message", "timeout", t.cfg.AcquireWriterTimeout, "addr", addr)
t.droppedPackets.Inc()
// WriteTo is used to send "UDP" packets. Since we use TCP, we can detect more errors,
// but the memberlist library doesn't seem to cope with that very well. That is why we return nil instead.
return time.Now(), nil
Expand Down Expand Up @@ -681,6 +683,13 @@ func (t *TCPTransport) registerMetrics(registerer prometheus.Registerer) {
Help: "Number of errors when receiving memberlist packets",
})

t.droppedPackets = promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Namespace: t.cfg.MetricsNamespace,
Subsystem: subsystem,
Name: "packets_dropped_total",
Help: "Number of dropped memberlist packets. These packets were not sent due to timeout waiting for a writer.",
})

t.sentPackets = promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Namespace: t.cfg.MetricsNamespace,
Subsystem: subsystem,
Expand Down

0 comments on commit 21f60cf

Please sign in to comment.