From 21f60cf427aaa039d75de94fb48de0a3733889f5 Mon Sep 17 00:00:00 2001 From: Julien Duchesne Date: Tue, 15 Oct 2024 16:07:41 -0400 Subject: [PATCH] memberlist `WriteTo`: Track dropped packets properly (#611) - Do not increment the `packets_sent_errors_total`, instead have a new `packets_dropped_total` so we can isolate these - Debug messages instead of warn. These logs are all mostly the same, they don't need to be "warn" level --- kv/memberlist/tcp_transport.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kv/memberlist/tcp_transport.go b/kv/memberlist/tcp_transport.go index 2010d3919..241d25b71 100644 --- a/kv/memberlist/tcp_transport.go +++ b/kv/memberlist/tcp_transport.go @@ -123,6 +123,7 @@ type TCPTransport struct { sentPackets prometheus.Counter sentPacketsBytes prometheus.Counter sentPacketsErrors prometheus.Counter + droppedPackets prometheus.Counter unknownConnections prometheus.Counter } @@ -463,8 +464,9 @@ func (t *TCPTransport) WriteTo(b []byte, addr string) (time.Time, error) { // If this blocks for too long (as configured), abort and log an error. select { case <-time.After(t.cfg.AcquireWriterTimeout): - level.Warn(t.logger).Log("msg", "WriteTo failed to acquire a writer. Dropping message", "timeout", t.cfg.AcquireWriterTimeout, "addr", addr) - t.sentPacketsErrors.Inc() + // Dropped packets are not an issue, the memberlist protocol will retry later. + level.Debug(t.logger).Log("msg", "WriteTo failed to acquire a writer. Dropping message", "timeout", t.cfg.AcquireWriterTimeout, "addr", addr) + t.droppedPackets.Inc() // WriteTo is used to send "UDP" packets. Since we use TCP, we can detect more errors, // but memberlist library doesn't seem to cope with that very well. That is why we return nil instead. return time.Now(), nil @@ -681,6 +683,13 @@ func (t *TCPTransport) registerMetrics(registerer prometheus.Registerer) { Help: "Number of errors when receiving memberlist packets", }) + t.droppedPackets = promauto.With(registerer).NewCounter(prometheus.CounterOpts{ + Namespace: t.cfg.MetricsNamespace, + Subsystem: subsystem, + Name: "packets_dropped_total", + Help: "Number of dropped memberlist packets. These packets were not sent due to timeout waiting for a writer.", + }) + t.sentPackets = promauto.With(registerer).NewCounter(prometheus.CounterOpts{ Namespace: t.cfg.MetricsNamespace, Subsystem: subsystem,