From 82778ed8c9bd6c35db02b8cf6653a6ee68aaeffe Mon Sep 17 00:00:00 2001
From: Dmitry S <11892559+swift1337@users.noreply.github.com>
Date: Tue, 19 Nov 2024 14:38:32 +0100
Subject: [PATCH] Add prometheus metrics

---
 zetaclient/tss/keysign.go |  5 +-
 zetaclient/tss/service.go | 96 ++++++++++++++++++++++++++++++++-------
 2 files changed, 81 insertions(+), 20 deletions(-)

diff --git a/zetaclient/tss/keysign.go b/zetaclient/tss/keysign.go
index b10918a96f..6164241979 100644
--- a/zetaclient/tss/keysign.go
+++ b/zetaclient/tss/keysign.go
@@ -9,7 +9,6 @@ import (
 	"github.com/rs/zerolog"
 	tsscommon "gitlab.com/thorchain/tss/go-tss/common"
 	"gitlab.com/thorchain/tss/go-tss/keysign"
-	"gitlab.com/thorchain/tss/go-tss/tss"
 
 	"github.com/zeta-chain/node/pkg/cosmos"
 	"github.com/zeta-chain/node/zetaclient/logs"
@@ -22,7 +21,7 @@ var (
 )
 
 // TestKeySign performs a TSS key-sign test of sample data.
-func TestKeySign(tssServer *tss.TssServer, tssPubKey string, logger zerolog.Logger) error {
+func TestKeySign(keySigner KeySigner, tssPubKey string, logger zerolog.Logger) error {
 	logger = logger.With().Str(logs.FieldModule, "tss_keysign").Logger()
 
 	hashedData := crypto.Keccak256Hash(testKeySignData)
@@ -40,7 +39,7 @@ func TestKeySign(tssServer *tss.TssServer, tssPubKey string, logger zerolog.Logg
 		Version,
 	)
 
-	res, err := tssServer.KeySign(req)
+	res, err := keySigner.KeySign(req)
 	switch {
 	case err != nil:
 		return errors.Wrap(err, "key signing request error")
diff --git a/zetaclient/tss/service.go b/zetaclient/tss/service.go
index b9b69f1718..0457462a6d 100644
--- a/zetaclient/tss/service.go
+++ b/zetaclient/tss/service.go
@@ -6,9 +6,11 @@ import (
 	"encoding/hex"
 	"fmt"
 	"strings"
+	"time"
 
 	"github.com/btcsuite/btcd/chaincfg/chainhash"
 	"github.com/pkg/errors"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/rs/zerolog"
 	thorcommon "gitlab.com/thorchain/tss/go-tss/common"
 	"gitlab.com/thorchain/tss/go-tss/keysign"
@@ -30,11 +32,21 @@ type Service struct {
 	currentPubKey PubKey
 	postBlame     bool
 
-	logger zerolog.Logger
+	metrics *Metrics
+
+	logger zerolog.Logger
+}
+
+// Metrics Prometheus metrics for the TSS service.
+type Metrics struct {
+	ActiveMsgsSigns    prometheus.Gauge
+	SignLatency        *prometheus.HistogramVec
+	NodeBlamePerPubKey *prometheus.CounterVec
 }
 
 type serviceConfig struct {
 	postBlame bool
+	metrics   *Metrics
 }
 
 // Opt Service option.
@@ -48,11 +60,36 @@ func WithPostBlame(postBlame bool) Opt {
 	}
 }
 
+// WithMetrics registers Prometheus metrics for the TSS service.
+// Otherwise, no metrics will be collected.
+func WithMetrics(ctx context.Context, zetacore interfaces.ZetacoreClient, m *Metrics) Opt {
+	return func(cfg *serviceConfig, _ zerolog.Logger) error {
+		keygen, err := zetacore.GetKeyGen(ctx)
+		if err != nil {
+			return errors.Wrap(err, "failed to get keygen (WithMetrics)")
+		}
+
+		m.ActiveMsgsSigns.Set(0)
+		m.SignLatency.Reset()
+		m.NodeBlamePerPubKey.Reset()
+
+		for _, granteeBech32 := range keygen.GranteePubkeys {
+			m.NodeBlamePerPubKey.WithLabelValues(granteeBech32).Inc()
+		}
+
+		cfg.metrics = m
+
+		return nil
+	}
+}
+
+var noopMetrics = Metrics{
+	ActiveMsgsSigns:    prometheus.NewGauge(prometheus.GaugeOpts{Name: "noop"}),
+	SignLatency:        prometheus.NewHistogramVec(prometheus.HistogramOpts{Name: "noop"}, []string{"result"}),
+	NodeBlamePerPubKey: prometheus.NewCounterVec(prometheus.CounterOpts{Name: "noop"}, []string{"pubkey"}),
+}
+
 // NewService Service constructor.
-// TODO Constructor
-// TODO PubKey struct
-// TODO Test cases for bootstrap
-// TODO metrics
 // TODO LRU cache
 func NewService(
 	keySigner KeySigner,
@@ -63,26 +100,30 @@ func NewService(
 ) (*Service, error) {
 	logger = logger.With().Str(logs.FieldModule, "tss_service").Logger()
 
-	// Apply opts
-	var cfg serviceConfig
+	cfg := serviceConfig{
+		metrics:   &noopMetrics,
+		postBlame: false,
+	}
+
 	for _, opt := range opts {
 		if err := opt(&cfg, logger); err != nil {
 			return nil, errors.Wrap(err, "failed to apply tss config option")
 		}
 	}
 
-	currentTSSPubKey, err := NewPubKeyFromBech32(tssPubKeyBech32)
+	// Represents the current TSS public key.
+	// FWIW, based on this, we can derive EVM / BTC addresses.
+	currentPubKey, err := NewPubKeyFromBech32(tssPubKeyBech32)
 	if err != nil {
 		return nil, errors.Wrap(err, "invalid tss pub key")
 	}
 
-	// todo metrics
-
 	return &Service{
 		tss:           keySigner,
-		currentPubKey: currentTSSPubKey,
+		currentPubKey: currentPubKey,
 		zetacore:      zc,
 		postBlame:     cfg.postBlame,
+		metrics:       cfg.metrics,
 		logger:        logger,
 	}, nil
 }
@@ -159,12 +200,30 @@ func (s *Service) SignBatch(
 	return signatures, nil
 }
 
-func (s *Service) sign(req keysign.Request) (keysign.Response, error) {
-	// todo track signs (metrics)
-	res, err := s.tss.KeySign(req)
-	// todo finish tracking
+var (
+	signLabelsSuccess = prometheus.Labels{"result": "success"}
+	signLabelsError   = prometheus.Labels{"result": "error"}
+)
+
+// sign sends TSS key sign request to the underlying go-tss and registers metrics
+func (s *Service) sign(req keysign.Request) (res keysign.Response, err error) {
+	// metrics start
+	messagesCount, start := float64(len(req.Messages)), time.Now()
+	s.metrics.ActiveMsgsSigns.Add(messagesCount)
+
+	// metrics finish
+	defer func() {
+		s.metrics.ActiveMsgsSigns.Sub(messagesCount)
+
+		latency := time.Since(start).Seconds()
+		if err == nil && res.Status == thorcommon.Success {
+			s.metrics.SignLatency.With(signLabelsSuccess).Observe(latency)
+		} else {
+			s.metrics.SignLatency.With(signLabelsError).Observe(latency)
+		}
+	}()
 
-	return res, err
+	return s.tss.KeySign(req)
 }
 
 func (s *Service) blameFailure(
@@ -184,7 +243,10 @@ func (s *Service) blameFailure(
 		Interface("keysign.fail_blame", res.Blame).
 		Msg("Keysign failed")
 
-	// todo inc blame metrics
+	// register blame metrics
+	for _, node := range res.Blame.BlameNodes {
+		s.metrics.NodeBlamePerPubKey.WithLabelValues(node.Pubkey).Inc()
+	}
 
 	if !s.postBlame {
 		return errFailure