From be48f077c4748915d407b288c1fcbb573c61bd81 Mon Sep 17 00:00:00 2001 From: Alex Gartner Date: Fri, 26 Jul 2024 11:59:06 -0700 Subject: [PATCH] feat(zetaclient): add TSS sign latency histogram (#2567) * feat(zetaclient): add TSS sign latency histogram * add changelog entry and another bin --- changelog.md | 1 + .../localnet/docker-compose-monitoring.yml | 2 -- zetaclient/metrics/metrics.go | 8 +++++ zetaclient/tss/concurrent_keysigns_tracker.go | 33 +++++++++++++------ .../tss/concurrent_keysigns_tracker_test.go | 9 +++-- zetaclient/tss/tss_signer.go | 8 ++--- 6 files changed, 40 insertions(+), 21 deletions(-) diff --git a/changelog.md b/changelog.md index 75311aa4f2..b5c5744d19 100644 --- a/changelog.md +++ b/changelog.md @@ -35,6 +35,7 @@ * [2497](https://github.com/zeta-chain/node/pull/2416) - support for runtime chain (de)provisioning * [2518](https://github.com/zeta-chain/node/pull/2518) - add support for Solana address in zetacore * [2483](https://github.com/zeta-chain/node/pull/2483) - add priorityFee (gasTipCap) gas to the state +* [2567](https://github.com/zeta-chain/node/pull/2567) - add sign latency metric to zetaclient (zetaclient_sign_latency) ### Refactor diff --git a/contrib/localnet/docker-compose-monitoring.yml b/contrib/localnet/docker-compose-monitoring.yml index 9f11bd5575..89ddd0d7c3 100644 --- a/contrib/localnet/docker-compose-monitoring.yml +++ b/contrib/localnet/docker-compose-monitoring.yml @@ -18,8 +18,6 @@ services: hostname: prometheus volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml - ports: - - "9090:9090" networks: mynetwork: ipv4_address: 172.20.0.31 diff --git a/zetaclient/metrics/metrics.go b/zetaclient/metrics/metrics.go index 1da10d10e8..50d88b398b 100644 --- a/zetaclient/metrics/metrics.go +++ b/zetaclient/metrics/metrics.go @@ -104,6 +104,14 @@ var ( Name: "percentage_of_rate_reached", Help: "Percentage of the rate limiter rate reached", }) + + // SignLatency is a histogram of of the TSS keysign latency + SignLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: ZetaClientNamespace, + Name: "sign_latency", + Help: "Histogram of the TSS keysign latency", + Buckets: []float64{1, 7, 15, 30, 60, 120, 240}, + }, []string{"result"}) ) // NewMetrics creates a new Metrics instance diff --git a/zetaclient/tss/concurrent_keysigns_tracker.go b/zetaclient/tss/concurrent_keysigns_tracker.go index 42b6cb8413..d232dfa831 100644 --- a/zetaclient/tss/concurrent_keysigns_tracker.go +++ b/zetaclient/tss/concurrent_keysigns_tracker.go @@ -2,7 +2,9 @@ package tss import ( "sync" + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/rs/zerolog" "github.com/zeta-chain/zetacore/zetaclient/metrics" @@ -25,26 +27,37 @@ func NewKeysignsTracker(logger zerolog.Logger) *ConcurrentKeysignsTracker { } // StartMsgSign is incrementing the number of active signing ceremonies as well as updating the prometheus metric -func (k *ConcurrentKeysignsTracker) StartMsgSign() { +// +// Call the returned function to signify the signing is complete +func (k *ConcurrentKeysignsTracker) StartMsgSign() func(bool) { k.mu.Lock() defer k.mu.Unlock() k.numActiveMsgSigns++ metrics.NumActiveMsgSigns.Inc() k.Logger.Debug().Msgf("Start TSS message sign, numActiveMsgSigns: %d", k.numActiveMsgSigns) -} -// EndMsgSign is decrementing the number of active signing ceremonies as well as updating the prometheus metric -func (k *ConcurrentKeysignsTracker) EndMsgSign() { - k.mu.Lock() - defer k.mu.Unlock() - if k.numActiveMsgSigns > 0 { - k.numActiveMsgSigns-- - metrics.NumActiveMsgSigns.Dec() + startTime := time.Now() + + return func(hasError bool) { + k.mu.Lock() + defer k.mu.Unlock() + if k.numActiveMsgSigns > 0 { + k.numActiveMsgSigns-- + metrics.NumActiveMsgSigns.Dec() + } + k.Logger.Debug().Msgf("End TSS message sign, numActiveMsgSigns: %d", k.numActiveMsgSigns) + + result := "success" + if hasError { + result = "error" + } + metrics.SignLatency.With(prometheus.Labels{"result": result}).Observe(time.Since(startTime).Seconds()) } - k.Logger.Debug().Msgf("End TSS message sign, numActiveMsgSigns: %d", k.numActiveMsgSigns) } // GetNumActiveMessageSigns gets the current number of active signing ceremonies func (k *ConcurrentKeysignsTracker) GetNumActiveMessageSigns() int64 { + k.mu.Lock() + defer k.mu.Unlock() return k.numActiveMsgSigns } diff --git a/zetaclient/tss/concurrent_keysigns_tracker_test.go b/zetaclient/tss/concurrent_keysigns_tracker_test.go index 89d96b7fd8..6295095747 100644 --- a/zetaclient/tss/concurrent_keysigns_tracker_test.go +++ b/zetaclient/tss/concurrent_keysigns_tracker_test.go @@ -18,10 +18,9 @@ func TestKeySignManager_StartMsgSign(t *testing.T) { func TestKeySignManager_EndMsgSign(t *testing.T) { ksman := NewKeysignsTracker(zerolog.Logger{}) - ksman.StartMsgSign() - ksman.StartMsgSign() - ksman.EndMsgSign() - ksman.EndMsgSign() - ksman.EndMsgSign() + end1 := ksman.StartMsgSign() + end2 := ksman.StartMsgSign() + end1(true) + end2(false) require.Equal(t, int64(0), ksman.GetNumActiveMessageSigns()) } diff --git a/zetaclient/tss/tss_signer.go b/zetaclient/tss/tss_signer.go index 3219752173..22026da5b4 100644 --- a/zetaclient/tss/tss_signer.go +++ b/zetaclient/tss/tss_signer.go @@ -246,9 +246,9 @@ func (tss *TSS) Sign( nil, "0.14.0", ) - tss.KeysignsTracker.StartMsgSign() + end := tss.KeysignsTracker.StartMsgSign() ksRes, err := tss.Server.KeySign(keysignReq) - tss.KeysignsTracker.EndMsgSign() + end(err != nil || ksRes.Status == thorcommon.Fail) if err != nil { log.Warn().Msg("keysign fail") } @@ -328,9 +328,9 @@ func (tss *TSS) SignBatch( // #nosec G115 always in range keysignReq := keysign.NewRequest(tssPubkey, digestBase64, int64(height), nil, "0.14.0") - tss.KeysignsTracker.StartMsgSign() + end := tss.KeysignsTracker.StartMsgSign() ksRes, err := tss.Server.KeySign(keysignReq) - tss.KeysignsTracker.EndMsgSign() + end(err != nil || ksRes.Status == thorcommon.Fail) if err != nil { log.Warn().Err(err).Msg("keysign fail") }