From 837c998a956a7e1fc1c66231915bcae5450a2a00 Mon Sep 17 00:00:00 2001 From: Geoffrey Gilmore Date: Thu, 9 Dec 2021 19:54:55 -0800 Subject: [PATCH] start wrapping requests withRedFmetrics --- cmd/zoekt-sourcegraph-indexserver/main.go | 9 +- cmd/zoekt-sourcegraph-indexserver/metrics.go | 149 +++++++++++++++++++ cmd/zoekt-sourcegraph-indexserver/sg.go | 5 +- 3 files changed, 155 insertions(+), 8 deletions(-) create mode 100644 cmd/zoekt-sourcegraph-indexserver/metrics.go diff --git a/cmd/zoekt-sourcegraph-indexserver/main.go b/cmd/zoekt-sourcegraph-indexserver/main.go index d9c783bea..1c3dfe76f 100644 --- a/cmd/zoekt-sourcegraph-indexserver/main.go +++ b/cmd/zoekt-sourcegraph-indexserver/main.go @@ -48,11 +48,10 @@ var ( Buckets: prometheus.ExponentialBuckets(1, 10, 6), // 1s -> 27min }) - metricResolveRevisionDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ - Name: "resolve_revision_seconds", - Help: "A histogram of latencies for resolving a repository revision.", - Buckets: prometheus.ExponentialBuckets(.25, 2, 4), // 250ms -> 2s - }, []string{"success"}) // success=true|false + metricResolveRevisionObserver = NewRedfMetrics("resolve_revision", + WithLabels("success"), // success=true|false + WithDurationBuckets(prometheus.ExponentialBuckets(.25, 2, 4)), // 250ms -> 2s + ) metricGetIndexOptionsError = promauto.NewCounter(prometheus.CounterOpts{ Name: "get_index_options_error_total", diff --git a/cmd/zoekt-sourcegraph-indexserver/metrics.go b/cmd/zoekt-sourcegraph-indexserver/metrics.go new file mode 100644 index 000000000..07a6db2ea --- /dev/null +++ b/cmd/zoekt-sourcegraph-indexserver/metrics.go @@ -0,0 +1,149 @@ +package main + +import ( + "fmt" + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +// Example Usage: +// +// observer := NewRedfMetrics("operation_name", WithLabels("factorA", "factorB")) +// +// start := time.Now() +// err := doOperation() +// +// observer.Observe(time.Since(start), err, "valueA", "valueB") +// +// m.Observe(duration 
duration, err error, "label0", "label1"...)
+
+// RedFMetrics contains four common metrics for an operation.
+// It's based on the RED method + some additional advice from
+// Google SRE's "Monitoring Distributed Systems".
+//
+// Successful and failed runs are tracked separately: Observe records a run in
+// Count and Duration when it succeeded, and in ErrorCount and ErrorDuration
+// when it failed.
+//
+// See:
+// - https://www.weave.works/blog/the-red-method-key-metrics-for-microservices-architecture/
+// - https://sre.google/sre-book/monitoring-distributed-systems/
+type RedFMetrics struct {
+	Count    *prometheus.CounterVec   // How often did this operation run successfully?
+	Duration *prometheus.HistogramVec // How long did the successful runs take?
+
+	ErrorCount    *prometheus.CounterVec   // How often did this operation fail?
+	ErrorDuration *prometheus.HistogramVec // How long did the failures take?
+}
+
+// Observe records a single run of the operation that took d.
+//
+// A non-nil err counts the run as a failure (ErrorCount, ErrorDuration); a nil
+// err counts it as a success (Count, Duration). Durations are recorded in
+// seconds. lvals are forwarded unchanged to WithLabelValues on the vectors
+// being updated.
+func (m *RedFMetrics) Observe(d time.Duration, err error, lvals ...string) {
+	if err != nil {
+		m.ErrorCount.WithLabelValues(lvals...).Inc()
+		m.ErrorDuration.WithLabelValues(lvals...).Observe(d.Seconds())
+		return
+	}
+
+	m.Count.WithLabelValues(lvals...).Inc()
+	// Successful runs go into Duration. Recording them in ErrorDuration would
+	// leave Duration permanently empty and mix successes into the failure
+	// latencies.
+	m.Duration.WithLabelValues(lvals...).Observe(d.Seconds())
+}
+
+// redfMetricOptions collects the settings that RedfMetricsOption functions can
+// change before NewRedfMetrics builds its collectors.
+type redfMetricOptions struct {
+	countHelp    string
+	durationHelp string
+
+	errorsCountHelp    string
+	errorsDurationHelp string
+
+	labels          []string
+	durationBuckets []float64
+}
+
+// RedfMetricsOption alters the default behavior of NewRedfMetrics.
+type RedfMetricsOption func(o *redfMetricOptions)
+
+// WithDurationHelp overrides the default help text for duration metrics.
+func WithDurationHelp(text string) RedfMetricsOption {
+	return func(o *redfMetricOptions) { o.durationHelp = text }
+}
+
+// WithCountHelp overrides the default help text for count metrics.
+func WithCountHelp(text string) RedfMetricsOption {
+	return func(o *redfMetricOptions) { o.countHelp = text }
+}
+
+// WithErrorsCountHelp overrides the default help text for error count metrics.
+func WithErrorsCountHelp(text string) RedfMetricsOption {
+	return func(o *redfMetricOptions) { o.errorsCountHelp = text }
+}
+
+// WithErrorsDurationHelp overrides the default help text for error duration metrics.
+func WithErrorsDurationHelp(text string) RedfMetricsOption {
+	return func(o *redfMetricOptions) { o.errorsDurationHelp = text }
+}
+
+// WithLabels overrides the default labels for all metrics.
+// The default is no labels.
+func WithLabels(labels ...string) RedfMetricsOption {
+	return func(o *redfMetricOptions) { o.labels = labels }
+}
+
+// WithDurationBuckets overrides the default histogram bucket values for duration metrics.
+// An empty buckets slice is ignored, so the default (prometheus.DefBuckets) stays in effect.
+func WithDurationBuckets(buckets []float64) RedfMetricsOption {
+	return func(o *redfMetricOptions) {
+		if len(buckets) != 0 {
+			o.durationBuckets = buckets
+		}
+	}
+}
+
+// NewRedfMetrics creates the four collectors of a RedFMetrics for the
+// operation called name and registers them with the default Prometheus
+// registry.
+//
+// The collectors are named <name>_total, <name>_duration, <name>_errors_total
+// and <name>_errors_duration. Help texts, label names and histogram buckets
+// have defaults that the overrides can change; the overrides are applied in
+// the order given.
+//
+// It panics if any of the collectors cannot be registered, for example when
+// NewRedfMetrics is called twice with the same name.
+func NewRedfMetrics(name string, overrides ...RedfMetricsOption) *RedFMetrics {
+	options := &redfMetricOptions{
+		countHelp:          fmt.Sprintf("Number of successful %s operations", name),
+		durationHelp:       fmt.Sprintf("Time in seconds spent performing %s operations", name),
+		errorsCountHelp:    fmt.Sprintf("Number of failed %s operations", name),
+		errorsDurationHelp: fmt.Sprintf("Time in seconds spent performing failed %s operations", name),
+
+		labels:          nil,
+		durationBuckets: prometheus.DefBuckets,
+	}
+
+	for _, override := range overrides {
+		override(options)
+	}
+
+	count := prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: fmt.Sprintf("%s_total", name),
+			Help: options.countHelp,
+		},
+		options.labels,
+	)
+
+	duration := prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: fmt.Sprintf("%s_duration", name),
+			// The duration histogram gets the duration help text, not the
+			// counter's.
+			Help: options.durationHelp,
+			// Without this the buckets chosen via WithDurationBuckets would
+			// be silently dropped.
+			Buckets: options.durationBuckets,
+		},
+		options.labels,
+	)
+
+	errorsCount := prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: fmt.Sprintf("%s_errors_total", name),
+			Help: options.errorsCountHelp,
+		},
+		options.labels,
+	)
+
+	errorsDuration := prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    fmt.Sprintf("%s_errors_duration", name),
+			Help:    options.errorsDurationHelp,
+			Buckets: options.durationBuckets,
+		},
+		options.labels,
+	)
+
+	// prometheus.New*Vec only constructs the collectors. They must be
+	// registered to be exported, which the promauto constructors used for the
+	// other metrics in this package do implicitly.
+	prometheus.MustRegister(count, duration, errorsCount, errorsDuration)
+
+	return &RedFMetrics{
+		Count:    count,
+		Duration: duration,
+
+		ErrorCount:    errorsCount,
+		ErrorDuration: errorsDuration,
+	}
+}
diff --git a/cmd/zoekt-sourcegraph-indexserver/sg.go b/cmd/zoekt-sourcegraph-indexserver/sg.go
index 7f2e1bf49..0d21225b3 100644
--- a/cmd/zoekt-sourcegraph-indexserver/sg.go
+++ b/cmd/zoekt-sourcegraph-indexserver/sg.go
@@ -113,7 +113,7 @@ func (s *sourcegraphClient) List(ctx context.Context, indexed []uint32) (*Source
 			first = false
 			s.configFingerprint.Store(lastFingerprint)
 
-			metricResolveRevisionDuration.WithLabelValues("false").Observe(time.Since(start).Seconds())
+			metricResolveRevisionObserver.Observe(time.Since(start), err, "false")
 			tr.LazyPrintf("failed fetching options batch: %v", err)
 			tr.SetError()
 			continue
@@ -125,10 +125,9 @@ func (s *sourcegraphClient) List(ctx context.Context, indexed []uint32) (*Source
 			s.configFingerprint.Store(fingerprint)
 		}
 
-		metricResolveRevisionDuration.WithLabelValues("true").Observe(time.Since(start).Seconds())
+		metricResolveRevisionObserver.Observe(time.Since(start), err, "true")
 		for _, opt := range opts {
 			if opt.Error != "" {
-				metricGetIndexOptionsError.Inc()
 				tr.LazyPrintf("failed fetching options for %v: %v", opt.Name, opt.Error)
 				tr.SetError()
 				continue