Skip to content

Commit

Permalink
Add more duration info to SG request stats log (#5573)
Browse files Browse the repository at this point in the history
* include get_all, merge and total latency to store gateway series request log

Signed-off-by: Ben Ye <[email protected]>

* update thanos and add metrics to cortex

Signed-off-by: Ben Ye <[email protected]>

* update changelog

Signed-off-by: Ben Ye <[email protected]>

* update again

Signed-off-by: Ben Ye <[email protected]>

---------

Signed-off-by: Ben Ye <[email protected]>
  • Loading branch information
yeya24 authored Sep 21, 2023
1 parent 4e162a0 commit b7a3a5d
Show file tree
Hide file tree
Showing 51 changed files with 594 additions and 1,818 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
* [ENHANCEMENT] Querier: Retry store gateway client connection closing gRPC error. #5558
* [ENHANCEMENT] QueryFrontend: Add generic retry for all APIs. #5561.
* [ENHANCEMENT] QueryFrontend: Add metric for number of series requests. #5373
* [ENHANCEMENT] Store Gateway: Add histogram metrics for total time spent fetching series and chunks per request. #5573
* [BUGFIX] Ruler: Validate if rule group can be safely converted back to rule group yaml from protobuf message #5265
* [BUGFIX] Querier: Convert gRPC `ResourceExhausted` status code from store gateway to 422 limit error. #5286
* [BUGFIX] Alertmanager: Route web-ui requests to the alertmanager distributor when sharding is enabled. #5293
Expand Down
8 changes: 3 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ require (
github.com/sony/gobreaker v0.5.0
github.com/spf13/afero v1.9.5
github.com/stretchr/testify v1.8.4
github.com/thanos-io/objstore v0.0.0-20230816175749-20395bffdf26
github.com/thanos-io/objstore v0.0.0-20230921130928-63a603e651ed
github.com/thanos-io/promql-engine v0.0.0-20230821193351-e1ae4275b96e
github.com/thanos-io/thanos v0.32.3-0.20230911095949-f6a39507b6bd
github.com/thanos-io/thanos v0.32.4-0.20230921182036-6257767ec9d0
github.com/uber/jaeger-client-go v2.30.0+incompatible
github.com/weaveworks/common v0.0.0-20221201103051-7c2720a9024d
go.etcd.io/etcd/api/v3 v3.5.9
Expand Down Expand Up @@ -117,7 +117,6 @@ require (
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/dennwc/varint v1.0.0 // indirect
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 // indirect
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
github.com/docker/go-units v0.5.0 // indirect
github.com/edsrzf/mmap-go v1.1.0 // indirect
Expand Down Expand Up @@ -190,7 +189,6 @@ require (
github.com/rs/cors v1.9.0 // indirect
github.com/rs/xid v1.5.0 // indirect
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect
github.com/seiflotfy/cuckoofilter v0.0.0-20220411075957-e3b120b3f5fb // indirect
github.com/sercand/kuberesolver v2.4.0+incompatible // indirect
github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c // indirect
github.com/shurcooL/vfsgen v0.0.0-20200824052919-0d455de96546 // indirect
Expand Down Expand Up @@ -221,7 +219,7 @@ require (
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 // indirect
golang.org/x/mod v0.12.0 // indirect
golang.org/x/oauth2 v0.11.0 // indirect
golang.org/x/sys v0.11.0 // indirect
golang.org/x/sys v0.12.0 // indirect
golang.org/x/text v0.12.0 // indirect
golang.org/x/tools v0.11.0 // indirect
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
Expand Down
16 changes: 6 additions & 10 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -549,8 +549,6 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dennwc/varint v1.0.0 h1:kGNFFSSw8ToIy3obO/kKr8U9GZYUAxQEVuix4zfDWzE=
github.com/dennwc/varint v1.0.0/go.mod h1:hnItb35rvZvJrbTALZtY/iQfDs48JKRG1RPpgziApxA=
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 h1:BS21ZUJ/B5X2UVUbczfmdWH7GapPWAhxcMsDnjJTU1E=
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/dhui/dktest v0.3.16 h1:i6gq2YQEtcrjKbeJpBkWjE8MmLZPYllcjOFbTZuPDnw=
Expand Down Expand Up @@ -1158,8 +1156,6 @@ github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUt
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
github.com/segmentio/fasthash v1.0.3 h1:EI9+KE1EwvMLBWwjpRDc+fEM+prwxDYbslddQGtrmhM=
github.com/segmentio/fasthash v1.0.3/go.mod h1:waKX8l2N8yckOgmSsXJi7x1ZfdKZ4x7KRMzBtS3oedY=
github.com/seiflotfy/cuckoofilter v0.0.0-20220411075957-e3b120b3f5fb h1:XfLJSPIOUX+osiMraVgIrMR27uMXnRJWGm1+GL8/63U=
github.com/seiflotfy/cuckoofilter v0.0.0-20220411075957-e3b120b3f5fb/go.mod h1:bR6DqgcAl1zTcOX8/pE2Qkj9XO00eCNqmKb7lXP8EAg=
github.com/sercand/kuberesolver/v4 v4.0.0 h1:frL7laPDG/lFm5n98ODmWnn+cvPpzlkf3LhzuPhcHP4=
github.com/sercand/kuberesolver/v4 v4.0.0/go.mod h1:F4RGyuRmMAjeXHKL+w4P7AwUnPceEAPAhxUgXZjKgvM=
github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c h1:aqg5Vm5dwtvL+YgDpBcK1ITf3o96N/K7/wsRXQnUTEs=
Expand Down Expand Up @@ -1212,12 +1208,12 @@ github.com/subosito/gotenv v1.4.1/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNG
github.com/tencentyun/cos-go-sdk-v5 v0.7.40 h1:W6vDGKCHe4wBACI1d2UgE6+50sJFhRWU4O8IB2ozzxM=
github.com/thanos-community/galaxycache v0.0.0-20211122094458-3a32041a1f1e h1:f1Zsv7OAU9iQhZwigp50Yl38W10g/vd5NC8Rdk1Jzng=
github.com/thanos-community/galaxycache v0.0.0-20211122094458-3a32041a1f1e/go.mod h1:jXcofnrSln/cLI6/dhlBxPQZEEQHVPCcFaH75M+nSzM=
github.com/thanos-io/objstore v0.0.0-20230816175749-20395bffdf26 h1:q1lin/af0lw+I3sS79ccHs2CLjFOPc190J9saeQ5qQ4=
github.com/thanos-io/objstore v0.0.0-20230816175749-20395bffdf26/go.mod h1:oJ82xgcBDzGJrEgUsjlTj6n01+ZWUMMUR8BlZzX5xDE=
github.com/thanos-io/objstore v0.0.0-20230921130928-63a603e651ed h1:iWQdY3S6DpWjelVvKKSKgS7LeLkhK4VaEnQfphB9ZXA=
github.com/thanos-io/objstore v0.0.0-20230921130928-63a603e651ed/go.mod h1:oJ82xgcBDzGJrEgUsjlTj6n01+ZWUMMUR8BlZzX5xDE=
github.com/thanos-io/promql-engine v0.0.0-20230821193351-e1ae4275b96e h1:kwsFCU8eSkZehbrAN3nXPw5RdMHi/Bok/y8l2C4M+gk=
github.com/thanos-io/promql-engine v0.0.0-20230821193351-e1ae4275b96e/go.mod h1:+T/ZYNCGybT6eTsGGvVtGb63nT1cvUmH6MjqRrcQoKw=
github.com/thanos-io/thanos v0.32.3-0.20230911095949-f6a39507b6bd h1:JAXqwb/nzY7WzijekZrhrL63m988VLyoFUEaKLU15iA=
github.com/thanos-io/thanos v0.32.3-0.20230911095949-f6a39507b6bd/go.mod h1:J81dp4qaOX+GfPmRoYqu/aZXfEBri7+i3TzY2xamthg=
github.com/thanos-io/thanos v0.32.4-0.20230921182036-6257767ec9d0 h1:T9Vot+BQao6M6j8F0JQbseAqtniOw1Csz+QHRRRwF48=
github.com/thanos-io/thanos v0.32.4-0.20230921182036-6257767ec9d0/go.mod h1:Px5Boq60s+2WwR+V4v4oxgmxfw9WHrwMwjRou6pkUNw=
github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab h1:7ZR3hmisBWw77ZpO1/o86g+JV3VKlk3d48jopJxzTjU=
github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab/go.mod h1:eheTFp954zcWZXCU8d0AT76ftsQOTo4DTqkN/h3k1MY=
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
Expand Down Expand Up @@ -1607,8 +1603,8 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM=
golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
Expand Down
12 changes: 8 additions & 4 deletions pkg/querier/blocks_store_queryable.go
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,7 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(
return errors.Wrapf(err, "failed to create series request")
}

begin := time.Now()
stream, err := c.Series(gCtx, req)
if err != nil {
if isRetryableError(err) {
Expand Down Expand Up @@ -725,10 +726,10 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(
"requested blocks", strings.Join(convertULIDsToString(blockIDs), " "),
"queried blocks", strings.Join(convertULIDsToString(myQueriedBlocks), " "))

// It is also interesting to look at data downloaded at store gateway even if
// no series got matched, but to reduce verbosity we are more interested in those
// matched case. With vertical sharding enabled it is easy to log too much.
if numSeries > 0 {
// Use number of blocks queried to check whether we should log the query
// or not. It might be logging too much but good to understand per request
// performance.
if seriesQueryStats.BlocksQueried > 0 {
level.Info(spanLog).Log("msg", "store gateway series request stats",
"instance", c.RemoteAddress(),
"queryable_chunk_bytes_fetched", chunkBytes,
Expand All @@ -753,6 +754,9 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(
"chunks_fetch_count", seriesQueryStats.ChunksFetchCount,
"chunks_fetched_size_sum", seriesQueryStats.ChunksFetchedSizeSum,
"data_downloaded_size_sum", seriesQueryStats.DataDownloadedSizeSum,
"get_all_duration", seriesQueryStats.GetAllDuration,
"merge_duration", seriesQueryStats.MergeDuration,
"response_time", time.Since(begin),
)
}

Expand Down
20 changes: 17 additions & 3 deletions pkg/storegateway/bucket_store_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,11 @@ type BucketStoreMetrics struct {
cachedPostingsOriginalSizeBytes *prometheus.Desc
cachedPostingsCompressedSizeBytes *prometheus.Desc

seriesFetchDuration *prometheus.Desc
postingsFetchDuration *prometheus.Desc
chunkFetchDuration *prometheus.Desc
seriesFetchDuration *prometheus.Desc
seriesFetchDurationSum *prometheus.Desc
postingsFetchDuration *prometheus.Desc
chunkFetchDuration *prometheus.Desc
chunkFetchDurationSum *prometheus.Desc

lazyExpandedPostingsCount *prometheus.Desc
lazyExpandedPostingSizeBytes *prometheus.Desc
Expand Down Expand Up @@ -160,6 +162,10 @@ func NewBucketStoreMetrics() *BucketStoreMetrics {
"cortex_bucket_store_series_fetch_duration_seconds",
"Time it takes to fetch series to respond a request sent to store-gateway. It includes both the time to fetch it from cache and from storage in case of cache misses.",
nil, nil),
seriesFetchDurationSum: prometheus.NewDesc(
"cortex_bucket_store_series_fetch_duration_sum_seconds",
"The time it takes to fetch postings to respond to a request sent to a store gateway. It includes both the time to fetch it from the cache and from storage in case of cache misses.",
nil, nil),
postingsFetchDuration: prometheus.NewDesc(
"cortex_bucket_store_postings_fetch_duration_seconds",
"Time it takes to fetch postings to respond a request sent to store-gateway. It includes both the time to fetch it from cache and from storage in case of cache misses.",
Expand All @@ -168,6 +174,10 @@ func NewBucketStoreMetrics() *BucketStoreMetrics {
"cortex_bucket_store_chunks_fetch_duration_seconds",
"The total time spent fetching chunks within a single request a store gateway.",
nil, nil),
chunkFetchDurationSum: prometheus.NewDesc(
"cortex_bucket_store_chunks_fetch_duration_sum_seconds",
"The total absolute time spent fetching chunks within a single request for one block.",
nil, nil),

indexHeaderLazyLoadCount: prometheus.NewDesc(
"cortex_bucket_store_indexheader_lazy_load_total",
Expand Down Expand Up @@ -241,8 +251,10 @@ func (m *BucketStoreMetrics) Describe(out chan<- *prometheus.Desc) {
out <- m.cachedPostingsCompressedSizeBytes

out <- m.seriesFetchDuration
out <- m.seriesFetchDurationSum
out <- m.postingsFetchDuration
out <- m.chunkFetchDuration
out <- m.chunkFetchDurationSum

out <- m.indexHeaderLazyLoadCount
out <- m.indexHeaderLazyLoadFailedCount
Expand Down Expand Up @@ -288,8 +300,10 @@ func (m *BucketStoreMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCountersWithLabels(out, m.cachedPostingsCompressedSizeBytes, "thanos_bucket_store_cached_postings_compressed_size_bytes_total")

data.SendSumOfHistograms(out, m.seriesFetchDuration, "thanos_bucket_store_series_fetch_duration_seconds")
data.SendSumOfHistograms(out, m.seriesFetchDurationSum, "thanos_bucket_store_series_fetch_duration_sum_seconds")
data.SendSumOfHistograms(out, m.postingsFetchDuration, "thanos_bucket_store_postings_fetch_duration_seconds")
data.SendSumOfHistograms(out, m.chunkFetchDuration, "thanos_bucket_store_chunks_fetch_duration_seconds")
data.SendSumOfHistograms(out, m.chunkFetchDurationSum, "thanos_bucket_store_chunks_fetch_duration_sum_seconds")

data.SendSumOfCounters(out, m.indexHeaderLazyLoadCount, "thanos_bucket_store_indexheader_lazy_load_total")
data.SendSumOfCounters(out, m.indexHeaderLazyLoadFailedCount, "thanos_bucket_store_indexheader_lazy_load_failed_total")
Expand Down
Loading

0 comments on commit b7a3a5d

Please sign in to comment.