Update Thanos and expose the new series/chunks fetch duration sum bucket store metrics in Cortex

Signed-off-by: Ben Ye <[email protected]>
cortexproject · Sep 19, 2023 · 101da53 · 101da53
1 parent a7b4ceb
commit 101da53
Show file tree

Hide file tree

Showing 7 changed files with 154 additions and 48 deletions.
diff --git a/go.mod b/go.mod
@@ -53,7 +53,7 @@ require (
 	github.com/stretchr/testify v1.8.4
 	github.com/thanos-io/objstore v0.0.0-20230913122821-eb06103887ab
 	github.com/thanos-io/promql-engine v0.0.0-20230821193351-e1ae4275b96e
-	github.com/thanos-io/thanos v0.32.3-0.20230911095949-f6a39507b6bd
+	github.com/thanos-io/thanos v0.32.3-0.20230919181645-2bc12a582284
 	github.com/uber/jaeger-client-go v2.30.0+incompatible
 	github.com/weaveworks/common v0.0.0-20221201103051-7c2720a9024d
 	go.etcd.io/etcd/api/v3 v3.5.9
@@ -262,5 +262,3 @@ replace github.com/google/gnostic => github.com/googleapis/gnostic v0.6.9
 replace gopkg.in/alecthomas/kingpin.v2 => github.com/alecthomas/kingpin v1.3.8-0.20210301060133-17f40c25f497
 
 replace github.com/sercand/kuberesolver => github.com/sercand/kuberesolver/v4 v4.0.0
-
-replace github.com/thanos-io/thanos => github.com/yeya24/thanos v0.2.2-0.20230918182451-e0a5d841de76
diff --git a/go.sum b/go.sum
@@ -1212,6 +1212,8 @@ github.com/thanos-io/objstore v0.0.0-20230913122821-eb06103887ab h1:IfcvGL/erj7I
 github.com/thanos-io/objstore v0.0.0-20230913122821-eb06103887ab/go.mod h1:oJ82xgcBDzGJrEgUsjlTj6n01+ZWUMMUR8BlZzX5xDE=
 github.com/thanos-io/promql-engine v0.0.0-20230821193351-e1ae4275b96e h1:kwsFCU8eSkZehbrAN3nXPw5RdMHi/Bok/y8l2C4M+gk=
 github.com/thanos-io/promql-engine v0.0.0-20230821193351-e1ae4275b96e/go.mod h1:+T/ZYNCGybT6eTsGGvVtGb63nT1cvUmH6MjqRrcQoKw=
+github.com/thanos-io/thanos v0.32.3-0.20230919181645-2bc12a582284 h1:x3Fs9sB5PvmNdS0DsTARyK5Dfc9IwOSReu7K7IN6mPA=
+github.com/thanos-io/thanos v0.32.3-0.20230919181645-2bc12a582284/go.mod h1:MD/vJgwt6va80KWA3qELrNODEo1evehiKvulDVrNGKY=
 github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab h1:7ZR3hmisBWw77ZpO1/o86g+JV3VKlk3d48jopJxzTjU=
 github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab/go.mod h1:eheTFp954zcWZXCU8d0AT76ftsQOTo4DTqkN/h3k1MY=
 github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
@@ -1234,8 +1236,6 @@ github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3k
 github.com/xdg-go/stringprep v1.0.2/go.mod h1:8F9zXuvzgwmyT5DUm4GUfZGDdT3W+LCvS6+da4O5kxM=
 github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgklLGvcBnW8=
 github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM=
-github.com/yeya24/thanos v0.2.2-0.20230918182451-e0a5d841de76 h1:czelPDvh0+R9z+TnjjIIhgoI6kKpSo30VvLcEAHtGtU=
-github.com/yeya24/thanos v0.2.2-0.20230918182451-e0a5d841de76/go.mod h1:MD/vJgwt6va80KWA3qELrNODEo1evehiKvulDVrNGKY=
 github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA=
 github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=

diff --git a/pkg/querier/blocks_store_queryable.go b/pkg/querier/blocks_store_queryable.go
@@ -756,7 +756,7 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(
 					"data_downloaded_size_sum", seriesQueryStats.DataDownloadedSizeSum,
 					"get_all_duration", seriesQueryStats.GetAllDuration,
 					"merge_duration", seriesQueryStats.MergeDuration,
-					"response_latency", time.Since(begin),
+					"response_time", time.Since(begin),
 				)
 			}
 

diff --git a/pkg/storegateway/bucket_store_metrics.go b/pkg/storegateway/bucket_store_metrics.go
@@ -38,9 +38,11 @@ type BucketStoreMetrics struct {
 	cachedPostingsOriginalSizeBytes      *prometheus.Desc
 	cachedPostingsCompressedSizeBytes    *prometheus.Desc
 
-	seriesFetchDuration   *prometheus.Desc
-	postingsFetchDuration *prometheus.Desc
-	chunkFetchDuration    *prometheus.Desc
+	seriesFetchDuration    *prometheus.Desc
+	seriesFetchDurationSum *prometheus.Desc
+	postingsFetchDuration  *prometheus.Desc
+	chunkFetchDuration     *prometheus.Desc
+	chunkFetchDurationSum  *prometheus.Desc
 
 	lazyExpandedPostingsCount                     *prometheus.Desc
 	lazyExpandedPostingSizeBytes                  *prometheus.Desc
@@ -160,6 +162,10 @@ func NewBucketStoreMetrics() *BucketStoreMetrics {
 			"cortex_bucket_store_series_fetch_duration_seconds",
 			"Time it takes to fetch series to respond a request sent to store-gateway. It includes both the time to fetch it from cache and from storage in case of cache misses.",
 			nil, nil),
+		seriesFetchDurationSum: prometheus.NewDesc(
+			"cortex_bucket_store_series_fetch_duration_sum_seconds",
+			"The time it takes to fetch postings to respond to a request sent to a store gateway. It includes both the time to fetch it from the cache and from storage in case of cache misses.",
+			nil, nil),
 		postingsFetchDuration: prometheus.NewDesc(
 			"cortex_bucket_store_postings_fetch_duration_seconds",
 			"Time it takes to fetch postings to respond a request sent to store-gateway. It includes both the time to fetch it from cache and from storage in case of cache misses.",
@@ -168,6 +174,10 @@ func NewBucketStoreMetrics() *BucketStoreMetrics {
 			"cortex_bucket_store_chunks_fetch_duration_seconds",
 			"The total time spent fetching chunks within a single request a store gateway.",
 			nil, nil),
+		chunkFetchDurationSum: prometheus.NewDesc(
+			"cortex_bucket_store_chunks_fetch_duration_sum_seconds",
+			"The total absolute time spent fetching chunks within a single request for one block.",
+			nil, nil),
 
 		indexHeaderLazyLoadCount: prometheus.NewDesc(
 			"cortex_bucket_store_indexheader_lazy_load_total",
@@ -241,8 +251,10 @@ func (m *BucketStoreMetrics) Describe(out chan<- *prometheus.Desc) {
 	out <- m.cachedPostingsCompressedSizeBytes
 
 	out <- m.seriesFetchDuration
+	out <- m.seriesFetchDurationSum
 	out <- m.postingsFetchDuration
 	out <- m.chunkFetchDuration
+	out <- m.chunkFetchDurationSum
 
 	out <- m.indexHeaderLazyLoadCount
 	out <- m.indexHeaderLazyLoadFailedCount
@@ -288,8 +300,10 @@ func (m *BucketStoreMetrics) Collect(out chan<- prometheus.Metric) {
 	data.SendSumOfCountersWithLabels(out, m.cachedPostingsCompressedSizeBytes, "thanos_bucket_store_cached_postings_compressed_size_bytes_total")
 
 	data.SendSumOfHistograms(out, m.seriesFetchDuration, "thanos_bucket_store_series_fetch_duration_seconds")
+	data.SendSumOfHistograms(out, m.seriesFetchDurationSum, "thanos_bucket_store_series_fetch_duration_sum_seconds")
 	data.SendSumOfHistograms(out, m.postingsFetchDuration, "thanos_bucket_store_postings_fetch_duration_seconds")
 	data.SendSumOfHistograms(out, m.chunkFetchDuration, "thanos_bucket_store_chunks_fetch_duration_seconds")
+	data.SendSumOfHistograms(out, m.chunkFetchDurationSum, "thanos_bucket_store_chunks_fetch_duration_sum_seconds")
 
 	data.SendSumOfCounters(out, m.indexHeaderLazyLoadCount, "thanos_bucket_store_indexheader_lazy_load_total")
 	data.SendSumOfCounters(out, m.indexHeaderLazyLoadFailedCount, "thanos_bucket_store_indexheader_lazy_load_failed_total")

diff --git a/pkg/storegateway/bucket_store_metrics_test.go b/pkg/storegateway/bucket_store_metrics_test.go
@@ -290,6 +290,25 @@ func TestBucketStoreMetrics(t *testing.T) {
 			cortex_bucket_store_series_data_touched_sum{data_type="touched-c"} 180152
 			cortex_bucket_store_series_data_touched_count{data_type="touched-c"} 3
 
+        	# HELP cortex_bucket_store_series_fetch_duration_sum_seconds The time it takes to fetch postings to respond to a request sent to a store gateway. It includes both the time to fetch it from the cache and from storage in case of cache misses.
+        	# TYPE cortex_bucket_store_series_fetch_duration_sum_seconds histogram
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="0.001"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="0.01"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="0.1"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="0.3"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="0.6"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="1"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="3"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="6"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="9"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="20"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="30"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="60"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="90"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="120"} 0
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_bucket{le="+Inf"} 3
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_sum 1.306102e+06
+        	cortex_bucket_store_series_fetch_duration_sum_seconds_count 3
 			# HELP cortex_bucket_store_series_get_all_duration_seconds Time it takes until all per-block prepares and preloads for a query are finished.
 			# TYPE cortex_bucket_store_series_get_all_duration_seconds histogram
 			cortex_bucket_store_series_get_all_duration_seconds_bucket{le="0.001"} 0
@@ -395,7 +414,7 @@ func TestBucketStoreMetrics(t *testing.T) {
 
         	# HELP cortex_bucket_store_chunk_refetches_total Total number of cases where configured estimated chunk bytes was not enough was to fetch chunks from object store, resulting in refetch.
         	# TYPE cortex_bucket_store_chunk_refetches_total counter
-        	cortex_bucket_store_chunk_refetches_total 0
+        	cortex_bucket_store_chunk_refetches_total 765646
 
 			# HELP cortex_bucket_store_cached_postings_compressed_size_bytes_total Compressed size of postings stored into cache.
 			# TYPE cortex_bucket_store_cached_postings_compressed_size_bytes_total counter
@@ -439,6 +458,25 @@ func TestBucketStoreMetrics(t *testing.T) {
         	cortex_bucket_store_chunks_fetch_duration_seconds_bucket{le="+Inf"} 3
         	cortex_bucket_store_chunks_fetch_duration_seconds_sum 1.328621e+06
         	cortex_bucket_store_chunks_fetch_duration_seconds_count 3
+        	# HELP cortex_bucket_store_chunks_fetch_duration_sum_seconds The total absolute time spent fetching chunks within a single request for one block.
+        	# TYPE cortex_bucket_store_chunks_fetch_duration_sum_seconds histogram
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="0.001"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="0.01"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="0.1"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="0.3"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="0.6"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="1"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="3"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="6"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="9"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="20"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="30"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="60"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="90"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="120"} 0
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_bucket{le="+Inf"} 3
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_sum 1.328621e+06
+        	cortex_bucket_store_chunks_fetch_duration_sum_seconds_count 3
         	# HELP cortex_bucket_store_empty_postings_total Total number of empty postings when fetching block series.
             # TYPE cortex_bucket_store_empty_postings_total counter
         	cortex_bucket_store_empty_postings_total 112595
@@ -604,6 +642,7 @@ func populateMockedBucketStoreMetrics(base float64) *prometheus.Registry {
 	m.chunkSizeBytes.Observe(11 * base)
 
 	m.seriesRefetches.Add(33 * base)
+	m.chunkRefetches.Add(34 * base)
 
 	m.cachedPostingsCompressions.WithLabelValues("encode").Add(50 * base)
 	m.cachedPostingsCompressions.WithLabelValues("decode").Add(51 * base)
@@ -618,8 +657,10 @@ func populateMockedBucketStoreMetrics(base float64) *prometheus.Registry {
 	m.cachedPostingsCompressedSizeBytes.Add(57 * base)
 
 	m.seriesFetchDuration.Observe(58 * base)
+	m.seriesFetchDurationSum.Observe(58 * base)
 	m.postingsFetchDuration.Observe(59 * base)
 	m.chunkFetchDuration.Observe(59 * base)
+	m.chunkFetchDurationSum.Observe(59 * base)
 
 	m.indexHeaderLazyLoadCount.Add(60 * base)
 	m.indexHeaderLazyLoadFailedCount.Add(61 * base)
@@ -664,9 +705,11 @@ type mockedBucketStoreMetrics struct {
 	cachedPostingsOriginalSizeBytes      prometheus.Counter
 	cachedPostingsCompressedSizeBytes    prometheus.Counter
 
-	seriesFetchDuration   prometheus.Histogram
-	postingsFetchDuration prometheus.Histogram
-	chunkFetchDuration    prometheus.Histogram
+	seriesFetchDuration    prometheus.Histogram
+	seriesFetchDurationSum prometheus.Histogram
+	postingsFetchDuration  prometheus.Histogram
+	chunkFetchDuration     prometheus.Histogram
+	chunkFetchDurationSum  prometheus.Histogram
 
 	indexHeaderLazyLoadCount         prometheus.Counter
 	indexHeaderLazyLoadFailedCount   prometheus.Counter
@@ -801,6 +844,11 @@ func newMockedBucketStoreMetrics(reg prometheus.Registerer) *mockedBucketStoreMe
 		Help:    "Time it takes to fetch series from a bucket to respond a query. It also includes the time it takes to cache fetch and store operations.",
 		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
 	})
+	m.seriesFetchDurationSum = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
+		Name:    "thanos_bucket_store_series_fetch_duration_sum_seconds",
+		Help:    "The total time it takes to fetch series to respond to a request sent to a store gateway across all series batches. It includes both the time to fetch it from the cache and from storage in case of cache misses.",
+		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
+	})
 	m.postingsFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
 		Name:    "thanos_bucket_store_postings_fetch_duration_seconds",
 		Help:    "Time it takes to fetch postings from a bucket to respond a query. It also includes the time it takes to cache fetch and store operations.",
@@ -811,6 +859,11 @@ func newMockedBucketStoreMetrics(reg prometheus.Registerer) *mockedBucketStoreMe
 		Help:    "The total time spent fetching chunks within a single request a store gateway.",
 		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
 	})
+	m.chunkFetchDurationSum = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
+		Name:    "thanos_bucket_store_chunks_fetch_duration_sum_seconds",
+		Help:    "The total absolute time spent fetching chunks within a single request for one block.",
+		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
+	})
 
 	m.indexHeaderLazyLoadCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{
 		Name: "thanos_bucket_store_indexheader_lazy_load_total",