From fa132cd2a0d283881e35a26a3e3718c40b1d4425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 26 Mar 2024 17:41:11 +0100 Subject: [PATCH 1/5] Distributor: add bucket count validation to native histograms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise the ingester does it and returns an untyped error which we turn into 500x with little detail on the series as well. Signed-off-by: György Krajcsovits --- pkg/distributor/validate.go | 37 +++++++++++++ pkg/distributor/validate_test.go | 93 ++++++++++++++++++++++++++++++++ pkg/util/globalerror/errors.go | 1 + 3 files changed, 131 insertions(+) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index af2c6e3f82e..872ad3472ad 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -38,6 +38,7 @@ var ( reasonLabelValueTooLong = globalerror.SeriesLabelValueTooLong.LabelValue() reasonMaxNativeHistogramBuckets = globalerror.MaxNativeHistogramBuckets.LabelValue() reasonInvalidNativeHistogramSchema = globalerror.InvalidSchemaNativeHistogram.LabelValue() + reasonBucketCountMismatch = globalerror.BucketCountMismatch.LabelValue() reasonDuplicateLabelNames = globalerror.SeriesWithDuplicateLabelNames.LabelValue() reasonTooFarInFuture = globalerror.SampleTooFarInFuture.LabelValue() @@ -79,6 +80,7 @@ var ( maxNativeHistogramBucketsMsgFormat = globalerror.MaxNativeHistogramBuckets.Message("received a native histogram sample with too many buckets, timestamp: %d series: %s, buckets: %d, limit: %d") notReducibleNativeHistogramMsgFormat = globalerror.NotReducibleNativeHistogram.Message("received a native histogram sample with too many buckets and cannot reduce, timestamp: %d series: %s, buckets: %d, limit: %d") invalidSchemaNativeHistogramMsgFormat = globalerror.InvalidSchemaNativeHistogram.Message("received a native histogram sample with an invalid schema: %d") + bucketCountMismatchMsgFormat = globalerror.BucketCountMismatch.Message("native histogram bucket count mismatch, timestamp: %d, series: %s, expected %v, got %v") sampleTimestampTooNewMsgFormat = globalerror.SampleTooFarInFuture.MessageWithPerTenantLimitConfig( "received a sample whose timestamp is too far in the future, timestamp: %d series: '%.200s'", validation.CreationGracePeriodFlag, @@ -121,6 +123,7 @@ type sampleValidationMetrics struct { labelValueTooLong *prometheus.CounterVec maxNativeHistogramBuckets *prometheus.CounterVec invalidNativeHistogramSchema *prometheus.CounterVec + bucketCountMismatch *prometheus.CounterVec duplicateLabelNames *prometheus.CounterVec tooFarInFuture *prometheus.CounterVec } @@ -135,6 +138,7 @@ func (m *sampleValidationMetrics) deleteUserMetrics(userID string) { m.labelValueTooLong.DeletePartialMatch(filter) m.maxNativeHistogramBuckets.DeletePartialMatch(filter) m.invalidNativeHistogramSchema.DeletePartialMatch(filter) + m.bucketCountMismatch.DeletePartialMatch(filter) m.duplicateLabelNames.DeletePartialMatch(filter) m.tooFarInFuture.DeletePartialMatch(filter) } @@ -148,6 +152,7 @@ func (m *sampleValidationMetrics) deleteUserMetricsForGroup(userID, group string m.labelValueTooLong.DeleteLabelValues(userID, group) m.maxNativeHistogramBuckets.DeleteLabelValues(userID, group) m.invalidNativeHistogramSchema.DeleteLabelValues(userID, group) + m.bucketCountMismatch.DeleteLabelValues(userID, group) m.duplicateLabelNames.DeleteLabelValues(userID, group) m.tooFarInFuture.DeleteLabelValues(userID, group) } @@ -162,6 +167,7 @@ func newSampleValidationMetrics(r prometheus.Registerer) *sampleValidationMetric labelValueTooLong: validation.DiscardedSamplesCounter(r, reasonLabelValueTooLong), maxNativeHistogramBuckets: validation.DiscardedSamplesCounter(r, reasonMaxNativeHistogramBuckets), invalidNativeHistogramSchema: validation.DiscardedSamplesCounter(r, reasonInvalidNativeHistogramSchema), + bucketCountMismatch: validation.DiscardedSamplesCounter(r, reasonBucketCountMismatch), duplicateLabelNames: validation.DiscardedSamplesCounter(r, reasonDuplicateLabelNames), tooFarInFuture: validation.DiscardedSamplesCounter(r, reasonTooFarInFuture), } @@ -251,6 +257,37 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } } + // Check that bucket counts including zero bucket add up to the overall count. + if s.IsFloatHistogram() { + count := s.GetZeroCountFloat() + for _, c := range s.GetNegativeCounts() { + count += c + } + for _, c := range s.GetPositiveCounts() { + count += c + } + if count != s.GetCountFloat() { + m.bucketCountMismatch.WithLabelValues(userID, group).Inc() + return fmt.Errorf(bucketCountMismatchMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), s.GetCountFloat(), count) + } + } else { + count := s.GetZeroCountInt() + bucketCount := int64(0) + for _, c := range s.GetNegativeDeltas() { + bucketCount += c + count += uint64(bucketCount) + } + bucketCount = int64(0) + for _, c := range s.GetPositiveDeltas() { + bucketCount += c + count += uint64(bucketCount) + } + if count != s.GetCountInt() { + m.bucketCountMismatch.WithLabelValues(userID, group).Inc() + return fmt.Errorf(bucketCountMismatchMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), s.GetCountInt(), count) + } + } + return nil } diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index 0508d1b2724..5f29d3ee28b 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -583,6 +583,99 @@ func TestInvalidNativeHistogramSchema(t *testing.T) { `), "cortex_discarded_samples_total")) } +func TestInvalidBucketCountHistogram(t *testing.T) { + testCases := map[string]struct { + h *mimirpb.Histogram + expectedError error + }{ + "a valid zero counts causes no error": { + h: &mimirpb.Histogram{}, + expectedError: nil, + }, + "a valid integer histogram causes no error": { + h: &mimirpb.Histogram{ + Count: &mimirpb.Histogram_CountInt{CountInt: 5}, + Sum: 10, + Schema: 1, + ZeroThreshold: 0.001, + ZeroCount: &mimirpb.Histogram_ZeroCountInt{ZeroCountInt: 1}, + NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}}, + NegativeDeltas: []int64{1, 1}, + PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}}, + PositiveDeltas: []int64{1}, + ResetHint: mimirpb.Histogram_UNKNOWN, + Timestamp: 0, + }, + expectedError: nil, + }, + "a valid float histogram causes no error": { + h: &mimirpb.Histogram{ + Count: &mimirpb.Histogram_CountFloat{CountFloat: 5.5}, + Sum: 10, + Schema: 1, + ZeroThreshold: 0.001, + ZeroCount: &mimirpb.Histogram_ZeroCountFloat{ZeroCountFloat: 1.5}, + NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}}, + NegativeCounts: []float64{1.0, 2.0}, + PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}}, + PositiveCounts: []float64{1.0}, + ResetHint: mimirpb.Histogram_UNKNOWN, + Timestamp: 0, + }, + expectedError: nil, + }, + "an integer histogram with the wrong overall count": { + h: &mimirpb.Histogram{ + Count: &mimirpb.Histogram_CountInt{CountInt: 4}, + Sum: 10, + Schema: 1, + ZeroThreshold: 0.001, + ZeroCount: &mimirpb.Histogram_ZeroCountInt{ZeroCountInt: 1}, + NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}}, + NegativeDeltas: []int64{1, 1}, + PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}}, + PositiveDeltas: []int64{1}, + ResetHint: mimirpb.Histogram_UNKNOWN, + Timestamp: 0, + }, + expectedError: fmt.Errorf("native histogram bucket count mismatch, timestamp: 0, series: a{a=\"a\"}, expected 4, got 5 (err-mimir-native-histogram-bucket-count-mismatch)"), + }, + "a float histogram with the wrong overall count": { + h: &mimirpb.Histogram{ + Count: &mimirpb.Histogram_CountFloat{CountFloat: 4.5}, + Sum: 10, + Schema: 1, + ZeroThreshold: 0.001, + ZeroCount: &mimirpb.Histogram_ZeroCountFloat{ZeroCountFloat: 1.5}, + NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}}, + NegativeCounts: []float64{1.0, 2.0}, + PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}}, + PositiveCounts: []float64{1.0}, + ResetHint: mimirpb.Histogram_UNKNOWN, + Timestamp: 0, + }, + expectedError: fmt.Errorf("native histogram bucket count mismatch, timestamp: 0, series: a{a=\"a\"}, expected 4.5, got 5.5 (err-mimir-native-histogram-bucket-count-mismatch)"), + }, + } + + registry := prometheus.NewRegistry() + metrics := newSampleValidationMetrics(registry) + cfg := sampleValidationCfg{} + labels := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}} + for testName, testCase := range testCases { + t.Run(testName, func(t *testing.T) { + err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, testCase.h) + require.Equal(t, testCase.expectedError, err) + }) + } + + require.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(` + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="group-1",reason="native_histogram_bucket_count_mismatch",user="user-1"} 2 + `), "cortex_discarded_samples_total")) +} + func tooManyLabelsArgs(series []mimirpb.LabelAdapter, limit int) []any { metric := mimirpb.FromLabelAdaptersToMetric(series).String() ellipsis := "" diff --git a/pkg/util/globalerror/errors.go b/pkg/util/globalerror/errors.go index 2b26569324c..7c7e4c64a28 100644 --- a/pkg/util/globalerror/errors.go +++ b/pkg/util/globalerror/errors.go @@ -20,6 +20,7 @@ const ( MaxNativeHistogramBuckets ID = "max-native-histogram-buckets" NotReducibleNativeHistogram ID = "not-reducible-native-histogram" InvalidSchemaNativeHistogram ID = "invalid-native-histogram-schema" + BucketCountMismatch ID = "native-histogram-bucket-count-mismatch" SeriesInvalidLabel ID = "label-invalid" SeriesLabelNameTooLong ID = "label-name-too-long" SeriesLabelValueTooLong ID = "label-value-too-long" From 9d581fa58e376534f0fe1802e6bb32c628263a5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 26 Mar 2024 17:55:30 +0100 Subject: [PATCH 2/5] Add runbook entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- docs/sources/mimir/manage/mimir-runbooks/_index.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 294d3e98ee1..c613c1686aa 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1385,6 +1385,14 @@ This non-critical error occurs when Mimir receives a write request that contains The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. {{< /admonition >}} +### err-mimir-native-histogram-bucket-count-mismatch + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where the zero, positive and negative bucket counts do not add up to the overall count of the native histogram. + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}} + ### err-mimir-label-invalid This non-critical error occurs when Mimir receives a write request that contains a series with an invalid label name. From 31513842af0a031a7a14ed28fa08a7d3002dff98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 26 Mar 2024 18:03:46 +0100 Subject: [PATCH 3/5] Update changelog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7539640db68..1389e1be65e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -106,6 +106,7 @@ * [CHANGE] The configuration option `-querier.max-query-into-future` has been deprecated and will be removed in Mimir 2.14. #7496 * [CHANGE] Distributor: the metric `cortex_distributor_sample_delay_seconds` has been deprecated and will be removed in Mimir 2.14. #7516 * [CHANGE] Query-frontend: The deprecated YAML setting `frontend.cache_unaligned_requests` has been moved to `limits.cache_unaligned_requests`. #7519 +* [CHANGE] Distributor: validate that in native histograms the zero, negative and positive bucket counts add up to the overall count of the histogram. Such errors are now reported as 4xx and not 5xx and show up in the `cortex_discarded_samples_total` with the label `reason="native_histogram_bucket_count_mismatch"`. #7736 * [FEATURE] Introduce `-server.log-source-ips-full` option to log all IPs from `Forwarded`, `X-Real-IP`, `X-Forwarded-For` headers. #7250 * [FEATURE] Introduce `-tenant-federation.max-tenants` option to limit the max number of tenants allowed for requests when federation is enabled. #6959 * [FEATURE] Cardinality API: added a new `count_method` parameter which enables counting active label names. #7085 From 0399f20f035924c1bd68c3633ac04665cb84910c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 26 Mar 2024 18:28:35 +0100 Subject: [PATCH 4/5] Only check integer native histograms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 2 +- pkg/distributor/validate.go | 12 +----------- pkg/distributor/validate_test.go | 5 +++-- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1389e1be65e..6cbb11c8266 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -106,7 +106,7 @@ * [CHANGE] The configuration option `-querier.max-query-into-future` has been deprecated and will be removed in Mimir 2.14. #7496 * [CHANGE] Distributor: the metric `cortex_distributor_sample_delay_seconds` has been deprecated and will be removed in Mimir 2.14. #7516 * [CHANGE] Query-frontend: The deprecated YAML setting `frontend.cache_unaligned_requests` has been moved to `limits.cache_unaligned_requests`. #7519 -* [CHANGE] Distributor: validate that in native histograms the zero, negative and positive bucket counts add up to the overall count of the histogram. Such errors are now reported as 4xx and not 5xx and show up in the `cortex_discarded_samples_total` with the label `reason="native_histogram_bucket_count_mismatch"`. #7736 +* [CHANGE] Distributor: validate that in integer native histograms the zero, negative and positive bucket counts add up to the overall count of the histogram. Such errors are now reported as 4xx and not 5xx and show up in the `cortex_discarded_samples_total` with the label `reason="native_histogram_bucket_count_mismatch"`. #7736 * [FEATURE] Introduce `-server.log-source-ips-full` option to log all IPs from `Forwarded`, `X-Real-IP`, `X-Forwarded-For` headers. #7250 * [FEATURE] Introduce `-tenant-federation.max-tenants` option to limit the max number of tenants allowed for requests when federation is enabled. #6959 * [FEATURE] Cardinality API: added a new `count_method` parameter which enables counting active label names. #7085 diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 872ad3472ad..cc7a959e5cf 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -259,17 +259,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam // Check that bucket counts including zero bucket add up to the overall count. if s.IsFloatHistogram() { - count := s.GetZeroCountFloat() - for _, c := range s.GetNegativeCounts() { - count += c - } - for _, c := range s.GetPositiveCounts() { - count += c - } - if count != s.GetCountFloat() { - m.bucketCountMismatch.WithLabelValues(userID, group).Inc() - return fmt.Errorf(bucketCountMismatchMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), s.GetCountFloat(), count) - } + // Do nothing. Due to floating point precision issues, we don't check the bucket count. } else { count := s.GetZeroCountInt() bucketCount := int64(0) diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index 5f29d3ee28b..673b6c61011 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -654,7 +654,8 @@ func TestInvalidBucketCountHistogram(t *testing.T) { ResetHint: mimirpb.Histogram_UNKNOWN, Timestamp: 0, }, - expectedError: fmt.Errorf("native histogram bucket count mismatch, timestamp: 0, series: a{a=\"a\"}, expected 4.5, got 5.5 (err-mimir-native-histogram-bucket-count-mismatch)"), + // Due to floating point precision issues, this case is not an error at the moment. + expectedError: nil, }, } @@ -672,7 +673,7 @@ func TestInvalidBucketCountHistogram(t *testing.T) { require.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(` # HELP cortex_discarded_samples_total The total number of samples that were discarded. # TYPE cortex_discarded_samples_total counter - cortex_discarded_samples_total{group="group-1",reason="native_histogram_bucket_count_mismatch",user="user-1"} 2 + cortex_discarded_samples_total{group="group-1",reason="native_histogram_bucket_count_mismatch",user="user-1"} 1 `), "cortex_discarded_samples_total")) } From a2f7c2a78fb37c8f65d127e632938bab5b07ada9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 26 Mar 2024 18:42:00 +0100 Subject: [PATCH 5/5] Fix lint error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- pkg/distributor/validate.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index cc7a959e5cf..40c539d8364 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -258,9 +258,8 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } // Check that bucket counts including zero bucket add up to the overall count. - if s.IsFloatHistogram() { - // Do nothing. Due to floating point precision issues, we don't check the bucket count. - } else { + if !s.IsFloatHistogram() { + // Do nothing for float histograms, due to floating point precision issues, we don't check the bucket count. count := s.GetZeroCountInt() bucketCount := int64(0) for _, c := range s.GetNegativeDeltas() {