Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Distributor: add bucket count validation to native histograms #7736

Merged
merged 5 commits into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
* [CHANGE] The configuration option `-querier.max-query-into-future` has been deprecated and will be removed in Mimir 2.14. #7496
* [CHANGE] Distributor: the metric `cortex_distributor_sample_delay_seconds` has been deprecated and will be removed in Mimir 2.14. #7516
* [CHANGE] Query-frontend: The deprecated YAML setting `frontend.cache_unaligned_requests` has been moved to `limits.cache_unaligned_requests`. #7519
* [CHANGE] Distributor: validate that in integer native histograms the zero, negative and positive bucket counts add up to the overall count of the histogram. Such errors are now reported as 4xx and not 5xx and show up in the `cortex_discarded_samples_total` with the label `reason="native_histogram_bucket_count_mismatch"`. #7736
* [FEATURE] Introduce `-server.log-source-ips-full` option to log all IPs from `Forwarded`, `X-Real-IP`, `X-Forwarded-For` headers. #7250
* [FEATURE] Introduce `-tenant-federation.max-tenants` option to limit the max number of tenants allowed for requests when federation is enabled. #6959
* [FEATURE] Cardinality API: added a new `count_method` parameter which enables counting active label names. #7085
Expand Down
8 changes: 8 additions & 0 deletions docs/sources/mimir/manage/mimir-runbooks/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -1385,6 +1385,14 @@ This non-critical error occurs when Mimir receives a write request that contains
The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
{{< /admonition >}}

### err-mimir-native-histogram-bucket-count-mismatch

This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where the zero, positive and negative bucket counts do not add up to the overall count of the native histogram.

{{< admonition type="note" >}}
The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
{{< /admonition >}}

### err-mimir-label-invalid

This non-critical error occurs when Mimir receives a write request that contains a series with an invalid label name.
Expand Down
26 changes: 26 additions & 0 deletions pkg/distributor/validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ var (
reasonLabelValueTooLong = globalerror.SeriesLabelValueTooLong.LabelValue()
reasonMaxNativeHistogramBuckets = globalerror.MaxNativeHistogramBuckets.LabelValue()
reasonInvalidNativeHistogramSchema = globalerror.InvalidSchemaNativeHistogram.LabelValue()
reasonBucketCountMismatch = globalerror.BucketCountMismatch.LabelValue()
reasonDuplicateLabelNames = globalerror.SeriesWithDuplicateLabelNames.LabelValue()
reasonTooFarInFuture = globalerror.SampleTooFarInFuture.LabelValue()

Expand Down Expand Up @@ -79,6 +80,7 @@ var (
maxNativeHistogramBucketsMsgFormat = globalerror.MaxNativeHistogramBuckets.Message("received a native histogram sample with too many buckets, timestamp: %d series: %s, buckets: %d, limit: %d")
notReducibleNativeHistogramMsgFormat = globalerror.NotReducibleNativeHistogram.Message("received a native histogram sample with too many buckets and cannot reduce, timestamp: %d series: %s, buckets: %d, limit: %d")
invalidSchemaNativeHistogramMsgFormat = globalerror.InvalidSchemaNativeHistogram.Message("received a native histogram sample with an invalid schema: %d")
bucketCountMismatchMsgFormat = globalerror.BucketCountMismatch.Message("native histogram bucket count mismatch, timestamp: %d, series: %s, expected %v, got %v")
sampleTimestampTooNewMsgFormat = globalerror.SampleTooFarInFuture.MessageWithPerTenantLimitConfig(
"received a sample whose timestamp is too far in the future, timestamp: %d series: '%.200s'",
validation.CreationGracePeriodFlag,
Expand Down Expand Up @@ -121,6 +123,7 @@ type sampleValidationMetrics struct {
labelValueTooLong *prometheus.CounterVec
maxNativeHistogramBuckets *prometheus.CounterVec
invalidNativeHistogramSchema *prometheus.CounterVec
bucketCountMismatch *prometheus.CounterVec
duplicateLabelNames *prometheus.CounterVec
tooFarInFuture *prometheus.CounterVec
}
Expand All @@ -135,6 +138,7 @@ func (m *sampleValidationMetrics) deleteUserMetrics(userID string) {
m.labelValueTooLong.DeletePartialMatch(filter)
m.maxNativeHistogramBuckets.DeletePartialMatch(filter)
m.invalidNativeHistogramSchema.DeletePartialMatch(filter)
m.bucketCountMismatch.DeletePartialMatch(filter)
m.duplicateLabelNames.DeletePartialMatch(filter)
m.tooFarInFuture.DeletePartialMatch(filter)
}
Expand All @@ -148,6 +152,7 @@ func (m *sampleValidationMetrics) deleteUserMetricsForGroup(userID, group string
m.labelValueTooLong.DeleteLabelValues(userID, group)
m.maxNativeHistogramBuckets.DeleteLabelValues(userID, group)
m.invalidNativeHistogramSchema.DeleteLabelValues(userID, group)
m.bucketCountMismatch.DeleteLabelValues(userID, group)
m.duplicateLabelNames.DeleteLabelValues(userID, group)
m.tooFarInFuture.DeleteLabelValues(userID, group)
}
Expand All @@ -162,6 +167,7 @@ func newSampleValidationMetrics(r prometheus.Registerer) *sampleValidationMetric
labelValueTooLong: validation.DiscardedSamplesCounter(r, reasonLabelValueTooLong),
maxNativeHistogramBuckets: validation.DiscardedSamplesCounter(r, reasonMaxNativeHistogramBuckets),
invalidNativeHistogramSchema: validation.DiscardedSamplesCounter(r, reasonInvalidNativeHistogramSchema),
bucketCountMismatch: validation.DiscardedSamplesCounter(r, reasonBucketCountMismatch),
duplicateLabelNames: validation.DiscardedSamplesCounter(r, reasonDuplicateLabelNames),
tooFarInFuture: validation.DiscardedSamplesCounter(r, reasonTooFarInFuture),
}
Expand Down Expand Up @@ -251,6 +257,26 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam
}
}

// Check that bucket counts including zero bucket add up to the overall count.
if !s.IsFloatHistogram() {
// Do nothing for float histograms, due to floating point precision issues, we don't check the bucket count.
count := s.GetZeroCountInt()
bucketCount := int64(0)
for _, c := range s.GetNegativeDeltas() {
bucketCount += c
count += uint64(bucketCount)
}
bucketCount = int64(0)
for _, c := range s.GetPositiveDeltas() {
bucketCount += c
count += uint64(bucketCount)
}
if count != s.GetCountInt() {
m.bucketCountMismatch.WithLabelValues(userID, group).Inc()
return fmt.Errorf(bucketCountMismatchMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), s.GetCountInt(), count)
}
}

return nil
}

Expand Down
94 changes: 94 additions & 0 deletions pkg/distributor/validate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,100 @@ func TestInvalidNativeHistogramSchema(t *testing.T) {
`), "cortex_discarded_samples_total"))
}

func TestInvalidBucketCountHistogram(t *testing.T) {
testCases := map[string]struct {
h *mimirpb.Histogram
expectedError error
}{
"a valid zero counts causes no error": {
h: &mimirpb.Histogram{},
expectedError: nil,
},
"a valid integer histogram causes no error": {
h: &mimirpb.Histogram{
Count: &mimirpb.Histogram_CountInt{CountInt: 5},
Sum: 10,
Schema: 1,
ZeroThreshold: 0.001,
ZeroCount: &mimirpb.Histogram_ZeroCountInt{ZeroCountInt: 1},
NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}},
NegativeDeltas: []int64{1, 1},
PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}},
PositiveDeltas: []int64{1},
ResetHint: mimirpb.Histogram_UNKNOWN,
Timestamp: 0,
},
expectedError: nil,
},
"a valid float histogram causes no error": {
h: &mimirpb.Histogram{
Count: &mimirpb.Histogram_CountFloat{CountFloat: 5.5},
Sum: 10,
Schema: 1,
ZeroThreshold: 0.001,
ZeroCount: &mimirpb.Histogram_ZeroCountFloat{ZeroCountFloat: 1.5},
NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}},
NegativeCounts: []float64{1.0, 2.0},
PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}},
PositiveCounts: []float64{1.0},
ResetHint: mimirpb.Histogram_UNKNOWN,
Timestamp: 0,
},
expectedError: nil,
},
"an integer histogram with the wrong overall count": {
h: &mimirpb.Histogram{
Count: &mimirpb.Histogram_CountInt{CountInt: 4},
Sum: 10,
Schema: 1,
ZeroThreshold: 0.001,
ZeroCount: &mimirpb.Histogram_ZeroCountInt{ZeroCountInt: 1},
NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}},
NegativeDeltas: []int64{1, 1},
PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}},
PositiveDeltas: []int64{1},
ResetHint: mimirpb.Histogram_UNKNOWN,
Timestamp: 0,
},
expectedError: fmt.Errorf("native histogram bucket count mismatch, timestamp: 0, series: a{a=\"a\"}, expected 4, got 5 (err-mimir-native-histogram-bucket-count-mismatch)"),
},
"a float histogram with the wrong overall count": {
h: &mimirpb.Histogram{
Count: &mimirpb.Histogram_CountFloat{CountFloat: 4.5},
Sum: 10,
Schema: 1,
ZeroThreshold: 0.001,
ZeroCount: &mimirpb.Histogram_ZeroCountFloat{ZeroCountFloat: 1.5},
NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}},
NegativeCounts: []float64{1.0, 2.0},
PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}},
PositiveCounts: []float64{1.0},
ResetHint: mimirpb.Histogram_UNKNOWN,
Timestamp: 0,
},
// Due to floating point precision issues, this case is not an error at the moment.
expectedError: nil,
},
}

registry := prometheus.NewRegistry()
metrics := newSampleValidationMetrics(registry)
cfg := sampleValidationCfg{}
labels := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}}
for testName, testCase := range testCases {
t.Run(testName, func(t *testing.T) {
err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, testCase.h)
require.Equal(t, testCase.expectedError, err)
})
}

require.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(`
# HELP cortex_discarded_samples_total The total number of samples that were discarded.
# TYPE cortex_discarded_samples_total counter
cortex_discarded_samples_total{group="group-1",reason="native_histogram_bucket_count_mismatch",user="user-1"} 1
`), "cortex_discarded_samples_total"))
}

func tooManyLabelsArgs(series []mimirpb.LabelAdapter, limit int) []any {
metric := mimirpb.FromLabelAdaptersToMetric(series).String()
ellipsis := ""
Expand Down
1 change: 1 addition & 0 deletions pkg/util/globalerror/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ const (
MaxNativeHistogramBuckets ID = "max-native-histogram-buckets"
NotReducibleNativeHistogram ID = "not-reducible-native-histogram"
InvalidSchemaNativeHistogram ID = "invalid-native-histogram-schema"
BucketCountMismatch ID = "native-histogram-bucket-count-mismatch"
SeriesInvalidLabel ID = "label-invalid"
SeriesLabelNameTooLong ID = "label-name-too-long"
SeriesLabelValueTooLong ID = "label-value-too-long"
Expand Down
Loading