From 422d53a19f916632471b8a0f0ba5f33fe874408a Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 8 Oct 2024 19:26:54 +0100 Subject: [PATCH] bugfix: When OOO native histograms are disabled it should be a client error (#9567) * When OOO native histograms are disabled it should be a client error When ingesters replay data from Kafka, if they receive a non-client error they will continue to attempt to push the batch to the TSDB. In this PR, I'm translating Native Histogram OOO errors into client errors to ensure that we can continue with the next set of write requests when replaying data. Signed-off-by: gotjosh * Address review feedback Signed-off-by: gotjosh * Update docs/sources/mimir/manage/mimir-runbooks/_index.md Co-authored-by: Fiona Liao * lint Signed-off-by: gotjosh --------- Signed-off-by: gotjosh Co-authored-by: Fiona Liao (cherry picked from commit f737a5350960dfd5b39add659f63edf4fdda41bd) --- .../mimir/manage/mimir-runbooks/_index.md | 23 +++- pkg/ingester/ingester.go | 6 + pkg/ingester/ingester_test.go | 109 ++++++++++++++---- pkg/util/globalerror/user.go | 1 + 4 files changed, 114 insertions(+), 25 deletions(-) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 22e70b8c713..669f10b5db4 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1671,7 +1671,7 @@ The series containing such samples are skipped during ingestion, and valid serie ### err-mimir-native-histogram-count-mismatch -This non-critical error occures when Mimir receives a write request that contains a sample that is a native histogram +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where the buckets counts don't add up to the overall count recorded in the native histogram, provided that the overall sum is a regular float number. @@ -1685,7 +1685,7 @@ When `-ingester.error-sample-rate` is configured to a value greater than `0`, in ### err-mimir-native-histogram-count-not-big-enough -This non-critical error occures when Mimir receives a write request that contains a sample that is a native histogram +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where the buckets counts add up to a higher number than the overall count recorded in the native histogram, provided that the overall sum is not a float number (NaN). @@ -1699,7 +1699,7 @@ When `-ingester.error-sample-rate` is configured to a value greater than `0`, in ### err-mimir-native-histogram-negative-bucket-count -This non-critical error occures when Mimir receives a write request that contains a sample that is a native histogram +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where some bucket count is negative. {{< admonition type="note" >}} @@ -1712,7 +1712,7 @@ When `-ingester.error-sample-rate` is configured to a value greater than `0`, in ### err-mimir-native-histogram-span-negative-offset -This non-critical error occures when Mimir receives a write request that contains a sample that is a native histogram +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where a bucket span has a negative offset. {{< admonition type="note" >}} @@ -1725,7 +1725,7 @@ When `-ingester.error-sample-rate` is configured to a value greater than `0`, in ### err-mimir-native-histogram-spans-buckets-mismatch -This non-critical error occures when Mimir receives a write request that contains a sample that is a native histogram +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where the number of bucket counts does not agree with the number of buckets encoded in the bucket spans. {{< admonition type="note" >}} @@ -1736,6 +1736,19 @@ The series containing such samples are skipped during ingestion, and valid serie When `-ingester.error-sample-rate` is configured to a value greater than `0`, invalid native histogram errors are logged only once every `-ingester.error-sample-rate` times. {{< /admonition >}} +### err-mimir-native-histogram-ooo-disabled + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where another sample with a more recent timestamp has already been ingested and `-ingester.ooo-native-histograms-ingestion-enabled` is set to `false`. + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}} + +{{< admonition type="note" >}} +When `-ingester.error-sample-rate` is configured to a value greater than `0`, invalid native histogram errors are logged only once every `-ingester.error-sample-rate` times. +{{< /admonition >}} + ### err-mimir-label-invalid This non-critical error occurs when Mimir receives a write request that contains a series with an invalid label name. diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index b6363197237..57c2929c1e7 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -1344,6 +1344,12 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true // Map TSDB native histogram validation errors to soft errors. + case errors.Is(err, storage.ErrOOONativeHistogramsDisabled): + stats.sampleOutOfOrderCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels) + }) + return true case errors.Is(err, histogram.ErrHistogramCountMismatch): stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 80e6b658566..1fb4eb0b574 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -373,20 +373,21 @@ func TestIngester_Push(t *testing.T) { histogramWithSpansBucketsMismatch.PositiveSpans[1].Length++ tests := map[string]struct { - reqs []*mimirpb.WriteRequest - expectedErr error - expectedIngested model.Matrix - expectedMetadataIngested []*mimirpb.MetricMetadata - expectedExemplarsIngested []mimirpb.TimeSeries - expectedExemplarsDropped []mimirpb.TimeSeries - expectedMetrics string - additionalMetrics []string - disableActiveSeries bool - maxExemplars int - maxMetadataPerUser int - maxMetadataPerMetric int - nativeHistograms bool - ignoreOOOExemplars bool + reqs []*mimirpb.WriteRequest + expectedErr error + expectedIngested model.Matrix + expectedMetadataIngested []*mimirpb.MetricMetadata + expectedExemplarsIngested []mimirpb.TimeSeries + expectedExemplarsDropped []mimirpb.TimeSeries + expectedMetrics string + additionalMetrics []string + disableActiveSeries bool + maxExemplars int + maxMetadataPerUser int + maxMetadataPerMetric int + nativeHistograms bool + disableOOONativeHistograms bool + ignoreOOOExemplars bool }{ "should succeed on valid series and metadata": { reqs: []*mimirpb.WriteRequest{ @@ -1915,6 +1916,66 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 `, }, + "should soft fail if OOO native histograms are disabled": { + nativeHistograms: true, + disableOOONativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, util_test.GenerateTestHistogram(1))}, + nil, + ), + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(-10, util_test.GenerateTestHistogram(1))}, + nil, + ), + }, + expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, fmt.Errorf("out-of-order native histogram ingestion is disabled"), model.Time(-10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.InvalidArgument), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 1 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="sample-out-of-order",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. + # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 1 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 1 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + # HELP cortex_ingester_active_native_histogram_buckets Number of currently active native histogram buckets per user. + # TYPE cortex_ingester_active_native_histogram_buckets gauge + cortex_ingester_active_native_histogram_buckets{user="test"} 8 + # HELP cortex_ingester_active_native_histogram_series Number of currently active native histogram series per user. + # TYPE cortex_ingester_active_native_histogram_series gauge + cortex_ingester_active_native_histogram_series{user="test"} 1 + # HELP cortex_ingester_active_series Number of currently active series per user. + # TYPE cortex_ingester_active_series gauge + cortex_ingester_active_series{user="test"} 1 + `, + expectedIngested: model.Matrix{ + &model.SampleStream{Metric: metricLabelSet, Histograms: []model.SampleHistogramPair{ + {Histogram: mimirpb.FromHistogramToPromHistogram(util_test.GenerateTestHistogram(1)), Timestamp: 10}}, + }, + }, + }, "should soft fail if histogram has a negative bucket count": { nativeHistograms: true, reqs: []*mimirpb.WriteRequest{ @@ -3195,6 +3256,12 @@ func TestIngester_Push(t *testing.T) { limits.MaxGlobalMetadataPerMetric = testData.maxMetadataPerMetric limits.NativeHistogramsIngestionEnabled = testData.nativeHistograms limits.IgnoreOOOExemplars = testData.ignoreOOOExemplars + var oooTimeWindow int64 + if testData.disableOOONativeHistograms { + oooTimeWindow = int64(1 * time.Hour.Seconds()) + limits.OutOfOrderTimeWindow = model.Duration(1 * time.Hour) + limits.OOONativeHistogramsIngestionEnabled = false + } i, err := prepareIngesterWithBlocksStorageAndLimits(t, cfg, limits, nil, "", registry) require.NoError(t, err) @@ -3222,10 +3289,11 @@ func TestIngester_Push(t *testing.T) { if testData.expectedErr == nil { assert.NoError(t, err) } else { + require.Error(t, err) handledErr := mapPushErrorToErrorWithStatus(err) errWithStatus, ok := handledErr.(globalerror.ErrorWithStatus) - assert.True(t, ok) - assert.True(t, errWithStatus.Equals(testData.expectedErr)) + require.True(t, ok) + require.Truef(t, errWithStatus.Equals(testData.expectedErr), "errors don't match \nactual: '%v'\nexpected: '%v'", errWithStatus, testData.expectedErr) } } } @@ -3244,7 +3312,7 @@ func TestIngester_Push(t *testing.T) { if len(res) == 0 { res = nil } - assert.Equal(t, testData.expectedIngested, res) + require.Equal(t, testData.expectedIngested, res) // Read back samples to see what has been really ingested exemplarRes, err := i.QueryExemplars(ctx, &client.ExemplarQueryRequest{ @@ -3308,9 +3376,10 @@ func TestIngester_Push(t *testing.T) { assert.Equal(t, int64(expectedTenantsCount), usagestats.GetInt(memoryTenantsStatsName).Value()) assert.Equal(t, int64(expectedSamplesCount)+appendedSamplesStatsBefore, usagestats.GetCounter(appendedSamplesStatsName).Total()) assert.Equal(t, int64(expectedExemplarsCount)+appendedExemplarsStatsBefore, usagestats.GetCounter(appendedExemplarsStatsName).Total()) - assert.Equal(t, int64(0), usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value()) - assert.Equal(t, int64(0), usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value()) - assert.Equal(t, int64(0), usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value()) + + assert.Equal(t, testData.disableOOONativeHistograms, usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value() == int64(1)) + assert.Equal(t, oooTimeWindow, usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value()) + assert.Equal(t, oooTimeWindow, usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value()) }) } } diff --git a/pkg/util/globalerror/user.go b/pkg/util/globalerror/user.go index 6985ee0fea5..3e0648cffaa 100644 --- a/pkg/util/globalerror/user.go +++ b/pkg/util/globalerror/user.go @@ -85,6 +85,7 @@ const ( NativeHistogramNegativeBucketCount ID = "native-histogram-negative-bucket-count" NativeHistogramSpanNegativeOffset ID = "native-histogram-span-negative-offset" NativeHistogramSpansBucketsMismatch ID = "native-histogram-spans-buckets-mismatch" + NativeHistogramOOODisabled ID = "native-histogram-ooo-disabled" // Alertmanager errors AlertmanagerMaxGrafanaConfigSize ID = "alertmanager-max-grafana-config-size"