diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 22e70b8c713..669f10b5db4 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1671,7 +1671,7 @@ The series containing such samples are skipped during ingestion, and valid serie ### err-mimir-native-histogram-count-mismatch -This non-critical error occures when Mimir receives a write request that contains a sample that is a native histogram +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where the buckets counts don't add up to the overall count recorded in the native histogram, provided that the overall sum is a regular float number. @@ -1685,7 +1685,7 @@ When `-ingester.error-sample-rate` is configured to a value greater than `0`, in ### err-mimir-native-histogram-count-not-big-enough -This non-critical error occures when Mimir receives a write request that contains a sample that is a native histogram +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where the buckets counts add up to a higher number than the overall count recorded in the native histogram, provided that the overall sum is not a float number (NaN). @@ -1699,7 +1699,7 @@ When `-ingester.error-sample-rate` is configured to a value greater than `0`, in ### err-mimir-native-histogram-negative-bucket-count -This non-critical error occures when Mimir receives a write request that contains a sample that is a native histogram +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where some bucket count is negative. {{< admonition type="note" >}} @@ -1712,7 +1712,7 @@ When `-ingester.error-sample-rate` is configured to a value greater than `0`, in ### err-mimir-native-histogram-span-negative-offset -This non-critical error occures when Mimir receives a write request that contains a sample that is a native histogram +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where a bucket span has a negative offset. {{< admonition type="note" >}} @@ -1725,7 +1725,7 @@ When `-ingester.error-sample-rate` is configured to a value greater than `0`, in ### err-mimir-native-histogram-spans-buckets-mismatch -This non-critical error occures when Mimir receives a write request that contains a sample that is a native histogram +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where the number of bucket counts does not agree with the number of buckets encoded in the bucket spans. {{< admonition type="note" >}} @@ -1736,6 +1736,19 @@ The series containing such samples are skipped during ingestion, and valid serie When `-ingester.error-sample-rate` is configured to a value greater than `0`, invalid native histogram errors are logged only once every `-ingester.error-sample-rate` times. {{< /admonition >}} +### err-mimir-native-histogram-ooo-disabled + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where another sample with a more recent timestamp has already been ingested and `-ingester.ooo-native-histograms-ingestion-enabled` is set to `false`. + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}} + +{{< admonition type="note" >}} +When `-ingester.error-sample-rate` is configured to a value greater than `0`, invalid native histogram errors are logged only once every `-ingester.error-sample-rate` times. +{{< /admonition >}} + ### err-mimir-label-invalid This non-critical error occurs when Mimir receives a write request that contains a series with an invalid label name. diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index b6363197237..57c2929c1e7 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -1344,6 +1344,12 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true // Map TSDB native histogram validation errors to soft errors. + case errors.Is(err, storage.ErrOOONativeHistogramsDisabled): + stats.sampleOutOfOrderCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels) + }) + return true case errors.Is(err, histogram.ErrHistogramCountMismatch): stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 80e6b658566..1fb4eb0b574 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -373,20 +373,21 @@ func TestIngester_Push(t *testing.T) { histogramWithSpansBucketsMismatch.PositiveSpans[1].Length++ tests := map[string]struct { - reqs []*mimirpb.WriteRequest - expectedErr error - expectedIngested model.Matrix - expectedMetadataIngested []*mimirpb.MetricMetadata - expectedExemplarsIngested []mimirpb.TimeSeries - expectedExemplarsDropped []mimirpb.TimeSeries - expectedMetrics string - additionalMetrics []string - disableActiveSeries bool - maxExemplars int - maxMetadataPerUser int - maxMetadataPerMetric int - nativeHistograms bool - ignoreOOOExemplars bool + reqs []*mimirpb.WriteRequest + expectedErr error + expectedIngested model.Matrix + expectedMetadataIngested []*mimirpb.MetricMetadata + expectedExemplarsIngested []mimirpb.TimeSeries + expectedExemplarsDropped []mimirpb.TimeSeries + expectedMetrics string + additionalMetrics []string + disableActiveSeries bool + maxExemplars int + maxMetadataPerUser int + maxMetadataPerMetric int + nativeHistograms bool + disableOOONativeHistograms bool + ignoreOOOExemplars bool }{ "should succeed on valid series and metadata": { reqs: []*mimirpb.WriteRequest{ @@ -1915,6 +1916,66 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 `, }, + "should soft fail if OOO native histograms are disabled": { + nativeHistograms: true, + disableOOONativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, util_test.GenerateTestHistogram(1))}, + nil, + ), + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(-10, util_test.GenerateTestHistogram(1))}, + nil, + ), + }, + expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, fmt.Errorf("out-of-order native histogram ingestion is disabled"), model.Time(-10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.InvalidArgument), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 1 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="sample-out-of-order",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. + # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 1 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 1 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + # HELP cortex_ingester_active_native_histogram_buckets Number of currently active native histogram buckets per user. + # TYPE cortex_ingester_active_native_histogram_buckets gauge + cortex_ingester_active_native_histogram_buckets{user="test"} 8 + # HELP cortex_ingester_active_native_histogram_series Number of currently active native histogram series per user. + # TYPE cortex_ingester_active_native_histogram_series gauge + cortex_ingester_active_native_histogram_series{user="test"} 1 + # HELP cortex_ingester_active_series Number of currently active series per user. + # TYPE cortex_ingester_active_series gauge + cortex_ingester_active_series{user="test"} 1 + `, + expectedIngested: model.Matrix{ + &model.SampleStream{Metric: metricLabelSet, Histograms: []model.SampleHistogramPair{ + {Histogram: mimirpb.FromHistogramToPromHistogram(util_test.GenerateTestHistogram(1)), Timestamp: 10}}, + }, + }, + }, "should soft fail if histogram has a negative bucket count": { nativeHistograms: true, reqs: []*mimirpb.WriteRequest{ @@ -3195,6 +3256,12 @@ func TestIngester_Push(t *testing.T) { limits.MaxGlobalMetadataPerMetric = testData.maxMetadataPerMetric limits.NativeHistogramsIngestionEnabled = testData.nativeHistograms limits.IgnoreOOOExemplars = testData.ignoreOOOExemplars + var oooTimeWindow int64 + if testData.disableOOONativeHistograms { + oooTimeWindow = int64(1 * time.Hour.Seconds()) + limits.OutOfOrderTimeWindow = model.Duration(1 * time.Hour) + limits.OOONativeHistogramsIngestionEnabled = false + } i, err := prepareIngesterWithBlocksStorageAndLimits(t, cfg, limits, nil, "", registry) require.NoError(t, err) @@ -3222,10 +3289,11 @@ func TestIngester_Push(t *testing.T) { if testData.expectedErr == nil { assert.NoError(t, err) } else { + require.Error(t, err) handledErr := mapPushErrorToErrorWithStatus(err) errWithStatus, ok := handledErr.(globalerror.ErrorWithStatus) - assert.True(t, ok) - assert.True(t, errWithStatus.Equals(testData.expectedErr)) + require.True(t, ok) + require.Truef(t, errWithStatus.Equals(testData.expectedErr), "errors don't match \nactual: '%v'\nexpected: '%v'", errWithStatus, testData.expectedErr) } } } @@ -3244,7 +3312,7 @@ func TestIngester_Push(t *testing.T) { if len(res) == 0 { res = nil } - assert.Equal(t, testData.expectedIngested, res) + require.Equal(t, testData.expectedIngested, res) // Read back samples to see what has been really ingested exemplarRes, err := i.QueryExemplars(ctx, &client.ExemplarQueryRequest{ @@ -3308,9 +3376,10 @@ func TestIngester_Push(t *testing.T) { assert.Equal(t, int64(expectedTenantsCount), usagestats.GetInt(memoryTenantsStatsName).Value()) assert.Equal(t, int64(expectedSamplesCount)+appendedSamplesStatsBefore, usagestats.GetCounter(appendedSamplesStatsName).Total()) assert.Equal(t, int64(expectedExemplarsCount)+appendedExemplarsStatsBefore, usagestats.GetCounter(appendedExemplarsStatsName).Total()) - assert.Equal(t, int64(0), usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value()) - assert.Equal(t, int64(0), usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value()) - assert.Equal(t, int64(0), usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value()) + + assert.Equal(t, testData.disableOOONativeHistograms, usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value() == int64(1)) + assert.Equal(t, oooTimeWindow, usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value()) + assert.Equal(t, oooTimeWindow, usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value()) }) } } diff --git a/pkg/util/globalerror/user.go b/pkg/util/globalerror/user.go index 6985ee0fea5..3e0648cffaa 100644 --- a/pkg/util/globalerror/user.go +++ b/pkg/util/globalerror/user.go @@ -85,6 +85,7 @@ const ( NativeHistogramNegativeBucketCount ID = "native-histogram-negative-bucket-count" NativeHistogramSpanNegativeOffset ID = "native-histogram-span-negative-offset" NativeHistogramSpansBucketsMismatch ID = "native-histogram-spans-buckets-mismatch" + NativeHistogramOOODisabled ID = "native-histogram-ooo-disabled" // Alertmanager errors AlertmanagerMaxGrafanaConfigSize ID = "alertmanager-max-grafana-config-size"