Skip to content

Commit

Permalink
Cache non-transient error responses from the query-frontend (#9028)
Browse files Browse the repository at this point in the history
* Cache non-transient error responses from the query-frontend

Create a new query-frontend middleware that caches errors returned by
queries when they are non-transient and would fail in the same way if executed again.
This allows us to save work when running a query that hits, e.g., a limit
error: running the query again will not help and is a waste of work.

See #2676

See #7340

Signed-off-by: Nick Pillitteri <[email protected]>

* Set default value for TTL option correctly

Signed-off-by: Nick Pillitteri <[email protected]>

* Code review changes

Signed-off-by: Nick Pillitteri <[email protected]>

---------

Signed-off-by: Nick Pillitteri <[email protected]>
  • Loading branch information
56quarters authored Sep 25, 2024
1 parent 286c23c commit 992efbb
Show file tree
Hide file tree
Showing 16 changed files with 887 additions and 94 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
* `-query-scheduler.grpc-client-config.grpc-compression=s2`
* `-ruler.client.grpc-compression=s2`
* `-ruler.query-frontend.grpc-client-config.grpc-compression=s2`
* [FEATURE] Query-frontend: added experimental configuration options `query-frontend.cache-errors` and `query-frontend.results-cache-ttl-for-errors` to allow non-transient error responses to be cached. When set to `true`, error responses from hitting limits or bad data are cached for a short TTL. #9028
* [ENHANCEMENT] Compactor: Add `cortex_compactor_compaction_job_duration_seconds` and `cortex_compactor_compaction_job_blocks` histogram metrics to track duration of individual compaction jobs and number of blocks per job. #8371
* [ENHANCEMENT] Rules: Added per namespace max rules per rule group limit. The maximum number of rules per rule groups for all namespaces continues to be configured by `-ruler.max-rules-per-rule-group`, but now, this can be superseded by the new `-ruler.max-rules-per-rule-group-by-namespace` option on a per namespace basis. This new limit can be overridden using the overrides mechanism to be applied per-tenant. #8378
* [ENHANCEMENT] Rules: Added per namespace max rule groups per tenant limit. The maximum number of rule groups per rule tenant for all namespaces continues to be configured by `-ruler.max-rule-groups-per-tenant`, but now, this can be superseded by the new `-ruler.max-rule-groups-per-tenant-by-namespace` option on a per namespace basis. This new limit can be overridden using the overrides mechanism to be applied per-tenant. #8425
Expand Down
22 changes: 22 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -4232,6 +4232,17 @@
"fieldFlag": "query-frontend.results-cache-ttl-for-labels-query",
"fieldType": "duration"
},
{
"kind": "field",
"name": "results_cache_ttl_for_errors",
"required": false,
"desc": "Time to live duration for cached non-transient errors",
"fieldValue": null,
"fieldDefaultValue": 300000000000,
"fieldFlag": "query-frontend.results-cache-ttl-for-errors",
"fieldType": "duration",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "cache_unaligned_requests",
Expand Down Expand Up @@ -6302,6 +6313,17 @@
"fieldFlag": "query-frontend.cache-results",
"fieldType": "boolean"
},
{
"kind": "field",
"name": "cache_errors",
"required": false,
"desc": "Cache non-transient errors from queries.",
"fieldValue": null,
"fieldDefaultValue": false,
"fieldFlag": "query-frontend.cache-errors",
"fieldType": "boolean",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "max_retries",
Expand Down
4 changes: 4 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -2019,6 +2019,8 @@ Usage of ./cmd/mimir/mimir:
[experimental] Timeout for writing active series responses. 0 means the value from `-server.http-write-timeout` is used. (default 5m0s)
-query-frontend.align-queries-with-step
Mutate incoming queries to align their start and end with their step to improve result caching.
-query-frontend.cache-errors
[experimental] Cache non-transient errors from queries.
-query-frontend.cache-results
Cache query results.
-query-frontend.cache-unaligned-requests
Expand Down Expand Up @@ -2117,6 +2119,8 @@ Usage of ./cmd/mimir/mimir:
Time to live duration for cached query results. If query falls into out-of-order time window, -query-frontend.results-cache-ttl-for-out-of-order-time-window is used instead. (default 1w)
-query-frontend.results-cache-ttl-for-cardinality-query duration
Time to live duration for cached cardinality query results. The value 0 disables the cache.
-query-frontend.results-cache-ttl-for-errors duration
[experimental] Time to live duration for cached non-transient errors (default 5m)
-query-frontend.results-cache-ttl-for-labels-query duration
Time to live duration for cached label names and label values query results. The value 0 disables the cache.
-query-frontend.results-cache-ttl-for-out-of-order-time-window duration
Expand Down
1 change: 1 addition & 0 deletions docs/sources/mimir/configure/about-versioning.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ The following features are currently experimental:
- Query blocking on a per-tenant basis (configured with the limit `blocked_queries`)
- Sharding of active series queries (`-query-frontend.shard-active-series-queries`)
- Server-side write timeout for responses to active series requests (`-query-frontend.active-series-write-timeout`)
- Caching of non-transient error responses (`-query-frontend.cache-errors`, `-query-frontend.results-cache-ttl-for-errors`)
- Query-scheduler
- `-query-scheduler.querier-forget-delay`
- Store-gateway
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1644,6 +1644,10 @@ results_cache:
# CLI flag: -query-frontend.cache-results
[cache_results: <boolean> | default = false]

# (experimental) Cache non-transient errors from queries.
# CLI flag: -query-frontend.cache-errors
[cache_errors: <boolean> | default = false]

# (advanced) Maximum number of retries for a single request; beyond this, the
# downstream error is returned.
# CLI flag: -query-frontend.max-retries-per-request
Expand Down Expand Up @@ -3451,6 +3455,10 @@ The `limits` block configures default and per-tenant limits imposed by component
# CLI flag: -query-frontend.results-cache-ttl-for-labels-query
[results_cache_ttl_for_labels_query: <duration> | default = 0s]
# (experimental) Time to live duration for cached non-transient errors
# CLI flag: -query-frontend.results-cache-ttl-for-errors
[results_cache_ttl_for_errors: <duration> | default = 5m]
# (advanced) Cache requests that are not step-aligned.
# CLI flag: -query-frontend.cache-unaligned-requests
[cache_unaligned_requests: <boolean> | default = false]
Expand Down
184 changes: 184 additions & 0 deletions pkg/frontend/querymiddleware/error_caching.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
// SPDX-License-Identifier: AGPL-3.0-only

package querymiddleware

import (
"context"
"errors"
"fmt"
"time"

"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/gogo/protobuf/proto"
"github.com/grafana/dskit/cache"
"github.com/grafana/dskit/tenant"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

apierror "github.com/grafana/mimir/pkg/api/error"
"github.com/grafana/mimir/pkg/util/spanlogger"
"github.com/grafana/mimir/pkg/util/validation"
)

// Reasons recorded by the "cortex_frontend_query_error_cache_store_skipped_total"
// metric when an error response is not stored in the cache.
const (
	reasonNotAPIError       = "not-api-error"           // error did not unwrap to an *apierror.APIError
	reasonNotCacheableError = "not-cacheable-api-error" // API error type is not one of the cacheable types
)

// newErrorCachingMiddleware returns a middleware that stores non-transient error
// responses from queries in the results cache and serves them on subsequent
// identical requests, avoiding re-running queries that are certain to fail again.
func newErrorCachingMiddleware(cache cache.Cache, limits Limits, shouldCacheReq shouldCacheFn, logger log.Logger, reg prometheus.Registerer) MetricsQueryMiddleware {
	loadAttempts := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "cortex_frontend_query_error_cache_requests_total",
		Help: "Number of requests that check the results cache for an error.",
	})
	loadHits := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "cortex_frontend_query_error_cache_hits_total",
		Help: "Number of hits for the errors in the results cache.",
	})
	storeAttempts := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "cortex_frontend_query_error_cache_store_requests_total",
		Help: "Number of requests that resulted in an error.",
	})
	storeSkips := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Name: "cortex_frontend_query_error_cache_store_skipped_total",
		Help: "Number of requests that resulted in an error that was not stored in the results cache.",
	}, []string{"reason"})

	return MetricsQueryMiddlewareFunc(func(next MetricsQueryHandler) MetricsQueryHandler {
		return &errorCachingHandler{
			next:                next,
			cache:               cache,
			limits:              limits,
			shouldCacheReq:      shouldCacheReq,
			logger:              logger,
			cacheLoadAttempted:  loadAttempts,
			cacheLoadHits:       loadHits,
			cacheStoreAttempted: storeAttempts,
			cacheStoreSkipped:   storeSkips,
		}
	})
}

// errorCachingHandler is a MetricsQueryHandler that caches non-transient API
// errors returned by the next handler and serves them for repeated requests.
type errorCachingHandler struct {
	next           MetricsQueryHandler // downstream handler, run on cache miss
	cache          cache.Cache         // results cache used to load and store errors
	limits         Limits              // per-tenant limits; source of the error TTL
	shouldCacheReq shouldCacheFn       // per-request opt-out of caching
	logger         log.Logger

	cacheLoadAttempted  prometheus.Counter     // cache lookups performed
	cacheLoadHits       prometheus.Counter     // cache lookups that returned an error
	cacheStoreAttempted prometheus.Counter     // requests that resulted in an error
	cacheStoreSkipped   *prometheus.CounterVec // errors not stored, by reason
}

// Do serves a cached error for this request when one exists; otherwise it runs
// the query and, if the result is a cacheable (non-transient) API error,
// stores that error for subsequent requests.
func (e *errorCachingHandler) Do(ctx context.Context, request MetricsQueryRequest) (Response, error) {
	spanLog := spanlogger.FromContext(ctx, e.logger)

	tenantIDs, err := tenant.TenantIDs(ctx)
	if err != nil {
		// Without tenant information we cannot build a cache key, so just run the query.
		return e.next.Do(ctx, request)
	}

	// Caching may have been disabled via an option on the request.
	if !e.shouldCacheReq(request) {
		return e.next.Do(ctx, request)
	}

	e.cacheLoadAttempted.Inc()
	key := errorCachingKey(tenant.JoinTenantIDs(tenantIDs), request)
	hashedKey := cacheHashKey(key)

	if cachedErr := e.loadErrorFromCache(ctx, key, hashedKey, spanLog); cachedErr != nil {
		e.cacheLoadHits.Inc()
		spanLog.DebugLog(
			"msg", "returned cached API error",
			"error_type", cachedErr.Type,
			"key", key,
			"hashed_key", hashedKey,
		)
		return nil, cachedErr
	}

	res, err := e.next.Do(ctx, request)
	if err == nil {
		return res, nil
	}

	e.cacheStoreAttempted.Inc()

	// Only API errors carry the type information needed to decide cacheability.
	var apiErr *apierror.APIError
	if !errors.As(err, &apiErr) {
		e.cacheStoreSkipped.WithLabelValues(reasonNotAPIError).Inc()
		return res, err
	}

	cacheable, reason := e.isCacheable(apiErr)
	if !cacheable {
		e.cacheStoreSkipped.WithLabelValues(reason).Inc()
		spanLog.DebugLog(
			"msg", "error result from request is not cacheable",
			"error_type", apiErr.Type,
			"reason", reason,
		)
		return res, err
	}

	// Use the most restrictive TTL across all tenants in the request.
	ttl := validation.MinDurationPerTenant(tenantIDs, e.limits.ResultsCacheTTLForErrors)
	e.storeErrorToCache(key, hashedKey, ttl, apiErr, spanLog)
	return res, err
}

// loadErrorFromCache returns a previously cached API error for the given key,
// or nil when there is no cached entry, the entry cannot be decoded, or the
// entry was stored for a different key (hash collision on hashedKey).
func (e *errorCachingHandler) loadErrorFromCache(ctx context.Context, key, hashedKey string, spanLog *spanlogger.SpanLogger) *apierror.APIError {
	res := e.cache.GetMulti(ctx, []string{hashedKey})
	cached, ok := res[hashedKey]
	if !ok {
		return nil
	}

	var cachedError CachedError
	if err := proto.Unmarshal(cached, &cachedError); err != nil {
		// Fixed typo in log message: "unmarshall" -> "unmarshal".
		level.Warn(spanLog).Log("msg", "unable to unmarshal cached error", "err", err)
		return nil
	}

	// The stored entry records the full key it was written for, which must
	// match the key for this request; otherwise treat it as a cache miss.
	if cachedError.GetKey() != key {
		spanLog.DebugLog(
			"msg", "cached error key does not match",
			"expected_key", key,
			"actual_key", cachedError.GetKey(),
			"hashed_key", hashedKey,
		)
		return nil
	}

	return apierror.New(apierror.Type(cachedError.ErrorType), cachedError.ErrorMessage)
}

// storeErrorToCache asynchronously writes the given API error to the cache
// under hashedKey with the given TTL. Marshalling failures are logged and the
// error is simply not cached.
func (e *errorCachingHandler) storeErrorToCache(key, hashedKey string, ttl time.Duration, apiErr *apierror.APIError, spanLog *spanlogger.SpanLogger) {
	entry := &CachedError{
		Key:          key,
		ErrorType:    string(apiErr.Type),
		ErrorMessage: apiErr.Message,
	}

	bytes, err := proto.Marshal(entry)
	if err != nil {
		level.Warn(spanLog).Log("msg", "unable to marshal cached error", "err", err)
		return
	}

	e.cache.SetAsync(hashedKey, bytes, ttl)
}

// isCacheable returns true when the API error is of a type considered
// non-transient (bad data, execution errors, too-large entries) and hence safe
// to cache; otherwise it returns false and the reason for skipping.
func (e *errorCachingHandler) isCacheable(apiErr *apierror.APIError) (bool, string) {
	switch apiErr.Type {
	case apierror.TypeBadData, apierror.TypeExec, apierror.TypeTooLargeEntry:
		return true, ""
	default:
		return false, reasonNotCacheableError
	}
}

// errorCachingKey returns the key for caching an error query response.
// Standalone function to allow for easier testing.
func errorCachingKey(tenantID string, r MetricsQueryRequest) string {
	start, end, step := r.GetStart(), r.GetEnd(), r.GetStep()
	return fmt.Sprintf("EC:%s:%s:%d:%d:%d", tenantID, r.GetQuery(), start, end, step)
}
Loading

0 comments on commit 992efbb

Please sign in to comment.