Skip to content

Commit

Permalink
Cache non-transient error responses from the query-frontend (#9028)
Browse files Browse the repository at this point in the history
* Cache non-transient error responses from the query-frontend

Create a new query-frontend middleware that caches errors returned by
queries when they are non-transient and would fail in the same way if executed again.
This allows us to save work when running a query that hits, e.g., a limit
error: running the query again will not help and is a waste of work.

See #2676

See #7340

Signed-off-by: Nick Pillitteri <[email protected]>

* Set default value for TTL option correctly

Signed-off-by: Nick Pillitteri <[email protected]>

* Code review changes

Signed-off-by: Nick Pillitteri <[email protected]>

---------

Signed-off-by: Nick Pillitteri <[email protected]>
  • Loading branch information
56quarters authored Sep 25, 2024
1 parent 286c23c commit 992efbb
Show file tree
Hide file tree
Showing 16 changed files with 887 additions and 94 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
* `-query-scheduler.grpc-client-config.grpc-compression=s2`
* `-ruler.client.grpc-compression=s2`
* `-ruler.query-frontend.grpc-client-config.grpc-compression=s2`
* [FEATURE] Query-frontend: added experimental configuration options `query-frontend.cache-errors` and `query-frontend.results-cache-ttl-for-errors` to allow non-transient error responses to be cached. When set to `true`, error responses from hitting limits or bad data are cached for a short TTL. #9028
* [ENHANCEMENT] Compactor: Add `cortex_compactor_compaction_job_duration_seconds` and `cortex_compactor_compaction_job_blocks` histogram metrics to track duration of individual compaction jobs and number of blocks per job. #8371
* [ENHANCEMENT] Rules: Added per namespace max rules per rule group limit. The maximum number of rules per rule groups for all namespaces continues to be configured by `-ruler.max-rules-per-rule-group`, but now, this can be superseded by the new `-ruler.max-rules-per-rule-group-by-namespace` option on a per namespace basis. This new limit can be overridden using the overrides mechanism to be applied per-tenant. #8378
* [ENHANCEMENT] Rules: Added per namespace max rule groups per tenant limit. The maximum number of rule groups per rule tenant for all namespaces continues to be configured by `-ruler.max-rule-groups-per-tenant`, but now, this can be superseded by the new `-ruler.max-rule-groups-per-tenant-by-namespace` option on a per namespace basis. This new limit can be overridden using the overrides mechanism to be applied per-tenant. #8425
Expand Down
22 changes: 22 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -4232,6 +4232,17 @@
"fieldFlag": "query-frontend.results-cache-ttl-for-labels-query",
"fieldType": "duration"
},
{
"kind": "field",
"name": "results_cache_ttl_for_errors",
"required": false,
"desc": "Time to live duration for cached non-transient errors",
"fieldValue": null,
"fieldDefaultValue": 300000000000,
"fieldFlag": "query-frontend.results-cache-ttl-for-errors",
"fieldType": "duration",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "cache_unaligned_requests",
Expand Down Expand Up @@ -6302,6 +6313,17 @@
"fieldFlag": "query-frontend.cache-results",
"fieldType": "boolean"
},
{
"kind": "field",
"name": "cache_errors",
"required": false,
"desc": "Cache non-transient errors from queries.",
"fieldValue": null,
"fieldDefaultValue": false,
"fieldFlag": "query-frontend.cache-errors",
"fieldType": "boolean",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "max_retries",
Expand Down
4 changes: 4 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -2019,6 +2019,8 @@ Usage of ./cmd/mimir/mimir:
[experimental] Timeout for writing active series responses. 0 means the value from `-server.http-write-timeout` is used. (default 5m0s)
-query-frontend.align-queries-with-step
Mutate incoming queries to align their start and end with their step to improve result caching.
-query-frontend.cache-errors
[experimental] Cache non-transient errors from queries.
-query-frontend.cache-results
Cache query results.
-query-frontend.cache-unaligned-requests
Expand Down Expand Up @@ -2117,6 +2119,8 @@ Usage of ./cmd/mimir/mimir:
Time to live duration for cached query results. If query falls into out-of-order time window, -query-frontend.results-cache-ttl-for-out-of-order-time-window is used instead. (default 1w)
-query-frontend.results-cache-ttl-for-cardinality-query duration
Time to live duration for cached cardinality query results. The value 0 disables the cache.
-query-frontend.results-cache-ttl-for-errors duration
[experimental] Time to live duration for cached non-transient errors (default 5m)
-query-frontend.results-cache-ttl-for-labels-query duration
Time to live duration for cached label names and label values query results. The value 0 disables the cache.
-query-frontend.results-cache-ttl-for-out-of-order-time-window duration
Expand Down
1 change: 1 addition & 0 deletions docs/sources/mimir/configure/about-versioning.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ The following features are currently experimental:
- Query blocking on a per-tenant basis (configured with the limit `blocked_queries`)
- Sharding of active series queries (`-query-frontend.shard-active-series-queries`)
- Server-side write timeout for responses to active series requests (`-query-frontend.active-series-write-timeout`)
- Caching of non-transient error responses (`-query-frontend.cache-errors`, `-query-frontend.results-cache-ttl-for-errors`)
- Query-scheduler
- `-query-scheduler.querier-forget-delay`
- Store-gateway
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1644,6 +1644,10 @@ results_cache:
# CLI flag: -query-frontend.cache-results
[cache_results: <boolean> | default = false]

# (experimental) Cache non-transient errors from queries.
# CLI flag: -query-frontend.cache-errors
[cache_errors: <boolean> | default = false]

# (advanced) Maximum number of retries for a single request; beyond this, the
# downstream error is returned.
# CLI flag: -query-frontend.max-retries-per-request
Expand Down Expand Up @@ -3451,6 +3455,10 @@ The `limits` block configures default and per-tenant limits imposed by component
# CLI flag: -query-frontend.results-cache-ttl-for-labels-query
[results_cache_ttl_for_labels_query: <duration> | default = 0s]
# (experimental) Time to live duration for cached non-transient errors
# CLI flag: -query-frontend.results-cache-ttl-for-errors
[results_cache_ttl_for_errors: <duration> | default = 5m]
# (advanced) Cache requests that are not step-aligned.
# CLI flag: -query-frontend.cache-unaligned-requests
[cache_unaligned_requests: <boolean> | default = false]
Expand Down
184 changes: 184 additions & 0 deletions pkg/frontend/querymiddleware/error_caching.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
// SPDX-License-Identifier: AGPL-3.0-only

package querymiddleware

import (
"context"
"errors"
"fmt"
"time"

"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/gogo/protobuf/proto"
"github.com/grafana/dskit/cache"
"github.com/grafana/dskit/tenant"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

apierror "github.com/grafana/mimir/pkg/api/error"
"github.com/grafana/mimir/pkg/util/spanlogger"
"github.com/grafana/mimir/pkg/util/validation"
)

// Reasons recorded by the "cortex_frontend_query_error_cache_store_skipped_total"
// metric when an error response is not stored in the cache.
const (
	reasonNotAPIError       = "not-api-error"           // error did not unwrap to an *apierror.APIError
	reasonNotCacheableError = "not-cacheable-api-error" // API error type is not one of the cacheable types
)

// newErrorCachingMiddleware returns a middleware that stores non-transient error
// responses from queries in the results cache and serves them on subsequent
// identical requests, avoiding re-running queries that are certain to fail again.
func newErrorCachingMiddleware(cache cache.Cache, limits Limits, shouldCacheReq shouldCacheFn, logger log.Logger, reg prometheus.Registerer) MetricsQueryMiddleware {
	loadAttempts := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "cortex_frontend_query_error_cache_requests_total",
		Help: "Number of requests that check the results cache for an error.",
	})
	loadHits := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "cortex_frontend_query_error_cache_hits_total",
		Help: "Number of hits for the errors in the results cache.",
	})
	storeAttempts := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "cortex_frontend_query_error_cache_store_requests_total",
		Help: "Number of requests that resulted in an error.",
	})
	storeSkips := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Name: "cortex_frontend_query_error_cache_store_skipped_total",
		Help: "Number of requests that resulted in an error that was not stored in the results cache.",
	}, []string{"reason"})

	return MetricsQueryMiddlewareFunc(func(next MetricsQueryHandler) MetricsQueryHandler {
		return &errorCachingHandler{
			next:                next,
			cache:               cache,
			limits:              limits,
			shouldCacheReq:      shouldCacheReq,
			logger:              logger,
			cacheLoadAttempted:  loadAttempts,
			cacheLoadHits:       loadHits,
			cacheStoreAttempted: storeAttempts,
			cacheStoreSkipped:   storeSkips,
		}
	})
}

// errorCachingHandler is a MetricsQueryHandler that caches non-transient API
// errors returned by the next handler and serves them for repeated requests.
type errorCachingHandler struct {
	next           MetricsQueryHandler // downstream handler, run on cache miss
	cache          cache.Cache         // results cache used to load and store errors
	limits         Limits              // per-tenant limits; source of the error TTL
	shouldCacheReq shouldCacheFn       // per-request opt-out of caching
	logger         log.Logger

	cacheLoadAttempted  prometheus.Counter     // cache lookups performed
	cacheLoadHits       prometheus.Counter     // cache lookups that returned an error
	cacheStoreAttempted prometheus.Counter     // requests that resulted in an error
	cacheStoreSkipped   *prometheus.CounterVec // errors not stored, by reason
}

// Do serves a cached error for this request when one exists; otherwise it runs
// the query and, if the result is a cacheable (non-transient) API error,
// stores that error for subsequent requests.
func (e *errorCachingHandler) Do(ctx context.Context, request MetricsQueryRequest) (Response, error) {
	spanLog := spanlogger.FromContext(ctx, e.logger)

	tenantIDs, err := tenant.TenantIDs(ctx)
	if err != nil {
		// Without tenant information we cannot build a cache key, so just run the query.
		return e.next.Do(ctx, request)
	}

	// Caching may have been disabled via an option on the request.
	if !e.shouldCacheReq(request) {
		return e.next.Do(ctx, request)
	}

	e.cacheLoadAttempted.Inc()
	key := errorCachingKey(tenant.JoinTenantIDs(tenantIDs), request)
	hashedKey := cacheHashKey(key)

	if cachedErr := e.loadErrorFromCache(ctx, key, hashedKey, spanLog); cachedErr != nil {
		e.cacheLoadHits.Inc()
		spanLog.DebugLog(
			"msg", "returned cached API error",
			"error_type", cachedErr.Type,
			"key", key,
			"hashed_key", hashedKey,
		)
		return nil, cachedErr
	}

	res, err := e.next.Do(ctx, request)
	if err == nil {
		return res, nil
	}

	e.cacheStoreAttempted.Inc()

	// Only API errors carry the type information needed to decide cacheability.
	var apiErr *apierror.APIError
	if !errors.As(err, &apiErr) {
		e.cacheStoreSkipped.WithLabelValues(reasonNotAPIError).Inc()
		return res, err
	}

	cacheable, reason := e.isCacheable(apiErr)
	if !cacheable {
		e.cacheStoreSkipped.WithLabelValues(reason).Inc()
		spanLog.DebugLog(
			"msg", "error result from request is not cacheable",
			"error_type", apiErr.Type,
			"reason", reason,
		)
		return res, err
	}

	// Use the most restrictive TTL across all tenants in the request.
	ttl := validation.MinDurationPerTenant(tenantIDs, e.limits.ResultsCacheTTLForErrors)
	e.storeErrorToCache(key, hashedKey, ttl, apiErr, spanLog)
	return res, err
}

// loadErrorFromCache returns a previously cached API error for the given key,
// or nil when there is no cached entry, the entry cannot be decoded, or the
// entry was stored for a different key (hash collision on hashedKey).
func (e *errorCachingHandler) loadErrorFromCache(ctx context.Context, key, hashedKey string, spanLog *spanlogger.SpanLogger) *apierror.APIError {
	res := e.cache.GetMulti(ctx, []string{hashedKey})
	cached, ok := res[hashedKey]
	if !ok {
		return nil
	}

	var cachedError CachedError
	if err := proto.Unmarshal(cached, &cachedError); err != nil {
		// Fixed typo in log message: "unmarshall" -> "unmarshal".
		level.Warn(spanLog).Log("msg", "unable to unmarshal cached error", "err", err)
		return nil
	}

	// The stored entry records the full key it was written for, which must
	// match the key for this request; otherwise treat it as a cache miss.
	if cachedError.GetKey() != key {
		spanLog.DebugLog(
			"msg", "cached error key does not match",
			"expected_key", key,
			"actual_key", cachedError.GetKey(),
			"hashed_key", hashedKey,
		)
		return nil
	}

	return apierror.New(apierror.Type(cachedError.ErrorType), cachedError.ErrorMessage)
}

// storeErrorToCache asynchronously writes the given API error to the cache
// under hashedKey with the given TTL. Marshalling failures are logged and the
// error is simply not cached.
func (e *errorCachingHandler) storeErrorToCache(key, hashedKey string, ttl time.Duration, apiErr *apierror.APIError, spanLog *spanlogger.SpanLogger) {
	entry := &CachedError{
		Key:          key,
		ErrorType:    string(apiErr.Type),
		ErrorMessage: apiErr.Message,
	}

	bytes, err := proto.Marshal(entry)
	if err != nil {
		level.Warn(spanLog).Log("msg", "unable to marshal cached error", "err", err)
		return
	}

	e.cache.SetAsync(hashedKey, bytes, ttl)
}

// isCacheable returns true when the API error is of a type considered
// non-transient (bad data, execution errors, too-large entries) and hence safe
// to cache; otherwise it returns false and the reason for skipping.
func (e *errorCachingHandler) isCacheable(apiErr *apierror.APIError) (bool, string) {
	switch apiErr.Type {
	case apierror.TypeBadData, apierror.TypeExec, apierror.TypeTooLargeEntry:
		return true, ""
	default:
		return false, reasonNotCacheableError
	}
}

// errorCachingKey returns the key for caching an error query response.
// Standalone function to allow for easier testing.
func errorCachingKey(tenantID string, r MetricsQueryRequest) string {
	start, end, step := r.GetStart(), r.GetEnd(), r.GetStep()
	return fmt.Sprintf("EC:%s:%s:%d:%d:%d", tenantID, r.GetQuery(), start, end, step)
}
Loading

0 comments on commit 992efbb

Please sign in to comment.