Skip to content

Commit

Permalink
Handle too_many_buckets_exception more gracefully.
Browse files Browse the repository at this point in the history
Previously, we returned a `500 Internal Server Error` which was not
a great experience when it would consistently happen for a particular
query. Now we return a nicely formatted error telling the client to
reduce the grouping cardinality instead.
  • Loading branch information
myronmarston committed Dec 3, 2024
1 parent 8f0b162 commit d24e9ab
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ def raise_search_failed_if_any_failures(responses_by_query)
return if failures.empty?

formatted_failures = failures.map do |(query, response), index|
raise_execution_exception_for_known_public_error(response)

# Note: we intentionally omit the body of the request here, because it could contain PII
# or other sensitive values that we don't want logged.
<<~ERROR
Expand All @@ -130,6 +132,21 @@ def raise_search_failed_if_any_failures(responses_by_query)
raise Errors::SearchFailedError, "Got #{failures.size} search failure(s):\n\n#{formatted_failures}"
end

# In general, when we receive a datastore response indicating a search failed, we raise
# `Errors::SearchFailedError` which translates into a `500 Internal Server Error`. That's
# appropriate for transient errors (particularly when there's nothing the client can do about
# it) but for non-transient errors that the client can do something about, we'd like to provide
# a friendlier error. This method handles those cases.
#
# GraphQL::ExecutionError is automatically translated into a nice error response.
def raise_execution_exception_for_known_public_error(response)
if response.dig("error", "caused_by", "type") == "too_many_buckets_exception"
max_buckets = response.dig("error", "caused_by", "max_buckets")
raise ::GraphQL::ExecutionError, "Aggregation query produces too many groupings. " \
"Reduce the grouping cardinality to less than #{max_buckets} and try again."
end
end

# Examine successful query responses and log any shard failure they encounter
def log_shard_failure_if_necessary(responses_by_query)
shard_failures = responses_by_query.each_with_index.select do |(query, response), query_numeric_index|
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ class GraphQL
]
end

it "raises `Errors::SearchFailedError` if a search fails for any reason" do
it "raises `Errors::SearchFailedError` if a search fails for a transient reason" do
allow(main_datastore_client).to receive(:msearch).and_return("took" => 10, "responses" => [
empty_response,
{"took" => 5, "error" => {"bad stuff" => "happened"}, "status" => 400}
Expand All @@ -159,6 +159,32 @@ class GraphQL
)))
end

it "raises `::GraphQL::ExecutionError` if a search fails for a non-transient `too_many_buckets_exception` reason" do
allow(main_datastore_client).to receive(:msearch).and_return("took" => 10, "responses" => [
empty_response,
{
"status" => 400,
"error" => {
"root_cause" => [],
"type" => "search_phase_execution_exception",
"reason" => "",
"grouped" => true,
"caused_by" => {
"type" => "too_many_buckets_exception",
"reason" => "Trying to create too many buckets. Must be less than or equal to: [65535] but was [2056004]. This limit can be set by changing the [search.max_buckets] cluster level setting.",
"max_buckets" => 65535
}
}
}
])

expect {
router.msearch([query1, query2])
}.to raise_error ::GraphQL::ExecutionError, a_string_including(
"Aggregation query produces too many groupings. Reduce the grouping cardinality to less than 65535 and try again."
)
end

it "logs warning if a query has failed shards" do
shard_failure_bits = {
"_shards" => {
Expand Down

0 comments on commit d24e9ab

Please sign in to comment.