From d24e9ab0bbcb4add713dd71b5b3dc6a62897ab2b Mon Sep 17 00:00:00 2001 From: Myron Marston Date: Mon, 2 Dec 2024 17:07:31 -0800 Subject: [PATCH] Handle `too_many_buckets_exception` more gracefully. Previously, we returned a `500 Internal Server Error` which was not a great experience when it would consistently happen for a particular query. Now we return a nicely formatted error telling the client to reduce the grouping cardinality instead. --- .../graphql/datastore_search_router.rb | 17 +++++++++++ .../graphql/datastore_search_router_spec.rb | 28 ++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/elasticgraph-graphql/lib/elastic_graph/graphql/datastore_search_router.rb b/elasticgraph-graphql/lib/elastic_graph/graphql/datastore_search_router.rb index 8d36b7ff..6db1358c 100644 --- a/elasticgraph-graphql/lib/elastic_graph/graphql/datastore_search_router.rb +++ b/elasticgraph-graphql/lib/elastic_graph/graphql/datastore_search_router.rb @@ -118,6 +118,8 @@ def raise_search_failed_if_any_failures(responses_by_query) return if failures.empty? formatted_failures = failures.map do |(query, response), index| + raise_execution_exception_for_known_public_error(response) + # Note: we intentionally omit the body of the request here, because it could contain PII # or other sensitive values that we don't want logged. <<~ERROR @@ -130,6 +132,21 @@ def raise_search_failed_if_any_failures(responses_by_query) raise Errors::SearchFailedError, "Got #{failures.size} search failure(s):\n\n#{formatted_failures}" end + # In general, when we receive a datastore response indicating a search failed, we raise + # `Errors::SearchFailedError` which translates into a `500 Internal Server Error`. That's + # appropriate for transient errors (particularly when there's nothing the client can do about + # it) but for non-transient errors that the client can do something about, we'd like to provide + # a friendlier error. This method handles those cases. + # + # GraphQL::ExecutionError is automatically translated into a nice error response. + def raise_execution_exception_for_known_public_error(response) + if response.dig("error", "caused_by", "type") == "too_many_buckets_exception" + max_buckets = response.dig("error", "caused_by", "max_buckets") + raise ::GraphQL::ExecutionError, "Aggregation query produces too many groupings. " \ + "Reduce the grouping cardinality to less than #{max_buckets} and try again." + end + end + # Examine successful query responses and log any shard failure they encounter def log_shard_failure_if_necessary(responses_by_query) shard_failures = responses_by_query.each_with_index.select do |(query, response), query_numeric_index| diff --git a/elasticgraph-graphql/spec/unit/elastic_graph/graphql/datastore_search_router_spec.rb b/elasticgraph-graphql/spec/unit/elastic_graph/graphql/datastore_search_router_spec.rb index 490358e8..f6a5b217 100644 --- a/elasticgraph-graphql/spec/unit/elastic_graph/graphql/datastore_search_router_spec.rb +++ b/elasticgraph-graphql/spec/unit/elastic_graph/graphql/datastore_search_router_spec.rb @@ -143,7 +143,7 @@ class GraphQL ] end - it "raises `Errors::SearchFailedError` if a search fails for any reason" do + it "raises `Errors::SearchFailedError` if a search fails for a transient reason" do allow(main_datastore_client).to receive(:msearch).and_return("took" => 10, "responses" => [ empty_response, {"took" => 5, "error" => {"bad stuff" => "happened"}, "status" => 400} @@ -159,6 +159,32 @@ class GraphQL ))) end + it "raises `::GraphQL::ExecutionError` if a search fails for a non-transient `too_many_buckets_exception` reason" do + allow(main_datastore_client).to receive(:msearch).and_return("took" => 10, "responses" => [ + empty_response, + { + "status" => 400, + "error" => { + "root_cause" => [], + "type" => "search_phase_execution_exception", + "reason" => "", + "grouped" => true, + "caused_by" => { + "type" => "too_many_buckets_exception", + "reason" => "Trying to create too many buckets. Must be less than or equal to: [65535] but was [2056004]. This limit can be set by changing the [search.max_buckets] cluster level setting.", + "max_buckets" => 65535 + } + } + } + ]) + + expect { + router.msearch([query1, query2]) + }.to raise_error ::GraphQL::ExecutionError, a_string_including( + "Aggregation query produces too many groupings. Reduce the grouping cardinality to less than 65535 and try again." + ) + end + it "logs warning if a query has failed shards" do shard_failure_bits = { "_shards" => {