[XLA:GPU] Use absl::Status payload to more precisely identify register allocation errors.

The logic introduced in cl/580967289 is too generic: resource-exhausted errors are not necessarily register allocation errors (e.g. OOM). PiperOrigin-RevId: 704801290
openxla · Dec 11, 2024 · f355136 · f355136
1 parent 8912748
commit f355136
Show file tree

Hide file tree

Showing 8 changed files with 40 additions and 14 deletions.
diff --git a/xla/service/gpu/autotuning/BUILD b/xla/service/gpu/autotuning/BUILD
@@ -369,6 +369,7 @@ cc_library(
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:stream",
+        "//xla/stream_executor/cuda:ptx_compiler_helpers",
         "//xla/stream_executor/gpu:redzone_allocator",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log:check",

diff --git a/xla/service/gpu/autotuning/autotuner_compile_util.cc b/xla/service/gpu/autotuning/autotuner_compile_util.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "xla/service/service_executable_run_options.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/stream_executor/cuda/ptx_compiler_helpers.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/gpu/redzone_allocator.h"
 #include "xla/stream_executor/stream.h"
@@ -112,17 +113,14 @@ AutotunerCompileUtil::ProfileExecutable(
     // so GPU caches should be in some comparable states during measurements.
     absl::StatusOr<ExecutionOutput> execution_output =
         Execute(*executable, std::move(execution_inputs));
-    if (!execution_output.ok()) {
-      // Treat register allocation error gracefully. If the compilation happens
-      // with the driver during execution then the error could surface here.
-      // It's enough to check this once here.
-      if (execution_output.status().code() ==
-          absl::StatusCode::kResourceExhausted) {
-        return {std::nullopt};
-      }
-      return execution_output.status();
+    // Treat register allocation error gracefully. If the compilation happens
+    // with the driver during execution then the error could surface here.
+    // It's enough to check this once here.
+    if (stream_executor::IsPtxRegisterAllocationError(
+            execution_output.status())) {
+      return std::nullopt;
     }
-
+    TF_RETURN_IF_ERROR(execution_output.status());
     TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
   }
   std::vector<ExecutionInput> execution_inputs =

diff --git a/xla/stream_executor/cuda/BUILD b/xla/stream_executor/cuda/BUILD
@@ -692,7 +692,6 @@ cc_library(
     deps = [
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:semantic_version",
-        "@com_google_absl//absl/base",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",

diff --git a/xla/stream_executor/cuda/ptx_compiler_helpers.cc b/xla/stream_executor/cuda/ptx_compiler_helpers.cc
@@ -21,8 +21,24 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
 
 namespace stream_executor {
+namespace {
+
+static constexpr absl::string_view kPtxasErrorPayloadKey = "ptxas_log";
+
+}  // namespace
+
+absl::Status PtxRegisterAllocationError(std::string_view message) {
+  absl::Status status = absl::ResourceExhaustedError(message);
+  status.SetPayload(kPtxasErrorPayloadKey, absl::Cord());
+  return status;
+}
+
+bool IsPtxRegisterAllocationError(absl::Status status) {
+  return status.GetPayload(kPtxasErrorPayloadKey).has_value();
+}
 
 bool IsPtxRegisterAllocationError(std::string_view str) {
   return absl::StrContains(str, "ptxas fatal") &&
@@ -43,7 +59,7 @@ absl::Status CreateErrorFromPTXASLog(std::string_view log,
         "Loaded PTX assembler is too old for %s.", architecture));
   }
   if (IsPtxRegisterAllocationError(log)) {
-    return absl::ResourceExhaustedError(log);
+    return PtxRegisterAllocationError(log);
   }
   if (absl::StrContains(log, "warning")) {
     LOG(INFO) << log;

diff --git a/xla/stream_executor/cuda/ptx_compiler_helpers.h b/xla/stream_executor/cuda/ptx_compiler_helpers.h
@@ -21,9 +21,16 @@ limitations under the License.
 #include "xla/stream_executor/semantic_version.h"
 
 namespace stream_executor {
+
+// Creates a status with a payload indicating a register allocation error.
+absl::Status PtxRegisterAllocationError(std::string_view message);
+
 // Checks whether ptxas log contains errors related to register allocation.
 bool IsPtxRegisterAllocationError(std::string_view);
 
+// Checks whether the status is a register allocation error.
+bool IsPtxRegisterAllocationError(absl::Status status);
+
 // Identifies errors in the ptxas log and creates an error status.
 // `architecture` is the name of the GPU architecture, e.g. "sm_80" and is only
 // used for error message generation. If `cancel_if_reg_spill` is true, then a

diff --git a/xla/stream_executor/cuda/ptx_compiler_helpers_test.cc b/xla/stream_executor/cuda/ptx_compiler_helpers_test.cc
@@ -102,5 +102,10 @@ TEST(PtxCompilerHelpersTest,
               IsOk());
 }
 
+TEST(PtxCompilerHelpersTest, IsPtxRegisterAllocationErrorStatus) {
+  EXPECT_TRUE(IsPtxRegisterAllocationError(
+      PtxRegisterAllocationError("Register allocation failed")));
+}
+
 }  // namespace
 }  // namespace stream_executor
diff --git a/xla/stream_executor/cuda/ptx_compiler_impl.cc b/xla/stream_executor/cuda/ptx_compiler_impl.cc
@@ -141,7 +141,7 @@ absl::StatusOr<std::vector<uint8_t>> CompileGpuAsmUsingLibNvPtxCompiler(
           "Linked libnvptxcompiler is too old for %s.", architecture));
     }
     if (IsPtxRegisterAllocationError(error_log)) {
-      return absl::ResourceExhaustedError(error_log);
+      return PtxRegisterAllocationError(error_log);
     }
 
     return absl::InternalError(

diff --git a/xla/stream_executor/cuda/subprocess_compilation.cc b/xla/stream_executor/cuda/subprocess_compilation.cc
@@ -334,7 +334,7 @@ absl::StatusOr<std::vector<uint8_t>> CompileGpuAsmUsingPtxAs(
     }
     if (IsPtxRegisterAllocationError(stderr_output)) {
       LOG(INFO) << stderr_output;
-      return absl::ResourceExhaustedError(stderr_output);
+      return PtxRegisterAllocationError(stderr_output);
     }
 
     return absl::InternalError(