Skip to content

Commit

Permalink
[XLA:GPU] Use absl::Status payload to more precisely identify register allocation errors.
Browse files Browse the repository at this point in the history

The logic introduced in cl/580967289 is too generic. Resource exhausted errors are not necessarily register allocation errors (e.g. OOM).

PiperOrigin-RevId: 704801290
  • Loading branch information
allanrenucci authored and Google-ML-Automation committed Dec 11, 2024
1 parent 8912748 commit f355136
Show file tree
Hide file tree
Showing 8 changed files with 40 additions and 14 deletions.
1 change: 1 addition & 0 deletions xla/service/gpu/autotuning/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,7 @@ cc_library(
"//xla/stream_executor:device_memory",
"//xla/stream_executor:device_memory_allocator",
"//xla/stream_executor:stream",
"//xla/stream_executor/cuda:ptx_compiler_helpers",
"//xla/stream_executor/gpu:redzone_allocator",
"@com_google_absl//absl/functional:any_invocable",
"@com_google_absl//absl/log:check",
Expand Down
18 changes: 8 additions & 10 deletions xla/service/gpu/autotuning/autotuner_compile_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ limitations under the License.
#include "xla/service/service_executable_run_options.h"
#include "xla/shape.h"
#include "xla/shape_util.h"
#include "xla/stream_executor/cuda/ptx_compiler_helpers.h"
#include "xla/stream_executor/device_memory.h"
#include "xla/stream_executor/gpu/redzone_allocator.h"
#include "xla/stream_executor/stream.h"
Expand Down Expand Up @@ -112,17 +113,14 @@ AutotunerCompileUtil::ProfileExecutable(
// so GPU caches should be in some comparable states during measurements.
absl::StatusOr<ExecutionOutput> execution_output =
Execute(*executable, std::move(execution_inputs));
if (!execution_output.ok()) {
// Treat register allocation error gracefully. If the compilation happens
// with the driver during execution then the error could surface here.
// It's enough to check this once here.
if (execution_output.status().code() ==
absl::StatusCode::kResourceExhausted) {
return {std::nullopt};
}
return execution_output.status();
// Treat register allocation error gracefully. If the compilation happens
// with the driver during execution then the error could surface here.
// It's enough to check this once here.
if (stream_executor::IsPtxRegisterAllocationError(
execution_output.status())) {
return std::nullopt;
}

TF_RETURN_IF_ERROR(execution_output.status());
TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
}
std::vector<ExecutionInput> execution_inputs =
Expand Down
1 change: 0 additions & 1 deletion xla/stream_executor/cuda/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -692,7 +692,6 @@ cc_library(
deps = [
"//xla/stream_executor:device_description",
"//xla/stream_executor:semantic_version",
"@com_google_absl//absl/base",
"@com_google_absl//absl/log",
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings",
Expand Down
18 changes: 17 additions & 1 deletion xla/stream_executor/cuda/ptx_compiler_helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,24 @@ limitations under the License.
#include "absl/status/status.h"
#include "absl/strings/match.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"

namespace stream_executor {
namespace {

// Key under which the register-allocation marker payload is attached to (and
// looked up on) an absl::Status in this file.
constexpr absl::string_view kPtxasErrorPayloadKey = "ptxas_log";

}  // namespace

// Builds a resource-exhausted status carrying `message` and tags it with the
// ptxas payload key so it can later be told apart from other
// resource-exhausted errors.
absl::Status PtxRegisterAllocationError(std::string_view message) {
  absl::Status error = absl::ResourceExhaustedError(message);
  // The payload content is intentionally empty: only the presence of the key
  // is checked by IsPtxRegisterAllocationError(absl::Status).
  error.SetPayload(kPtxasErrorPayloadKey, absl::Cord{});
  return error;
}

// Returns true iff `status` carries the payload stored under
// kPtxasErrorPayloadKey. The status code and message are not consulted.
// NOTE(review): `status` is taken by value, so every call copies it; switching
// to `const absl::Status&` would require updating the header declaration too.
bool IsPtxRegisterAllocationError(absl::Status status) {
  const auto marker = status.GetPayload(kPtxasErrorPayloadKey);
  return marker.has_value();
}

bool IsPtxRegisterAllocationError(std::string_view str) {
return absl::StrContains(str, "ptxas fatal") &&
Expand All @@ -43,7 +59,7 @@ absl::Status CreateErrorFromPTXASLog(std::string_view log,
"Loaded PTX assembler is too old for %s.", architecture));
}
if (IsPtxRegisterAllocationError(log)) {
return absl::ResourceExhaustedError(log);
return PtxRegisterAllocationError(log);
}
if (absl::StrContains(log, "warning")) {
LOG(INFO) << log;
Expand Down
7 changes: 7 additions & 0 deletions xla/stream_executor/cuda/ptx_compiler_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,16 @@ limitations under the License.
#include "xla/stream_executor/semantic_version.h"

namespace stream_executor {

// Creates a status with a payload indicating a register allocation error.
// The returned status has the resource-exhausted code, carries `message`, and
// is recognized by IsPtxRegisterAllocationError(absl::Status).
absl::Status PtxRegisterAllocationError(std::string_view message);

// Checks whether ptxas log contains errors related to register allocation.
// The argument is the raw ptxas log text.
bool IsPtxRegisterAllocationError(std::string_view);

// Checks whether the status is a register allocation error, i.e. whether it
// carries the payload attached by PtxRegisterAllocationError(). The status
// code alone is not inspected, so other resource-exhausted errors (e.g. OOM)
// are not matched.
bool IsPtxRegisterAllocationError(absl::Status status);

// Identifies errors in the ptxas log and creates an error status.
// `architecture` is the name of the GPU architecture, e.g. "sm_80" and is only
// used for error message generation. If `cancel_if_reg_spill` is true, then a
Expand Down
5 changes: 5 additions & 0 deletions xla/stream_executor/cuda/ptx_compiler_helpers_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,5 +102,10 @@ TEST(PtxCompilerHelpersTest,
IsOk());
}

// Verifies that the status-based predicate matches only statuses created by
// PtxRegisterAllocationError(), and not other statuses that merely share the
// resource-exhausted code (e.g. out-of-memory) or are OK.
TEST(PtxCompilerHelpersTest, IsPtxRegisterAllocationErrorStatus) {
  // Positive case: a status tagged by the dedicated factory is recognized.
  EXPECT_TRUE(IsPtxRegisterAllocationError(
      PtxRegisterAllocationError("Register allocation failed")));

  // Negative cases: a plain resource-exhausted error carries no payload and
  // must not be misclassified as a register allocation error.
  EXPECT_FALSE(
      IsPtxRegisterAllocationError(absl::ResourceExhaustedError("OOM")));
  EXPECT_FALSE(IsPtxRegisterAllocationError(absl::OkStatus()));
}

} // namespace
} // namespace stream_executor
2 changes: 1 addition & 1 deletion xla/stream_executor/cuda/ptx_compiler_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ absl::StatusOr<std::vector<uint8_t>> CompileGpuAsmUsingLibNvPtxCompiler(
"Linked libnvptxcompiler is too old for %s.", architecture));
}
if (IsPtxRegisterAllocationError(error_log)) {
return absl::ResourceExhaustedError(error_log);
return PtxRegisterAllocationError(error_log);
}

return absl::InternalError(
Expand Down
2 changes: 1 addition & 1 deletion xla/stream_executor/cuda/subprocess_compilation.cc
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ absl::StatusOr<std::vector<uint8_t>> CompileGpuAsmUsingPtxAs(
}
if (IsPtxRegisterAllocationError(stderr_output)) {
LOG(INFO) << stderr_output;
return absl::ResourceExhaustedError(stderr_output);
return PtxRegisterAllocationError(stderr_output);
}

return absl::InternalError(
Expand Down

0 comments on commit f355136

Please sign in to comment.