Skip to content

Commit

Permalink
[xla:collectives] NFC: Remove unused NcclApi CommFinalize function
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 702108779
  • Loading branch information
ezhulenev authored and Google-ML-Automation committed Dec 4, 2024
1 parent 0681280 commit 5342f6b
Show file tree
Hide file tree
Showing 7 changed files with 16 additions and 35 deletions.
5 changes: 5 additions & 0 deletions xla/backends/gpu/collectives/nccl_communicator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ NcclCommunicator::~NcclCommunicator() {
XLA_NCCL_LOG_IF_ERROR(ncclCommDestroy(comm_));
}

// Aborts any uncompleted operations on this communicator and destroys the
// underlying NCCL communicator object. Per the Communicator::Abort contract,
// using the communicator after this call is undefined behavior.
absl::Status NcclCommunicator::Abort() {
VLOG(1) << "Abort NCCL communicator: " << ToString();
// XLA_NCCL_STATUS presumably converts the ncclResult_t from ncclCommAbort
// into an absl::Status — macro definition not visible here; confirm in the
// corresponding header.
return XLA_NCCL_STATUS(ncclCommAbort(comm_));
}

absl::Status NcclCommunicator::HealthCheck() const {
VLOG(5) << "Get last async error for NCCL communicator: " << ToString();

Expand Down
1 change: 1 addition & 0 deletions xla/backends/gpu/collectives/nccl_communicator.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class NcclCommunicator : public Communicator {
explicit NcclCommunicator(ncclComm_t comm);
~NcclCommunicator() override;

absl::Status Abort() final;
absl::Status HealthCheck() const final;

std::string ToString() const final;
Expand Down
5 changes: 5 additions & 0 deletions xla/core/collectives/communicator.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ class Communicator {
public:
virtual ~Communicator() = default;

// Abort any uncompleted operations and destroys the underlying communicator
// object. It is undefined behavior to use the communicator after calling
// this method.
virtual absl::Status Abort() = 0;

// Checks the health of the communicator. It might return an error from the
// previously launched asynchronous collective operations, and it does not
// have to wait for the completion of scheduled operations.
Expand Down
13 changes: 0 additions & 13 deletions xla/service/gpu/runtime/nccl_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -302,9 +302,6 @@ class DefaultNcclApi final : public NcclApi {
absl::Span<const Communicator* const> comms, int32_t color,
absl::Span<const RankId> keys, std::optional<Config> config) final;

absl::Status CommAbort(Communicator* comm) final;
absl::Status CommFinalize(Communicator* comm) final;

absl::StatusOr<int32_t> CommCount(Communicator* comm) final;

absl::Status GroupStart() final;
Expand Down Expand Up @@ -481,16 +478,6 @@ DefaultNcclApi::CommSplit(absl::Span<const Communicator* const> comms,
#endif // !defined(TENSORFLOW_USE_ROCM) || TF_ROCM_VERSION >= 60000
}

// Aborts any uncompleted operations and destroys the communicator `comm`,
// freeing the resources allocated to it.
// https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommabort
// (Removed by this commit in favor of the Communicator::Abort virtual method.)
absl::Status DefaultNcclApi::CommAbort(Communicator* comm) {
VLOG(1) << "Abort NCCL communicator: " << comm;
// Cast() appears to unwrap the abstract Communicator* into the underlying
// ncclComm_t handle — assumed from usage; definition not visible here.
return XLA_NCCL_STATUS(ncclCommAbort(Cast(comm)));
}

// Finalizes the communicator `comm`, flushing outstanding operations before
// destruction.
// https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommfinalize
// (Removed by this commit: the function was unused — see commit title.)
absl::Status DefaultNcclApi::CommFinalize(Communicator* comm) {
VLOG(1) << "Finalize NCCL communicator: " << comm;
// Cast() appears to unwrap the abstract Communicator* into the underlying
// ncclComm_t handle — assumed from usage; definition not visible here.
return XLA_NCCL_STATUS(ncclCommFinalize(Cast(comm)));
}

absl::StatusOr<int32_t> DefaultNcclApi::CommCount(Communicator* comm) {
VLOG(5) << "Get the number of ranks in NCCL communicator: " << comm;
int32_t count;
Expand Down
11 changes: 0 additions & 11 deletions xla/service/gpu/runtime/nccl_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,17 +161,6 @@ class NcclApi : public GpuCollectives {
absl::Span<const Communicator* const> comms, int32_t color,
absl::Span<const RankId> keys, std::optional<Config> config) = 0;

// Abort any uncompleted operations and destroys the communicator. Frees
// resources that are allocated to a communicator object comm.
//
// https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommabort
virtual absl::Status CommAbort(Communicator* comm) = 0;

// Finalize a communicator object comm.
//
// https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommdestroy
virtual absl::Status CommFinalize(Communicator* comm) = 0;

// Returns the number of ranks in the NCCL communicator comm.
//
// https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommcount
Expand Down
6 changes: 0 additions & 6 deletions xla/service/gpu/runtime/nccl_api_stub.cc
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,6 @@ class NcclApiStub final : public NcclApi {
return UnimplementedError();
}

// Stub overrides used when XLA is built without NCCL support: every
// operation fails with an Unimplemented error (see UnimplementedError()
// defined elsewhere in nccl_api_stub.cc). The CommAbort/CommFinalize stubs
// are the ones removed by this commit.
absl::Status CommAbort(Communicator*) final { return UnimplementedError(); }

absl::Status CommFinalize(Communicator*) final {
return UnimplementedError();
}

absl::StatusOr<int32_t> CommCount(Communicator*) final {
return UnimplementedError();
}
Expand Down
10 changes: 5 additions & 5 deletions xla/service/gpu/runtime/nccl_clique.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,13 @@ static NcclCliques& GetNcclCliques() {
// error state. It will free resources that are allocated to a communicator
// and abort any uncompleted operations before destroying the communicator.
// Health-checks `comm`; if an (async) error is found, logs it, aborts the
// communicator, and returns the error status.
//
// NOTE(review): the lines below are a rendered before/after diff without
// +/- markers — the `async_err` / `NcclApi::Default()->CommAbort` lines are
// the removed version and the `health` / `comm->Abort()` lines are the added
// version. They are not meant to coexist in one compilable function.
static absl::Status CheckComm(Communicator* comm) {
absl::Status async_err = comm->HealthCheck();
if (!async_err.ok()) {
absl::Status health = comm->HealthCheck();
if (!health.ok()) {
LOG(ERROR) << "Aborting communicator: " << comm
<< " due to async NCCL error: " << async_err;
TF_RETURN_IF_ERROR(NcclApi::Default()->CommAbort(comm));
<< " due to error: " << health;
TF_RETURN_IF_ERROR(comm->Abort());
}
return async_err;
return health;
}

// Runs async check on all communicators in a clique.
Expand Down

0 comments on commit 5342f6b

Please sign in to comment.