-
Notifications
You must be signed in to change notification settings - Fork 450
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[xla:collectives] NFC: Remove unused NcclApi CommFinalize function
PiperOrigin-RevId: 702108779
- Loading branch information
1 parent
e9947dd
commit 503738a
Showing
38 changed files
with
424 additions
and
318 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
/* Copyright 2024 The OpenXLA Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
#include "xla/backends/gpu/collectives/gpu_clique.h" | ||
|
||
#include <cstdint> | ||
#include <memory> | ||
#include <optional> | ||
#include <string> | ||
#include <utility> | ||
|
||
#include "absl/container/btree_map.h" | ||
#include "absl/status/status.h" | ||
#include "absl/strings/str_cat.h" | ||
#include "absl/strings/str_format.h" | ||
#include "xla/backends/gpu/collectives/gpu_clique_key.h" | ||
#include "xla/core/collectives/clique.h" | ||
#include "xla/core/collectives/clique_id.h" | ||
#include "xla/core/collectives/communicator.h" | ||
#include "xla/core/collectives/rank_id.h" | ||
#include "tsl/platform/logging.h" | ||
|
||
namespace xla::gpu { | ||
|
||
// Creates a clique for `key` with an optional clique `id`. Ownership of | ||
// `communicators` is handed over to the `Clique` base class. | ||
GpuClique::GpuClique( | ||
    GpuCliqueKey key, std::optional<CliqueId> id, | ||
    absl::btree_map<RankId, std::unique_ptr<Communicator>> communicators) | ||
    // `key` and `id` are by-value sink parameters: move them into the members | ||
    // instead of copying them a second time. | ||
    : Clique(std::move(communicators)), | ||
      key_(std::move(key)), | ||
      id_(std::move(id)) {} | ||
|
||
std::string GpuClique::DebugString() const { | ||
std::string out = | ||
absl::StrFormat("key: %s; fingerprint(id): %d; size: %d; communicators: ", | ||
key_.ToString(), id_.has_value() ? id_->fingerprint() : 0, | ||
num_communicators()); | ||
int32_t cnt = 0; | ||
ForEachComm([&](RankId rank, Communicator* comm) { | ||
if (cnt++) absl::StrAppend(&out, ", "); | ||
absl::StrAppendFormat(&out, "[rank=%d, comm=%p]", rank.value(), comm); | ||
}); | ||
return out; | ||
} | ||
|
||
// Runs `HealthCheck` on every communicator in the clique. Every failure is | ||
// logged; the first failure encountered is the status that gets returned. | ||
// Returns OK when all communicators report OK. | ||
absl::Status GpuClique::HealthCheck() const { | ||
  absl::Status first_error = absl::OkStatus(); | ||
  ForEachComm([&first_error](RankId rank, Communicator* comm) { | ||
    auto status = comm->HealthCheck(); | ||
    if (status.ok()) return; | ||
    LOG(ERROR) << "GPU communicator error (rank " << rank << "): " << status; | ||
    // Only the first error is kept; later ones are logged above and dropped. | ||
    if (!first_error.ok()) return; | ||
    first_error = std::move(status); | ||
  }); | ||
  return first_error; | ||
} | ||
|
||
} // namespace xla::gpu |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
/* Copyright 2024 The OpenXLA Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
#ifndef XLA_BACKENDS_GPU_COLLECTIVES_GPU_CLIQUE_H_ | ||
#define XLA_BACKENDS_GPU_COLLECTIVES_GPU_CLIQUE_H_ | ||
|
||
#include <memory> | ||
#include <optional> | ||
#include <string> | ||
|
||
#include "absl/container/btree_map.h" | ||
#include "absl/status/status.h" | ||
#include "xla/backends/gpu/collectives/gpu_clique_key.h" | ||
#include "xla/core/collectives/clique.h" | ||
#include "xla/core/collectives/clique_id.h" | ||
#include "xla/core/collectives/communicator.h" | ||
#include "xla/core/collectives/rank_id.h" | ||
|
||
namespace xla::gpu { | ||
|
||
// A group of GPU communicators making up a clique for a given clique key. | ||
class GpuClique : public Clique { | ||
public: | ||
// Creates a clique for `key` with an optional clique `id`. Takes ownership | ||
// of `communicators`, which are keyed by rank. | ||
GpuClique( | ||
GpuCliqueKey key, std::optional<CliqueId> id, | ||
absl::btree_map<RankId, std::unique_ptr<Communicator>> communicators); | ||
|
||
// Returns true if clique is local: all communicators belong to current | ||
// process. Non-local cliques span multiple processes (typically hosts). | ||
// Determined by comparing the number of communicators held by this clique | ||
// with the number of devices in the clique key. | ||
bool IsLocal() const { return num_communicators() == key_.devices().size(); } | ||
|
||
// Accessors for the clique key and the (possibly absent) clique id that were | ||
// passed to the constructor. | ||
const GpuCliqueKey& key() const { return key_; } | ||
const std::optional<CliqueId>& id() const { return id_; } | ||
|
||
// Final overrides of the corresponding `Clique` virtual functions. | ||
std::string DebugString() const final; | ||
absl::Status HealthCheck() const final; | ||
|
||
private: | ||
GpuCliqueKey key_; | ||
std::optional<CliqueId> id_; | ||
}; | ||
|
||
} // namespace xla::gpu | ||
|
||
#endif // XLA_BACKENDS_GPU_COLLECTIVES_GPU_CLIQUE_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/* Copyright 2024 The OpenXLA Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
#include "xla/core/collectives/clique.h" | ||
|
||
#include <memory> | ||
#include <optional> | ||
#include <utility> | ||
|
||
#include "absl/container/btree_map.h" | ||
#include "absl/functional/function_ref.h" | ||
#include "xla/core/collectives/communicator.h" | ||
#include "xla/core/collectives/rank_id.h" | ||
|
||
namespace xla { | ||
|
||
// Takes ownership of `communicators` by moving the map into the clique. | ||
Clique::Clique( | ||
absl::btree_map<RankId, std::unique_ptr<Communicator>> communicators) | ||
: communicators_(std::move(communicators)) {} | ||
|
||
// Looks up the communicator registered for `rank`. Returns a non-owning | ||
// pointer to it, or `std::nullopt` when the clique has no entry for `rank`. | ||
std::optional<Communicator*> Clique::comm(RankId rank) const { | ||
  const auto entry = communicators_.find(rank); | ||
  if (entry == communicators_.end()) return std::nullopt; | ||
  return entry->second.get(); | ||
} | ||
|
||
// Invokes `fn` once per communicator, passing its rank and a non-owning | ||
// pointer, in the iteration order of the underlying map. | ||
void Clique::ForEachComm( | ||
    absl::FunctionRef<void(RankId, Communicator*)> fn) const { | ||
  for (auto it = communicators_.begin(); it != communicators_.end(); ++it) { | ||
    fn(it->first, it->second.get()); | ||
  } | ||
} | ||
|
||
} // namespace xla |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* Copyright 2024 The OpenXLA Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
==============================================================================*/ | ||
|
||
#ifndef XLA_CORE_COLLECTIVES_CLIQUE_H_ | ||
#define XLA_CORE_COLLECTIVES_CLIQUE_H_ | ||
|
||
// A group of NCCL communicators making up a clique. With NCCL it's notoriously | ||
// easy to get a deadlock, so we take extra care by grouping communicators into | ||
// cliques and making sure that we have a well defined order of all collective | ||
// operations that does not lead to deadlocks. | ||
|
||
#include <cstddef> | ||
#include <memory> | ||
#include <optional> | ||
#include <string> | ||
|
||
#include "absl/container/btree_map.h" | ||
#include "absl/functional/function_ref.h" | ||
#include "absl/status/status.h" | ||
#include "xla/core/collectives/communicator.h" | ||
#include "xla/core/collectives/rank_id.h" | ||
|
||
namespace xla { | ||
|
||
// A group of collective communicators that make up a clique. | ||
// | ||
// We use clique mechanism to group communicators to be able to efficiently | ||
// get exclusive access to all communicators in a clique, as we typically have | ||
// to guarantee that collective operations on all ranks are executed in the | ||
// same order across all devices. | ||
class Clique { | ||
public: | ||
// Creates a clique that takes ownership of `communicators`, keyed by rank. | ||
explicit Clique( | ||
absl::btree_map<RankId, std::unique_ptr<Communicator>> communicators); | ||
// Virtual so that derived cliques can be destroyed through a `Clique`. | ||
virtual ~Clique() = default; | ||
|
||
// Returns a communicator for a given rank if it's in a clique. | ||
// The result is `std::nullopt` when no communicator is registered for | ||
// `rank`; the returned pointer is non-owning. | ||
std::optional<Communicator*> comm(RankId rank) const; | ||
|
||
// Calls `fn` for each communicator in the clique. | ||
// Communicators are visited in the sorted-by-rank order of the underlying | ||
// map. | ||
void ForEachComm(absl::FunctionRef<void(RankId, Communicator*)> fn) const; | ||
|
||
// Checks that all communicators in the clique are in a healthy state. | ||
virtual absl::Status HealthCheck() const = 0; | ||
|
||
// Returns a human-readable string representation of the clique. | ||
virtual std::string DebugString() const = 0; | ||
|
||
// Returns the number of communicators held by this clique. | ||
size_t num_communicators() const { return communicators_.size(); } | ||
|
||
private: | ||
// We keep communicators in a sorted order by rank to guarantee deterministic | ||
// traversal order in `ForEachComm`. | ||
absl::btree_map<RankId, std::unique_ptr<Communicator>> communicators_; | ||
}; | ||
|
||
} // namespace xla | ||
|
||
#endif // XLA_CORE_COLLECTIVES_CLIQUE_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.