From e3aa5e2f6410727efead4970a4ec30569cf76881 Mon Sep 17 00:00:00 2001 From: eqy Date: Tue, 17 Sep 2024 18:50:09 +0000 Subject: [PATCH] [NCCL] Don't override `waitUntilInitialized`'s setting of `comm->initialized_` (#136155) #133630 sets `initialized_` to `true`, which causes previous wait codepaths to skip necessary waits; see also https://github.com/pytorch/pytorch/issues/136151 CC @shuqiangzhang @wconstab Pull Request resolved: https://github.com/pytorch/pytorch/pull/136155 Approved by: https://github.com/fduwjj, https://github.com/kwen2501, https://github.com/c-p-i-o, https://github.com/shuqiangzhang --- torch/csrc/distributed/c10d/NCCLUtils.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index b11728e0ba8be7..47ace12db6c3fc 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -84,7 +84,9 @@ std::shared_ptr NCCLComm::split( std::nullopt); ++source->ncclCommSplitCounter_; comm->rank_ = rank; - comm->initialized_ = true; + if (!nccl_use_nonblocking()) { + comm->initialized_ = true; + } return comm; } #endif