From afadfa0175221b0bb57af6f16fa2bb8210be7902 Mon Sep 17 00:00:00 2001 From: fduwjj Date: Thu, 28 Dec 2023 16:36:24 -0800 Subject: [PATCH] [c10d] Add stream info during nccl comm abort call (#116076) Pull Request resolved: https://github.com/pytorch/pytorch/pull/116076 Approved by: https://github.com/XilunWu --- torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index ef637d3a8a4367..9f37672e878191 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -1109,8 +1109,17 @@ void ProcessGroupNCCL::abortCommsFromMap( // their responsibility to destroy the process group and recreate // it to recover from errors. + c10::StreamId streamId = -1; + if (ncclStreams_.find(devName) != ncclStreams_.end()) { + auto streams = ncclStreams_.at(devName); + if (streams.size() > 0) { + streamId = streams[0].id(); + } + } + LOG(INFO) << logPrefix() << "] Destroyed " << ncclComms.size() - << "communicators on CUDA device " << devName; + << "communicators on CUDA device: " << devName + << " with stream: " << streamId; } }