[ddp] in-place AllReduce of Grad Buckets (PR#2333) (#91)

Lightning-AI · Mar 27, 2024 · 94c9494 · 94c9494
1 parent 716fffd
commit 94c9494
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 2 deletions.
diff --git a/thunder/distributed/bucketing.py b/thunder/distributed/bucketing.py
@@ -140,7 +140,7 @@ def _maybe_allreduce(self, bucket: Bucket, group: ProcessGroup) -> None:
                 dist_prims.DistributedReduceOps.SUM,
                 group=group,
                 do_async=True,
-                skip_clone=False,
+                skip_clone=True,
             )
 
     def tell(self, grad: TensorProxy, group: ProcessGroup) -> None:

diff --git a/thunder/executors/torchex.py b/thunder/executors/torchex.py
@@ -1614,7 +1614,9 @@ def _unpack_prim_impl(
         tensors: list[torch.Tensor],
         bucket_key: str,
     ) -> list[torch.Tensor]:
-        return torch._utils._unflatten_dense_tensors(buffer, tensors)
+        _, views = _key_to_bucket_and_views[bucket_key]
+        torch._foreach_copy_(tensors, views, non_blocking=True)
+        return tensors
 
     # TODO(crcrpar): Make this compatible with the torch.compile executor as it's doing really well for cat and reshape.
     # NOTE(crcrpar): why no caching/reuse of buffer?