Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MC/CUDA: add mpool for managed memory #856

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 54 additions & 4 deletions src/components/mc/cuda/mc_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,17 @@ static ucc_status_t ucc_mc_cuda_mem_pool_alloc(ucc_mc_buffer_header_t **h_ptr,
ucc_memory_type_t mt)
{
ucc_mc_buffer_header_t *h = NULL;
if (size <= MC_CUDA_CONFIG->mpool_elem_size &&
mt != UCC_MEMORY_TYPE_CUDA_MANAGED) {
h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool);

if (size <= MC_CUDA_CONFIG->mpool_elem_size) {
if (mt == UCC_MEMORY_TYPE_CUDA) {
h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool);
} else if (mt == UCC_MEMORY_TYPE_CUDA_MANAGED) {
h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool_managed);
} else {
return UCC_ERR_INVALID_PARAM;
}
}

if (!h) {
// Slow path
return ucc_mc_cuda_mem_alloc(h_ptr, size, mt);
Expand Down Expand Up @@ -196,6 +203,7 @@ static void ucc_mc_cuda_chunk_init(ucc_mpool_t *mp, //NOLINT
if (st != cudaSuccess) {
// h->addr will be 0 so ucc_mc_cuda_mem_alloc_pool function will
// return UCC_ERR_NO_MEMORY. As such mc_error message is suffice.
h->addr = NULL;
cudaGetLastError();
mc_error(&ucc_mc_cuda.super,
"failed to allocate %zd bytes, "
Expand Down Expand Up @@ -225,11 +233,39 @@ static void ucc_mc_cuda_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp i
}
}

/* obj_init callback for the managed-memory mpool: backs one
 * ucc_mc_buffer_header_t element with a cudaMallocManaged() buffer of
 * MC_CUDA_CONFIG->mpool_elem_size bytes.
 *
 * On allocation failure h->addr is left NULL, so
 * ucc_mc_cuda_mem_alloc_pool will report UCC_ERR_NO_MEMORY; the
 * mc_error message here is therefore sufficient. */
static void ucc_mc_cuda_chunk_init_managed(ucc_mpool_t *mp, //NOLINT
                                           void *obj, void *chunk) //NOLINT
{
    ucc_mc_buffer_header_t *hdr = (ucc_mc_buffer_header_t *)obj;
    cudaError_t             err;

    err = cudaMallocManaged(&hdr->addr, MC_CUDA_CONFIG->mpool_elem_size,
                            cudaMemAttachGlobal);
    if (cudaSuccess != err) {
        hdr->addr = NULL;
        /* consume the sticky CUDA error so later runtime calls
         * are not poisoned by this failure */
        cudaGetLastError();
        mc_error(&ucc_mc_cuda.super,
                 "failed to allocate %zd bytes, "
                 "cuda error %d(%s)",
                 MC_CUDA_CONFIG->mpool_elem_size, err,
                 cudaGetErrorString(err));
    }
    hdr->from_pool = 1;
    hdr->mt        = UCC_MEMORY_TYPE_CUDA_MANAGED;
}


/* mpool callbacks for the regular (device-memory) buffer pool */
static ucc_mpool_ops_t ucc_mc_ops = {
    .chunk_alloc   = ucc_mc_cuda_chunk_alloc,
    .chunk_release = ucc_mc_cuda_chunk_release,
    .obj_init      = ucc_mc_cuda_chunk_init,
    .obj_cleanup   = ucc_mc_cuda_chunk_cleanup,
};

/* mpool callbacks for the managed-memory buffer pool; shares chunk
 * alloc/release and obj cleanup with the regular pool, but initializes
 * elements with cudaMallocManaged via ucc_mc_cuda_chunk_init_managed */
static ucc_mpool_ops_t ucc_mc_managed_ops = {
    .chunk_alloc   = ucc_mc_cuda_chunk_alloc,
    .chunk_release = ucc_mc_cuda_chunk_release,
    .obj_init      = ucc_mc_cuda_chunk_init_managed,
    .obj_cleanup   = ucc_mc_cuda_chunk_cleanup,
};

static ucc_status_t ucc_mc_cuda_mem_free(ucc_mc_buffer_header_t *h_ptr)
{
cudaError_t st;
Expand Down Expand Up @@ -260,6 +296,8 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr,
size_t size,
ucc_memory_type_t mt)
{
ucc_status_t status;

// lock assures single mpool initiation when multiple threads concurrently execute
// different collective operations thus concurrently entering init function.
ucc_spin_lock(&ucc_mc_cuda.init_spinlock);
Expand All @@ -272,14 +310,25 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr,
}

if (!ucc_mc_cuda.mpool_init_flag) {
ucc_status_t status = ucc_mpool_init(
status = ucc_mpool_init(
&ucc_mc_cuda.mpool, 0, sizeof(ucc_mc_buffer_header_t), 0,
UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems,
&ucc_mc_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers");
if (status != UCC_OK) {
ucc_spin_unlock(&ucc_mc_cuda.init_spinlock);
return status;
}

status = ucc_mpool_init(
&ucc_mc_cuda.mpool_managed, 0, sizeof(ucc_mc_buffer_header_t), 0,
UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems,
&ucc_mc_managed_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers");
if (status != UCC_OK) {
ucc_spin_unlock(&ucc_mc_cuda.init_spinlock);
return status;
}


ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc;
ucc_mc_cuda.mpool_init_flag = 1;
}
Expand Down Expand Up @@ -422,6 +471,7 @@ static ucc_status_t ucc_mc_cuda_finalize()
}
if (ucc_mc_cuda.mpool_init_flag) {
ucc_mpool_cleanup(&ucc_mc_cuda.mpool, 1);
ucc_mpool_cleanup(&ucc_mc_cuda.mpool_managed, 1);
ucc_mc_cuda.mpool_init_flag = 0;
ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc_with_init;
}
Expand Down
3 changes: 2 additions & 1 deletion src/components/mc/cuda/mc_cuda.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -27,6 +27,7 @@ typedef struct ucc_mc_cuda {
ucc_mpool_t events;
ucc_mpool_t strm_reqs;
ucc_mpool_t mpool;
ucc_mpool_t mpool_managed;
Sergei-Lebedev marked this conversation as resolved.
Show resolved Hide resolved
int mpool_init_flag;
ucc_spinlock_t init_spinlock;
ucc_thread_mode_t thread_mode;
Expand Down
7 changes: 5 additions & 2 deletions src/components/mc/ucc_mc.c
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,11 @@ UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_alloc, (h_ptr, size, mem_type),

/* Frees a buffer previously obtained through ucc_mc_alloc().
 *
 * Buffers of type UCC_MEMORY_TYPE_CUDA_MANAGED are routed to the CUDA
 * memory component's mem_free handler (mc_ops is indexed by
 * UCC_MEMORY_TYPE_CUDA for them); all other types dispatch on h_ptr->mt
 * directly. Returns the status reported by the component's mem_free.
 *
 * Note: the scraped diff had interleaved the removed pre-change lines
 * (an unconditional early return on h_ptr->mt) with the added ones,
 * leaving unreachable code and a declaration after return; this is the
 * coherent post-change version.
 */
ucc_status_t ucc_mc_free(ucc_mc_buffer_header_t *h_ptr)
{
    /* managed memory is served by the CUDA component */
    ucc_memory_type_t mt = (h_ptr->mt == UCC_MEMORY_TYPE_CUDA_MANAGED) ?
                           UCC_MEMORY_TYPE_CUDA : h_ptr->mt;

    UCC_CHECK_MC_AVAILABLE(mt);
    return mc_ops[mt]->mem_free(h_ptr);
}

UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_memcpy,
Expand Down
Loading