From 88149a5d8574a63da7efc0347b5972024f7b6dc8 Mon Sep 17 00:00:00 2001 From: Sergey Lebedev Date: Thu, 12 Oct 2023 17:59:21 +0000 Subject: [PATCH] MC/CUDA: add mpool for managed memory --- src/components/mc/cuda/mc_cuda.c | 53 +++++++++++++++++++++++++++++--- src/components/mc/cuda/mc_cuda.h | 1 + src/components/mc/ucc_mc.c | 7 +++-- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/src/components/mc/cuda/mc_cuda.c b/src/components/mc/cuda/mc_cuda.c index 5c820bd768..9eebe143c2 100644 --- a/src/components/mc/cuda/mc_cuda.c +++ b/src/components/mc/cuda/mc_cuda.c @@ -152,10 +152,15 @@ static ucc_status_t ucc_mc_cuda_mem_pool_alloc(ucc_mc_buffer_header_t **h_ptr, ucc_memory_type_t mt) { ucc_mc_buffer_header_t *h = NULL; - if (size <= MC_CUDA_CONFIG->mpool_elem_size && - mt != UCC_MEMORY_TYPE_CUDA_MANAGED) { - h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool); + + if (size <= MC_CUDA_CONFIG->mpool_elem_size) { + if (mt == UCC_MEMORY_TYPE_CUDA) { + h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool); + } else { + h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool_managed); + } } + if (!h) { // Slow path return ucc_mc_cuda_mem_alloc(h_ptr, size, mt); @@ -218,11 +223,37 @@ static void ucc_mc_cuda_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp i } } +static void ucc_mc_cuda_chunk_init_managed(ucc_mpool_t *mp, //NOLINT + void *obj, void *chunk) //NOLINT +{ + ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj; + cudaError_t st; + + st = cudaMallocManaged(&h->addr, MC_CUDA_CONFIG->mpool_elem_size, cudaMemAttachGlobal); + if (st != cudaSuccess) { + // h->addr will be 0 so ucc_mc_cuda_mem_alloc_pool function will + // return UCC_ERR_NO_MEMORY. As such, the mc_error message suffices. 
+ cudaGetLastError(); + mc_error(&ucc_mc_cuda.super, + "failed to allocate %zd bytes, " + "cuda error %d(%s)", + MC_CUDA_CONFIG->mpool_elem_size, st, cudaGetErrorString(st)); + } + h->from_pool = 1; + h->mt = UCC_MEMORY_TYPE_CUDA_MANAGED; +} + + static ucc_mpool_ops_t ucc_mc_ops = {.chunk_alloc = ucc_mc_cuda_chunk_alloc, .chunk_release = ucc_mc_cuda_chunk_release, .obj_init = ucc_mc_cuda_chunk_init, .obj_cleanup = ucc_mc_cuda_chunk_cleanup}; +static ucc_mpool_ops_t ucc_mc_managed_ops = {.chunk_alloc = ucc_mc_cuda_chunk_alloc, + .chunk_release = ucc_mc_cuda_chunk_release, + .obj_init = ucc_mc_cuda_chunk_init_managed, + .obj_cleanup = ucc_mc_cuda_chunk_cleanup}; + static ucc_status_t ucc_mc_cuda_mem_free(ucc_mc_buffer_header_t *h_ptr) { cudaError_t st; @@ -253,6 +284,8 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr, size_t size, ucc_memory_type_t mt) { + ucc_status_t status; + // lock assures single mpool initiation when multiple threads concurrently execute // different collective operations thus concurrently entering init function. 
ucc_spin_lock(&ucc_mc_cuda.init_spinlock); @@ -265,7 +298,7 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr, } if (!ucc_mc_cuda.mpool_init_flag) { - ucc_status_t status = ucc_mpool_init( + status = ucc_mpool_init( &ucc_mc_cuda.mpool, 0, sizeof(ucc_mc_buffer_header_t), 0, UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems, &ucc_mc_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers"); @@ -273,6 +306,17 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr, ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); return status; } + + status = ucc_mpool_init( + &ucc_mc_cuda.mpool_managed, 0, sizeof(ucc_mc_buffer_header_t), 0, + UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems, + &ucc_mc_managed_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers"); + if (status != UCC_OK) { + ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); + return status; + } + + ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc; ucc_mc_cuda.mpool_init_flag = 1; } @@ -415,6 +459,7 @@ static ucc_status_t ucc_mc_cuda_finalize() } if (ucc_mc_cuda.mpool_init_flag) { ucc_mpool_cleanup(&ucc_mc_cuda.mpool, 1); + ucc_mpool_cleanup(&ucc_mc_cuda.mpool_managed, 1); ucc_mc_cuda.mpool_init_flag = 0; ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc_with_init; } diff --git a/src/components/mc/cuda/mc_cuda.h b/src/components/mc/cuda/mc_cuda.h index abc82312c2..13e8ff1cc3 100644 --- a/src/components/mc/cuda/mc_cuda.h +++ b/src/components/mc/cuda/mc_cuda.h @@ -27,6 +27,7 @@ typedef struct ucc_mc_cuda { ucc_mpool_t events; ucc_mpool_t strm_reqs; ucc_mpool_t mpool; + ucc_mpool_t mpool_managed; int mpool_init_flag; ucc_spinlock_t init_spinlock; ucc_thread_mode_t thread_mode; diff --git a/src/components/mc/ucc_mc.c b/src/components/mc/ucc_mc.c index 2e8208d0f0..dd5e628c80 100644 --- a/src/components/mc/ucc_mc.c +++ b/src/components/mc/ucc_mc.c @@ -134,8 +134,11 @@ UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_alloc, (h_ptr, size, mem_type), ucc_status_t 
ucc_mc_free(ucc_mc_buffer_header_t *h_ptr) { - UCC_CHECK_MC_AVAILABLE(h_ptr->mt); - return mc_ops[h_ptr->mt]->mem_free(h_ptr); + ucc_memory_type_t mt = (h_ptr->mt == UCC_MEMORY_TYPE_CUDA_MANAGED) ? + UCC_MEMORY_TYPE_CUDA : h_ptr->mt; + + UCC_CHECK_MC_AVAILABLE(mt); + return mc_ops[mt]->mem_free(h_ptr); } UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_memcpy,