Skip to content

Commit

Permalink
MC/CUDA: add mpool for managed memory (openucx#856)
Browse files Browse the repository at this point in the history
* MC/CUDA: add mpool for managed memory

* REVIEW: fix review comments
  • Loading branch information
Sergei-Lebedev authored and janjust committed Jan 31, 2024
1 parent 4ffa6a9 commit 81095fc
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 7 deletions.
58 changes: 54 additions & 4 deletions src/components/mc/cuda/mc_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,17 @@ static ucc_status_t ucc_mc_cuda_mem_pool_alloc(ucc_mc_buffer_header_t **h_ptr,
ucc_memory_type_t mt)
{
ucc_mc_buffer_header_t *h = NULL;
if (size <= MC_CUDA_CONFIG->mpool_elem_size &&
mt != UCC_MEMORY_TYPE_CUDA_MANAGED) {
h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool);

if (size <= MC_CUDA_CONFIG->mpool_elem_size) {
if (mt == UCC_MEMORY_TYPE_CUDA) {
h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool);
} else if (mt == UCC_MEMORY_TYPE_CUDA_MANAGED) {
h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool_managed);
} else {
return UCC_ERR_INVALID_PARAM;
}
}

if (!h) {
// Slow path
return ucc_mc_cuda_mem_alloc(h_ptr, size, mt);
Expand Down Expand Up @@ -196,6 +203,7 @@ static void ucc_mc_cuda_chunk_init(ucc_mpool_t *mp, //NOLINT
if (st != cudaSuccess) {
// h->addr will be 0 so ucc_mc_cuda_mem_alloc_pool function will
// return UCC_ERR_NO_MEMORY. As such mc_error message is suffice.
h->addr = NULL;
cudaGetLastError();
mc_error(&ucc_mc_cuda.super,
"failed to allocate %zd bytes, "
Expand Down Expand Up @@ -225,11 +233,39 @@ static void ucc_mc_cuda_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp i
}
}

static void ucc_mc_cuda_chunk_init_managed(ucc_mpool_t *mp, //NOLINT
void *obj, void *chunk) //NOLINT
{
ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj;
cudaError_t st;

st = cudaMallocManaged(&h->addr, MC_CUDA_CONFIG->mpool_elem_size,
cudaMemAttachGlobal);
if (st != cudaSuccess) {
// h->addr will be 0 so ucc_mc_cuda_mem_alloc_pool function will
// return UCC_ERR_NO_MEMORY. As such mc_error message is suffice.
h->addr = NULL;
cudaGetLastError();
mc_error(&ucc_mc_cuda.super,
"failed to allocate %zd bytes, "
"cuda error %d(%s)",
MC_CUDA_CONFIG->mpool_elem_size, st, cudaGetErrorString(st));
}
h->from_pool = 1;
h->mt = UCC_MEMORY_TYPE_CUDA_MANAGED;
}


static ucc_mpool_ops_t ucc_mc_ops = {.chunk_alloc = ucc_mc_cuda_chunk_alloc,
.chunk_release = ucc_mc_cuda_chunk_release,
.obj_init = ucc_mc_cuda_chunk_init,
.obj_cleanup = ucc_mc_cuda_chunk_cleanup};

static ucc_mpool_ops_t ucc_mc_managed_ops = {.chunk_alloc = ucc_mc_cuda_chunk_alloc,
.chunk_release = ucc_mc_cuda_chunk_release,
.obj_init = ucc_mc_cuda_chunk_init_managed,
.obj_cleanup = ucc_mc_cuda_chunk_cleanup};

static ucc_status_t ucc_mc_cuda_mem_free(ucc_mc_buffer_header_t *h_ptr)
{
cudaError_t st;
Expand Down Expand Up @@ -260,6 +296,8 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr,
size_t size,
ucc_memory_type_t mt)
{
ucc_status_t status;

// lock assures single mpool initiation when multiple threads concurrently execute
// different collective operations thus concurrently entering init function.
ucc_spin_lock(&ucc_mc_cuda.init_spinlock);
Expand All @@ -272,14 +310,25 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr,
}

if (!ucc_mc_cuda.mpool_init_flag) {
ucc_status_t status = ucc_mpool_init(
status = ucc_mpool_init(
&ucc_mc_cuda.mpool, 0, sizeof(ucc_mc_buffer_header_t), 0,
UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems,
&ucc_mc_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers");
if (status != UCC_OK) {
ucc_spin_unlock(&ucc_mc_cuda.init_spinlock);
return status;
}

status = ucc_mpool_init(
&ucc_mc_cuda.mpool_managed, 0, sizeof(ucc_mc_buffer_header_t), 0,
UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems,
&ucc_mc_managed_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers");
if (status != UCC_OK) {
ucc_spin_unlock(&ucc_mc_cuda.init_spinlock);
return status;
}


ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc;
ucc_mc_cuda.mpool_init_flag = 1;
}
Expand Down Expand Up @@ -422,6 +471,7 @@ static ucc_status_t ucc_mc_cuda_finalize()
}
if (ucc_mc_cuda.mpool_init_flag) {
ucc_mpool_cleanup(&ucc_mc_cuda.mpool, 1);
ucc_mpool_cleanup(&ucc_mc_cuda.mpool_managed, 1);
ucc_mc_cuda.mpool_init_flag = 0;
ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc_with_init;
}
Expand Down
3 changes: 2 additions & 1 deletion src/components/mc/cuda/mc_cuda.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -27,6 +27,7 @@ typedef struct ucc_mc_cuda {
ucc_mpool_t events;
ucc_mpool_t strm_reqs;
ucc_mpool_t mpool;
ucc_mpool_t mpool_managed;
int mpool_init_flag;
ucc_spinlock_t init_spinlock;
ucc_thread_mode_t thread_mode;
Expand Down
7 changes: 5 additions & 2 deletions src/components/mc/ucc_mc.c
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,11 @@ UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_alloc, (h_ptr, size, mem_type),

ucc_status_t ucc_mc_free(ucc_mc_buffer_header_t *h_ptr)
{
UCC_CHECK_MC_AVAILABLE(h_ptr->mt);
return mc_ops[h_ptr->mt]->mem_free(h_ptr);
ucc_memory_type_t mt = (h_ptr->mt == UCC_MEMORY_TYPE_CUDA_MANAGED) ?
UCC_MEMORY_TYPE_CUDA : h_ptr->mt;

UCC_CHECK_MC_AVAILABLE(mt);
return mc_ops[mt]->mem_free(h_ptr);
}

UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_memcpy,
Expand Down

0 comments on commit 81095fc

Please sign in to comment.