Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MC/CUDA: add mpool for managed memory #856

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 54 additions & 4 deletions src/components/mc/cuda/mc_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,17 @@ static ucc_status_t ucc_mc_cuda_mem_pool_alloc(ucc_mc_buffer_header_t **h_ptr,
ucc_memory_type_t mt)
{
ucc_mc_buffer_header_t *h = NULL;
if (size <= MC_CUDA_CONFIG->mpool_elem_size &&
mt != UCC_MEMORY_TYPE_CUDA_MANAGED) {
h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool);

if (size <= MC_CUDA_CONFIG->mpool_elem_size) {
if (mt == UCC_MEMORY_TYPE_CUDA) {
h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool);
} else if (mt == UCC_MEMORY_TYPE_CUDA_MANAGED) {
h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool_managed);
} else {
return UCC_ERR_INVALID_PARAM;
}
}

if (!h) {
// Slow path
return ucc_mc_cuda_mem_alloc(h_ptr, size, mt);
Expand Down Expand Up @@ -196,6 +203,7 @@ static void ucc_mc_cuda_chunk_init(ucc_mpool_t *mp, //NOLINT
if (st != cudaSuccess) {
// h->addr will be 0 so ucc_mc_cuda_mem_alloc_pool function will
// return UCC_ERR_NO_MEMORY. As such mc_error message is suffice.
h->addr = NULL;
cudaGetLastError();
mc_error(&ucc_mc_cuda.super,
"failed to allocate %zd bytes, "
Expand Down Expand Up @@ -225,11 +233,39 @@ static void ucc_mc_cuda_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp i
}
}

/* obj_init callback for the managed-memory mpool: backs one
 * ucc_mc_buffer_header_t element with a cudaMallocManaged() buffer of
 * MC_CUDA_CONFIG->mpool_elem_size bytes.
 *
 * On allocation failure h->addr is left NULL, so
 * ucc_mc_cuda_mem_alloc_pool will report UCC_ERR_NO_MEMORY; the
 * mc_error message here is therefore sufficient. */
static void ucc_mc_cuda_chunk_init_managed(ucc_mpool_t *mp, //NOLINT
                                           void *obj, void *chunk) //NOLINT
{
    ucc_mc_buffer_header_t *hdr = (ucc_mc_buffer_header_t *)obj;
    cudaError_t             err;

    err = cudaMallocManaged(&hdr->addr, MC_CUDA_CONFIG->mpool_elem_size,
                            cudaMemAttachGlobal);
    if (cudaSuccess != err) {
        hdr->addr = NULL;
        /* consume the sticky CUDA error so later runtime calls
         * are not poisoned by this failure */
        cudaGetLastError();
        mc_error(&ucc_mc_cuda.super,
                 "failed to allocate %zd bytes, "
                 "cuda error %d(%s)",
                 MC_CUDA_CONFIG->mpool_elem_size, err,
                 cudaGetErrorString(err));
    }
    hdr->from_pool = 1;
    hdr->mt        = UCC_MEMORY_TYPE_CUDA_MANAGED;
}


/* mpool callbacks for the regular (device-memory) buffer pool */
static ucc_mpool_ops_t ucc_mc_ops = {
    .chunk_alloc   = ucc_mc_cuda_chunk_alloc,
    .chunk_release = ucc_mc_cuda_chunk_release,
    .obj_init      = ucc_mc_cuda_chunk_init,
    .obj_cleanup   = ucc_mc_cuda_chunk_cleanup,
};

/* mpool callbacks for the managed-memory buffer pool; shares chunk
 * alloc/release and obj cleanup with the regular pool, but initializes
 * elements with cudaMallocManaged via ucc_mc_cuda_chunk_init_managed */
static ucc_mpool_ops_t ucc_mc_managed_ops = {
    .chunk_alloc   = ucc_mc_cuda_chunk_alloc,
    .chunk_release = ucc_mc_cuda_chunk_release,
    .obj_init      = ucc_mc_cuda_chunk_init_managed,
    .obj_cleanup   = ucc_mc_cuda_chunk_cleanup,
};

static ucc_status_t ucc_mc_cuda_mem_free(ucc_mc_buffer_header_t *h_ptr)
{
cudaError_t st;
Expand Down Expand Up @@ -260,6 +296,8 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr,
size_t size,
ucc_memory_type_t mt)
{
ucc_status_t status;

// lock assures single mpool initiation when multiple threads concurrently execute
// different collective operations thus concurrently entering init function.
ucc_spin_lock(&ucc_mc_cuda.init_spinlock);
Expand All @@ -272,14 +310,25 @@ ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr,
}

if (!ucc_mc_cuda.mpool_init_flag) {
ucc_status_t status = ucc_mpool_init(
status = ucc_mpool_init(
&ucc_mc_cuda.mpool, 0, sizeof(ucc_mc_buffer_header_t), 0,
UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems,
&ucc_mc_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers");
if (status != UCC_OK) {
ucc_spin_unlock(&ucc_mc_cuda.init_spinlock);
return status;
}

status = ucc_mpool_init(
&ucc_mc_cuda.mpool_managed, 0, sizeof(ucc_mc_buffer_header_t), 0,
UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems,
&ucc_mc_managed_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers");
if (status != UCC_OK) {
ucc_spin_unlock(&ucc_mc_cuda.init_spinlock);
return status;
}


ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc;
ucc_mc_cuda.mpool_init_flag = 1;
}
Expand Down Expand Up @@ -422,6 +471,7 @@ static ucc_status_t ucc_mc_cuda_finalize()
}
if (ucc_mc_cuda.mpool_init_flag) {
ucc_mpool_cleanup(&ucc_mc_cuda.mpool, 1);
ucc_mpool_cleanup(&ucc_mc_cuda.mpool_managed, 1);
ucc_mc_cuda.mpool_init_flag = 0;
ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc_with_init;
}
Expand Down
3 changes: 2 additions & 1 deletion src/components/mc/cuda/mc_cuda.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -27,6 +27,7 @@ typedef struct ucc_mc_cuda {
ucc_mpool_t events;
ucc_mpool_t strm_reqs;
ucc_mpool_t mpool;
ucc_mpool_t mpool_managed;
Sergei-Lebedev marked this conversation as resolved.
Show resolved Hide resolved
int mpool_init_flag;
ucc_spinlock_t init_spinlock;
ucc_thread_mode_t thread_mode;
Expand Down
7 changes: 5 additions & 2 deletions src/components/mc/ucc_mc.c
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,11 @@ UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_alloc, (h_ptr, size, mem_type),

/* Frees a buffer previously obtained through ucc_mc_alloc().
 *
 * Buffers of type UCC_MEMORY_TYPE_CUDA_MANAGED are routed to the CUDA
 * memory component's mem_free handler (mc_ops is indexed by
 * UCC_MEMORY_TYPE_CUDA for them); all other types dispatch on h_ptr->mt
 * directly. Returns the status reported by the component's mem_free.
 *
 * Note: the scraped diff had interleaved the removed pre-change lines
 * (an unconditional early return on h_ptr->mt) with the added ones,
 * leaving unreachable code and a declaration after return; this is the
 * coherent post-change version.
 */
ucc_status_t ucc_mc_free(ucc_mc_buffer_header_t *h_ptr)
{
    /* managed memory is served by the CUDA component */
    ucc_memory_type_t mt = (h_ptr->mt == UCC_MEMORY_TYPE_CUDA_MANAGED) ?
                           UCC_MEMORY_TYPE_CUDA : h_ptr->mt;

    UCC_CHECK_MC_AVAILABLE(mt);
    return mc_ops[mt]->mem_free(h_ptr);
}

UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_memcpy,
Expand Down
Loading