From 4c3d541bde0b0b7a4c772a955945b2f8a3b1f808 Mon Sep 17 00:00:00 2001 From: Sergey Lebedev Date: Tue, 7 Nov 2023 12:19:22 +0000 Subject: [PATCH 1/2] TOOLS: use separate cuda alloc in perftest --- tools/perf/ucc_pt_coll.cc | 24 ++++++++++++++++++++++++ tools/perf/ucc_pt_cuda.cc | 3 +++ tools/perf/ucc_pt_cuda.h | 30 ++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) diff --git a/tools/perf/ucc_pt_coll.cc b/tools/perf/ucc_pt_coll.cc index a561ea73b4..31e6af22bc 100644 --- a/tools/perf/ucc_pt_coll.cc +++ b/tools/perf/ucc_pt_coll.cc @@ -5,11 +5,29 @@ */ #include "ucc_pt_coll.h" +#include "ucc_pt_cuda.h" ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_memory_type_t mem_type) { ucc_status_t status; + int cuda_st; + + if (mem_type == UCC_MEMORY_TYPE_CUDA) { + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA; + cuda_st = ucc_pt_cudaMalloc(&((*h_ptr)->addr), len); + if (cuda_st != 0) { + return UCC_ERR_NO_RESOURCE; + } + cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); + if (cuda_st != 0) { + ucc_pt_cudaFree((*h_ptr)->addr); + delete *h_ptr; + return UCC_ERR_NO_RESOURCE; + } + return UCC_OK; + } status = ucc_mc_alloc(h_ptr, len, mem_type); if (status != UCC_OK) { @@ -26,6 +44,12 @@ ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_status_t ucc_pt_free(ucc_mc_buffer_header_t *h_ptr) { + if (h_ptr->mt == UCC_MEMORY_TYPE_CUDA) { + ucc_pt_cudaFree(h_ptr->addr); + delete h_ptr; + return UCC_OK; + } + return ucc_mc_free(h_ptr); } diff --git a/tools/perf/ucc_pt_cuda.cc b/tools/perf/ucc_pt_cuda.cc index bcadabc955..f0807f88a9 100644 --- a/tools/perf/ucc_pt_cuda.cc +++ b/tools/perf/ucc_pt_cuda.cc @@ -36,6 +36,9 @@ void ucc_pt_cuda_init(void) LOAD_CUDA_SYM("cudaGetErrorString", getErrorString); LOAD_CUDA_SYM("cudaStreamCreateWithFlags", streamCreateWithFlags); LOAD_CUDA_SYM("cudaStreamDestroy", streamDestroy); + LOAD_CUDA_SYM("cudaMalloc", cudaMalloc); + LOAD_CUDA_SYM("cudaFree", cudaFree); + LOAD_CUDA_SYM("cudaMemset", cudaMemset); ucc_pt_cuda_iface.available = 1; } diff --git a/tools/perf/ucc_pt_cuda.h b/tools/perf/ucc_pt_cuda.h index 1a5844c1bc..05c1fbbbf8 100644 --- a/tools/perf/ucc_pt_cuda.h +++ b/tools/perf/ucc_pt_cuda.h @@ -31,6 +31,9 @@ typedef struct ucc_pt_cuda_iface { int (*streamCreateWithFlags)(cudaStream_t *stream, unsigned int flags); int (*streamDestroy)(cudaStream_t stream); char* (*getErrorString)(int err); + int (*cudaMalloc)(void **devptr, size_t size); + int (*cudaFree)(void *devptr); + int (*cudaMemset)(void *devptr, int value, size_t count); } ucc_pt_cuda_iface_t; extern ucc_pt_cuda_iface_t ucc_pt_cuda_iface; @@ -74,4 +77,31 @@ static inline int ucc_pt_cudaStreamDestroy(cudaStream_t stream) return 0; } +static inline int ucc_pt_cudaMalloc(void **devptr, size_t size) +{ + if (!ucc_pt_cuda_iface.available) { + return 1; + } + CUDA_CHECK(ucc_pt_cuda_iface.cudaMalloc(devptr, size)); + return 0; +} + +static inline int ucc_pt_cudaFree(void *devptr) +{ + if (!ucc_pt_cuda_iface.available) { + return 1; + } + CUDA_CHECK(ucc_pt_cuda_iface.cudaFree(devptr)); + return 0; +} + +static inline int ucc_pt_cudaMemset(void *devptr, int value, size_t count) +{ + if (!ucc_pt_cuda_iface.available) { + return 1; + } + CUDA_CHECK(ucc_pt_cuda_iface.cudaMemset(devptr, value, count)); + return 0; +} + #endif From 054c1d7183821a21522fb3f5a3b3d0e676e1fb63 Mon Sep 17 00:00:00 2001 From: Sergey Lebedev Date: Fri, 15 Dec 2023 10:51:42 +0000 Subject: [PATCH 2/2] REVIEW: fix review comments --- tools/perf/ucc_pt_coll.cc | 43 +++++++++++++++++++++++++++++++++++---- tools/perf/ucc_pt_cuda.cc | 1 + tools/perf/ucc_pt_cuda.h | 12 +++++++++++ 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/tools/perf/ucc_pt_coll.cc b/tools/perf/ucc_pt_coll.cc index 31e6af22bc..e013615dd8 100644 --- a/tools/perf/ucc_pt_coll.cc +++ b/tools/perf/ucc_pt_coll.cc @@ -6,6 +6,7 @@ #include "ucc_pt_coll.h" #include "ucc_pt_cuda.h" +#include "utils/ucc_malloc.h" ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_memory_type_t mem_type) @@ -13,20 +14,46 @@ ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_status_t status; int cuda_st; - if (mem_type == UCC_MEMORY_TYPE_CUDA) { + switch (mem_type) { + case UCC_MEMORY_TYPE_CUDA: *h_ptr = new ucc_mc_buffer_header_t; (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA; cuda_st = ucc_pt_cudaMalloc(&((*h_ptr)->addr), len); if (cuda_st != 0) { - return UCC_ERR_NO_RESOURCE; + return UCC_ERR_NO_MEMORY; } cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); if (cuda_st != 0) { ucc_pt_cudaFree((*h_ptr)->addr); delete *h_ptr; - return UCC_ERR_NO_RESOURCE; + return UCC_ERR_NO_MEMORY; } return UCC_OK; + case UCC_MEMORY_TYPE_CUDA_MANAGED: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA_MANAGED; + cuda_st = ucc_pt_cudaMallocManaged(&((*h_ptr)->addr), len); + if (cuda_st != 0) { + return UCC_ERR_NO_MEMORY; + } + cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); + if (cuda_st != 0) { + ucc_pt_cudaFree((*h_ptr)->addr); + delete *h_ptr; + return UCC_ERR_NO_MEMORY; + } + return UCC_OK; + case UCC_MEMORY_TYPE_HOST: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_HOST; + (*h_ptr)->addr = ucc_malloc(len, "perftest data"); + if (!((*h_ptr)->addr)) { + return UCC_ERR_NO_MEMORY; + } + memset((*h_ptr)->addr, 0, len); + return UCC_OK; + default: + break; } status = ucc_mc_alloc(h_ptr, len, mem_type); @@ -44,10 +71,18 @@ ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_status_t ucc_pt_free(ucc_mc_buffer_header_t *h_ptr) { - if (h_ptr->mt == UCC_MEMORY_TYPE_CUDA) { + switch (h_ptr->mt) { + case UCC_MEMORY_TYPE_CUDA: + case UCC_MEMORY_TYPE_CUDA_MANAGED: ucc_pt_cudaFree(h_ptr->addr); delete h_ptr; return UCC_OK; + case UCC_MEMORY_TYPE_HOST: + ucc_free(h_ptr->addr); + delete h_ptr; + return UCC_OK; + default: + break; } return ucc_mc_free(h_ptr); diff --git a/tools/perf/ucc_pt_cuda.cc b/tools/perf/ucc_pt_cuda.cc index f0807f88a9..1d9e55ab4a 100644 --- a/tools/perf/ucc_pt_cuda.cc +++ b/tools/perf/ucc_pt_cuda.cc @@ -39,6 +39,7 @@ void ucc_pt_cuda_init(void) LOAD_CUDA_SYM("cudaMalloc", cudaMalloc); LOAD_CUDA_SYM("cudaFree", cudaFree); LOAD_CUDA_SYM("cudaMemset", cudaMemset); + LOAD_CUDA_SYM("cudaMallocManaged", cudaMallocManaged); ucc_pt_cuda_iface.available = 1; } diff --git a/tools/perf/ucc_pt_cuda.h b/tools/perf/ucc_pt_cuda.h index 05c1fbbbf8..5a370c3528 100644 --- a/tools/perf/ucc_pt_cuda.h +++ b/tools/perf/ucc_pt_cuda.h @@ -10,6 +10,7 @@ #define cudaSuccess 0 #define cudaStreamNonBlocking 0x01 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +#define cudaMemAttachGlobal 0x01 /**< Memory can be accessed by any stream on any device*/ typedef struct CUStream_st *cudaStream_t; #define STR(x) #x @@ -32,6 +33,7 @@ typedef struct ucc_pt_cuda_iface { int (*streamDestroy)(cudaStream_t stream); char* (*getErrorString)(int err); int (*cudaMalloc)(void **devptr, size_t size); + int (*cudaMallocManaged)(void **ptr, size_t size, unsigned int flags); int (*cudaFree)(void *devptr); int (*cudaMemset)(void *devptr, int value, size_t count); } ucc_pt_cuda_iface_t; @@ -86,6 +88,16 @@ static inline int ucc_pt_cudaMalloc(void **devptr, size_t size) return 0; } +static inline int ucc_pt_cudaMallocManaged(void **ptr, size_t size) +{ + if (!ucc_pt_cuda_iface.available) { + return 1; + } + CUDA_CHECK(ucc_pt_cuda_iface.cudaMallocManaged(ptr, size, + cudaMemAttachGlobal)); + return 0; +} + static inline int ucc_pt_cudaFree(void *devptr) { if (!ucc_pt_cuda_iface.available) {