diff --git a/tools/perf/ucc_pt_coll.cc b/tools/perf/ucc_pt_coll.cc index a561ea73b4..e013615dd8 100644 --- a/tools/perf/ucc_pt_coll.cc +++ b/tools/perf/ucc_pt_coll.cc @@ -5,11 +5,56 @@ */ #include "ucc_pt_coll.h" +#include "ucc_pt_cuda.h" +#include "utils/ucc_malloc.h" ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_memory_type_t mem_type) { ucc_status_t status; + int cuda_st; + + switch (mem_type) { + case UCC_MEMORY_TYPE_CUDA: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA; + cuda_st = ucc_pt_cudaMalloc(&((*h_ptr)->addr), len); + if (cuda_st != 0) { + return UCC_ERR_NO_MEMORY; + } + cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); + if (cuda_st != 0) { + ucc_pt_cudaFree((*h_ptr)->addr); + delete *h_ptr; + return UCC_ERR_NO_MEMORY; + } + return UCC_OK; + case UCC_MEMORY_TYPE_CUDA_MANAGED: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA_MANAGED; + cuda_st = ucc_pt_cudaMallocManaged(&((*h_ptr)->addr), len); + if (cuda_st != 0) { + return UCC_ERR_NO_MEMORY; + } + cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); + if (cuda_st != 0) { + ucc_pt_cudaFree((*h_ptr)->addr); + delete *h_ptr; + return UCC_ERR_NO_MEMORY; + } + return UCC_OK; + case UCC_MEMORY_TYPE_HOST: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_HOST; + (*h_ptr)->addr = ucc_malloc(len, "perftest data"); + if (!((*h_ptr)->addr)) { + return UCC_ERR_NO_MEMORY; + } + memset((*h_ptr)->addr, 0, len); + return UCC_OK; + default: + break; + } status = ucc_mc_alloc(h_ptr, len, mem_type); if (status != UCC_OK) { @@ -26,6 +71,20 @@ ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_status_t ucc_pt_free(ucc_mc_buffer_header_t *h_ptr) { + switch (h_ptr->mt) { + case UCC_MEMORY_TYPE_CUDA: + case UCC_MEMORY_TYPE_CUDA_MANAGED: + ucc_pt_cudaFree(h_ptr->addr); + delete h_ptr; + return UCC_OK; + case UCC_MEMORY_TYPE_HOST: + ucc_free(h_ptr->addr); + delete h_ptr; + return UCC_OK; + default: + break; + } + return ucc_mc_free(h_ptr); } diff --git a/tools/perf/ucc_pt_cuda.cc b/tools/perf/ucc_pt_cuda.cc index bcadabc955..1d9e55ab4a 100644 --- a/tools/perf/ucc_pt_cuda.cc +++ b/tools/perf/ucc_pt_cuda.cc @@ -36,6 +36,10 @@ void ucc_pt_cuda_init(void) LOAD_CUDA_SYM("cudaGetErrorString", getErrorString); LOAD_CUDA_SYM("cudaStreamCreateWithFlags", streamCreateWithFlags); LOAD_CUDA_SYM("cudaStreamDestroy", streamDestroy); + LOAD_CUDA_SYM("cudaMalloc", cudaMalloc); + LOAD_CUDA_SYM("cudaFree", cudaFree); + LOAD_CUDA_SYM("cudaMemset", cudaMemset); + LOAD_CUDA_SYM("cudaMallocManaged", cudaMallocManaged); ucc_pt_cuda_iface.available = 1; } diff --git a/tools/perf/ucc_pt_cuda.h b/tools/perf/ucc_pt_cuda.h index 1a5844c1bc..5a370c3528 100644 --- a/tools/perf/ucc_pt_cuda.h +++ b/tools/perf/ucc_pt_cuda.h @@ -10,6 +10,7 @@ #define cudaSuccess 0 #define cudaStreamNonBlocking 0x01 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +#define cudaMemAttachGlobal 0x01 /**< Memory can be accessed by any stream on any device*/ typedef struct CUStream_st *cudaStream_t; #define STR(x) #x @@ -31,6 +32,10 @@ typedef struct ucc_pt_cuda_iface { int (*streamCreateWithFlags)(cudaStream_t *stream, unsigned int flags); int (*streamDestroy)(cudaStream_t stream); char* (*getErrorString)(int err); + int (*cudaMalloc)(void **devptr, size_t size); + int (*cudaMallocManaged)(void **ptr, size_t size, unsigned int flags); + int (*cudaFree)(void *devptr); + int (*cudaMemset)(void *devptr, int value, size_t count); } ucc_pt_cuda_iface_t; extern ucc_pt_cuda_iface_t ucc_pt_cuda_iface; @@ -74,4 +79,41 @@ static inline int ucc_pt_cudaStreamDestroy(cudaStream_t stream) return 0; } +static inline int ucc_pt_cudaMalloc(void **devptr, size_t size) +{ + if (!ucc_pt_cuda_iface.available) { + return 1; + } + CUDA_CHECK(ucc_pt_cuda_iface.cudaMalloc(devptr, size)); + return 0; +} + +static inline int ucc_pt_cudaMallocManaged(void **ptr, size_t size) +{ + if (!ucc_pt_cuda_iface.available) { + return 1; + } + CUDA_CHECK(ucc_pt_cuda_iface.cudaMallocManaged(ptr, size, + cudaMemAttachGlobal)); + return 0; +} + +static inline int ucc_pt_cudaFree(void *devptr) +{ + if (!ucc_pt_cuda_iface.available) { + return 1; + } + CUDA_CHECK(ucc_pt_cuda_iface.cudaFree(devptr)); + return 0; +} + +static inline int ucc_pt_cudaMemset(void *devptr, int value, size_t count) +{ + if (!ucc_pt_cuda_iface.available) { + return 1; + } + CUDA_CHECK(ucc_pt_cuda_iface.cudaMemset(devptr, value, count)); + return 0; +} + #endif