diff --git a/src/components/tl/ucp/allgather/allgather.c b/src/components/tl/ucp/allgather/allgather.c
index c33a98b5f6..769c4fb981 100644
--- a/src/components/tl/ucp/allgather/allgather.c
+++ b/src/components/tl/ucp/allgather/allgather.c
@@ -9,26 +9,6 @@
 #define ALLGATHER_MAX_PATTERN_SIZE (sizeof(UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR))
 
-/*--------------YAELIS FUNCTION---------------------*/
-
-ucc_status_t new_ucp_tl_self_copy_nb(void *dst, void *src, size_t len, ucc_memory_type_t dst_mem,ucc_memory_type_t src_mem, ucc_rank_t rank, ucc_tl_ucp_team_t *team, ucc_tl_ucp_task_t *task){
-    ucc_status_t status;
-    status = ucc_tl_ucp_send_nb(src, len, src_mem, rank, team, task);
-    if (ucc_unlikely(UCC_OK != status)) {
-        printf("\n allgather.c line 18 \n");
-        task->super.status = status;
-        return status;
-    }
-    status = ucc_tl_ucp_recv_nb(dst, len, dst_mem, rank, team, task);
-    if (ucc_unlikely(UCC_OK != status)) {
-        printf("\n allgather.c line 24 \n");
-        task->super.status = status;
-        return status;
-    }
-    return UCC_OK;
-}
-
-/*--------------YAELIS FUNCTION---------------------*/
 
 ucc_base_coll_alg_info_t ucc_tl_ucp_allgather_algs[UCC_TL_UCP_ALLGATHER_ALG_LAST + 1] = {
     [UCC_TL_UCP_ALLGATHER_ALG_KNOMIAL] =
diff --git a/src/components/tl/ucp/allgather/allgather.h b/src/components/tl/ucp/allgather/allgather.h
index 6c123831df..b9269aacdd 100644
--- a/src/components/tl/ucp/allgather/allgather.h
+++ b/src/components/tl/ucp/allgather/allgather.h
@@ -10,13 +10,6 @@
 
 #include "tl_ucp_sendrecv.h"
 
-
-#define NEW_MEMCPY(use_cuda, dst, src, len, dst_mem_type, src_mem_type, rank, team, task) \
-    ((use_cuda) ? ucc_mc_memcpy(dst, src, len, dst_mem_type, src_mem_type) : \
-                  new_ucp_tl_self_copy_nb(dst, src, len, dst_mem_type, src_mem_type, rank, team, task))
-
-
-
 enum {
     UCC_TL_UCP_ALLGATHER_ALG_KNOMIAL,
     UCC_TL_UCP_ALLGATHER_ALG_RING,
diff --git a/src/components/tl/ucp/allgather/allgather_bruck.c b/src/components/tl/ucp/allgather/allgather_bruck.c
index 6ecafba50b..a1cb01cfc1 100644
--- a/src/components/tl/ucp/allgather/allgather_bruck.c
+++ b/src/components/tl/ucp/allgather/allgather_bruck.c
@@ -240,6 +240,7 @@ ucc_status_t ucc_tl_ucp_allgather_bruck_start(ucc_coll_task_t *coll_task)
     /* initial step: copy data on non root ranks to the beginning of buffer */
     uint32_t USE_CUDA = UCC_TL_UCP_TEAM_LIB(team)->cfg.allgather_use_cuda;
+
     if (!UCC_IS_INPLACE(TASK_ARGS(task))) {
         // not inplace: copy chunk from source buff to beginning of receive
         if(USE_CUDA){
@@ -254,6 +255,7 @@ ucc_status_t ucc_tl_ucp_allgather_bruck_start(ucc_coll_task_t *coll_task)
             }
         }
     } else if (trank != 0) {
+        printf(" inplace\n");
         // inplace: copy chunk to the begin
         if(USE_CUDA){
             status = ucc_mc_memcpy(rbuf, PTR_OFFSET(rbuf, data_size * trank),
diff --git a/src/components/tl/ucp/allgather/allgather_knomial.c b/src/components/tl/ucp/allgather/allgather_knomial.c
index e3a296ef9e..5ae41af193 100644
--- a/src/components/tl/ucp/allgather/allgather_knomial.c
+++ b/src/components/tl/ucp/allgather/allgather_knomial.c
@@ -205,9 +205,7 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_start(ucc_coll_task_t *coll_task)
 
     uint32_t USE_CUDA = UCC_TL_UCP_TEAM_LIB(team)->cfg.allgather_use_cuda;
-    if(rank==0){
-        printf("knomial, rank0 start\n");
-    }
+
     UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_kn_start", 0);
     ucc_tl_ucp_task_reset(task, UCC_INPROGRESS);
     task->allgather_kn.etask = NULL;
@@ -239,9 +237,9 @@
         } else {
             /* Loopback */
             UCPCHECK_GOTO(ucc_tl_ucp_send_nb(args->src.info.buffer,
                                              args->src.info.count * ucc_dt_size(args->src.info.datatype),
-                                             args->src.info.mem_type, rank, team, task),task, out2);
+                                             args->src.info.mem_type, rank, team, task),task, enqueue);
             UCPCHECK_GOTO(ucc_tl_ucp_recv_nb(PTR_OFFSET(args->dst.info.buffer, offset),
                                              args->src.info.count * ucc_dt_size(args->src.info.datatype),
-                                             args->dst.info.mem_type, rank, team, task),task, out2);
+                                             args->dst.info.mem_type, rank, team, task),task, enqueue);
         }
     }
@@ -256,7 +254,7 @@
                                       ucc_knomial_pattern_loop_rank(p, rank),
                                       p->radix, 0);
     }
-out2:
+enqueue:
     task->allgather_kn.sbuf = PTR_OFFSET(args->dst.info.buffer, offset);
     return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
 }
diff --git a/src/components/tl/ucp/allgather/allgather_neighbor.c b/src/components/tl/ucp/allgather/allgather_neighbor.c
index 846b6d250f..8a2a4636f0 100644
--- a/src/components/tl/ucp/allgather/allgather_neighbor.c
+++ b/src/components/tl/ucp/allgather/allgather_neighbor.c
@@ -145,27 +145,27 @@ ucc_status_t ucc_tl_ucp_allgather_neighbor_start(ucc_coll_task_t *coll_task)
     ucc_rank_t neighbor;
     void      *tmprecv, *tmpsend;
 
+    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_neighbor_start", 0);
     ucc_tl_ucp_task_reset(task, UCC_INPROGRESS);
 
     uint32_t USE_CUDA = UCC_TL_UCP_TEAM_LIB(team)->cfg.allgather_use_cuda;
-
+
     if (!UCC_IS_INPLACE(TASK_ARGS(task))) {
-        /*
-        status = ucc_mc_memcpy(PTR_OFFSET(rbuf, data_size * trank), sbuf,
+        if(!USE_CUDA){
+            status = ucc_mc_memcpy(PTR_OFFSET(rbuf, data_size * trank), sbuf,
                                data_size, rmem, smem);
-        if (ucc_unlikely(UCC_OK != status)) {
-            return status;
-        }
-        */
-        status = NEW_MEMCPY(USE_CUDA, PTR_OFFSET(rbuf, data_size * trank), sbuf, data_size, rmem, smem, trank, team, task);
-        if (ucc_unlikely(UCC_OK != status)) {
-            printf("error neighbor line 162\n");
-            return status;
+            if (ucc_unlikely(UCC_OK != status)) {
+                return status;
+            }
+        } else {
+            /* Loopback */
+            UCPCHECK_GOTO(ucc_tl_ucp_send_nb(sbuf, data_size, smem, trank, team, task),task, tmp);
+            UCPCHECK_GOTO(ucc_tl_ucp_recv_nb(PTR_OFFSET(rbuf, data_size * trank), data_size, rmem, trank, team, task),task, tmp);
         }
     }
-
+tmp:
     if (trank % 2) {
         neighbor = (trank - 1 + tsize) % tsize;
     } else {
diff --git a/src/components/tl/ucp/allgather/allgather_ring.c b/src/components/tl/ucp/allgather/allgather_ring.c
index b4e3436a20..97ac6d74d0 100644
--- a/src/components/tl/ucp/allgather/allgather_ring.c
+++ b/src/components/tl/ucp/allgather/allgather_ring.c
@@ -97,7 +97,6 @@ ucc_status_t ucc_tl_ucp_allgather_ring_start(ucc_coll_task_t *coll_task)
     if (!UCC_IS_INPLACE(TASK_ARGS(task))) {
         block = task->allgather_ring.get_send_block(&task->subset, trank, tsize, 0);
-
         if(USE_CUDA){
             status = ucc_mc_memcpy(PTR_OFFSET(rbuf, data_size * block), sbuf,
                                    data_size, rmem, smem);
diff --git a/src/components/tl/ucp/allgather/allgather_sparbit.c b/src/components/tl/ucp/allgather/allgather_sparbit.c
index caef51199d..508c839245 100644
--- a/src/components/tl/ucp/allgather/allgather_sparbit.c
+++ b/src/components/tl/ucp/allgather/allgather_sparbit.c
@@ -131,7 +131,14 @@ ucc_status_t ucc_tl_ucp_allgather_sparbit_start(ucc_coll_task_t *coll_task)
     task->allgather_sparbit.data_expected = 1;
 
     uint32_t USE_CUDA = UCC_TL_UCP_TEAM_LIB(team)->cfg.allgather_use_cuda;
-
+    if(trank == 0){
+        printf("\nin sparbit using: ");
+        if(USE_CUDA){
+            printf("cuda\n");
+        } else {
+            printf("loop\n");
+        }
+    }
     if (!UCC_IS_INPLACE(TASK_ARGS(task))) {
         if(USE_CUDA){
             status = ucc_mc_memcpy(PTR_OFFSET(rbuf, data_size * trank), sbuf,