diff --git a/.azure/azure-pipelines-pr.yml b/.azure/azure-pipelines-pr.yml index 431012825e..ab09b644f6 100644 --- a/.azure/azure-pipelines-pr.yml +++ b/.azure/azure-pipelines-pr.yml @@ -96,7 +96,7 @@ stages: --prefix=$(Build.Repository.LocalPath)/install --enable-gtest make -j install displayName: Build ucc artifact - timeoutInMinutes: 50 + timeoutInMinutes: 60 - bash: | cd build make gtest diff --git a/src/coll_patterns/recursive_knomial.h b/src/coll_patterns/recursive_knomial.h index 78da2a9c82..c165d08e2c 100644 --- a/src/coll_patterns/recursive_knomial.h +++ b/src/coll_patterns/recursive_knomial.h @@ -211,6 +211,25 @@ ucc_knomial_pattern_get_min_radix(ucc_kn_radix_t cfg_radix, return radix; } +/* Calculates for each rank at which distance it should recieve */ +static inline ucc_rank_t +ucc_knomial_calc_recv_dist(ucc_rank_t team_size, ucc_rank_t rank, + ucc_rank_t radix, ucc_rank_t root) +{ + if (rank == root) { + return 0; + } + ucc_rank_t root_base = 0 ; + ucc_rank_t dist = 1; + while (dist <= team_size) { + if (rank < root_base + radix * dist) { + break; + } + dist *= radix; + } + return dist; +} + /* A set of convenience macros used to implement sw based progress of the algorithms that use kn pattern */ enum { diff --git a/src/coll_patterns/sra_knomial.h b/src/coll_patterns/sra_knomial.h index 173048d1f2..ca3a22a215 100644 --- a/src/coll_patterns/sra_knomial.h +++ b/src/coll_patterns/sra_knomial.h @@ -90,45 +90,27 @@ ucc_sra_kn_get_offset_and_seglen(size_t count, size_t dt_size, ucc_rank_t rank, ptrdiff_t *offset, size_t *seglen) { ptrdiff_t _offset = 0; - size_t block_count = count; - ucc_rank_t step_radix = 0; size_t my_seg_len = 0; - ucc_rank_t k, r, peer, my_si; + ucc_rank_t my_si, step_radix; size_t my_seg_offset; ucc_knomial_pattern_t p; ucc_knomial_pattern_init(size, rank, radix, &p); - if (KN_NODE_EXTRA == p.node_type) { - if (offset) - *offset = 0; - if (seglen) - *seglen = count; - return; + goto out; } + while (!ucc_knomial_pattern_loop_done(&p)) { - r = 0; - for (k = 1; k < p.radix; k++) { - peer = ucc_knomial_pattern_get_loop_peer(&p, rank, k); - if (peer == UCC_KN_PEER_NULL) - continue; - r++; - } - step_radix = r + 1; - my_si = ucc_kn_compute_seg_index(rank, p.radix_pow, &p); - my_seg_offset = - ucc_sra_kn_compute_seg_offset(block_count, step_radix, my_si); + step_radix = ucc_kn_compute_step_radix(&p); + my_si = ucc_kn_compute_seg_index(rank, p.radix_pow, &p); + my_seg_offset = ucc_sra_kn_compute_seg_offset(count, step_radix, my_si); + count = ucc_sra_kn_compute_seg_size(count, step_radix, my_si); _offset += my_seg_offset * dt_size; - if (!ucc_knomial_pattern_loop_last_iteration(&p)) { - block_count = - ucc_sra_kn_compute_seg_size(block_count, step_radix, my_si); - } ucc_knomial_pattern_next_iteration(&p); } - if (step_radix) { - my_seg_len = - ucc_sra_kn_compute_seg_size(block_count, step_radix, my_si); - } + my_seg_len = count; + +out: if (offset) *offset = _offset; if (seglen) diff --git a/src/components/tl/ucp/allgather/allgather_knomial.c b/src/components/tl/ucp/allgather/allgather_knomial.c index d6b57cc6eb..e7bcf014f6 100644 --- a/src/components/tl/ucp/allgather/allgather_knomial.c +++ b/src/components/tl/ucp/allgather/allgather_knomial.c @@ -30,6 +30,7 @@ * a new virtual rank number - "vrank". * As such allgather must keep to this ranking to be aligned with scatter. */ + void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) { ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, @@ -51,7 +52,7 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) size_t local = GET_LOCAL_COUNT(args, size, rank); void *sbuf; ptrdiff_t peer_seg_offset, local_seg_offset; - ucc_rank_t peer; + ucc_rank_t peer, peer_dist; ucc_kn_radix_t loop_step; size_t peer_seg_count, local_seg_count; ucc_status_t status; @@ -103,6 +104,13 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) peer = ucc_knomial_pattern_get_loop_peer(p, rank, loop_step); if (peer == UCC_KN_PEER_NULL) continue; + if (coll_task->bargs.args.coll_type == UCC_COLL_TYPE_BCAST) { + peer_dist = ucc_knomial_calc_recv_dist(size - p->n_extra, + ucc_knomial_pattern_loop_rank(p, peer), p->radix, 0); + if (peer_dist < task->allgather_kn.recv_dist) { + continue; + } + } UCPCHECK_GOTO(ucc_tl_ucp_send_nb(sbuf, local_seg_count * dt_size, mem_type, @@ -118,6 +126,13 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) ucc_kn_ag_pattern_peer_seg(peer, p, &peer_seg_count, &peer_seg_offset); + if (coll_task->bargs.args.coll_type == UCC_COLL_TYPE_BCAST) { + peer_dist = ucc_knomial_calc_recv_dist(size - p->n_extra, + ucc_knomial_pattern_loop_rank(p, peer), p->radix, 0); + if (peer_dist > task->allgather_kn.recv_dist) { + continue; + } + } UCPCHECK_GOTO( ucc_tl_ucp_recv_nb(PTR_OFFSET(rbuf, peer_seg_offset * dt_size), peer_seg_count * dt_size, mem_type, @@ -125,7 +140,7 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) task, out); } UCC_KN_PHASE_LOOP: - if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) { + if (UCC_INPROGRESS == ucc_tl_ucp_test_recv(task)) { SAVE_STATE(UCC_KN_PHASE_LOOP); return; } @@ -138,8 +153,6 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) mem_type, INV_VRANK(peer, broot, size), team, task), task, out); - } else { - goto out; } UCC_KN_PHASE_PROXY: if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) { @@ -148,19 +161,21 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) } out: + ucc_assert(UCC_TL_UCP_TASK_P2P_COMPLETE(task)); task->super.status = UCC_OK; UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_kn_done", 0); } ucc_status_t ucc_tl_ucp_allgather_knomial_start(ucc_coll_task_t *coll_task) { - ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t); - ucc_coll_args_t *args = &TASK_ARGS(task); - ucc_tl_ucp_team_t *team = TASK_TEAM(task); - ucc_coll_type_t ct = args->coll_type; - ucc_rank_t size = UCC_TL_TEAM_SIZE(team); - ucc_kn_radix_t radix = task->allgather_kn.p.radix; - ucc_rank_t rank = VRANK(UCC_TL_TEAM_RANK(team), + ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t); + ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_coll_type_t ct = args->coll_type; + ucc_rank_t size = UCC_TL_TEAM_SIZE(team); + ucc_kn_radix_t radix = task->allgather_kn.p.radix; + ucc_knomial_pattern_t *p = &task->allgather_kn.p; + ucc_rank_t rank = VRANK(UCC_TL_TEAM_RANK(team), ct == UCC_COLL_TYPE_BCAST ? args->root : 0, size); ucc_status_t status; @@ -201,6 +216,10 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_start(ucc_coll_task_t *coll_task) offset = ucc_sra_kn_get_offset(args->dst.info.count, ucc_dt_size(args->dst.info.datatype), rank, size, radix); + task->allgather_kn.recv_dist = ucc_knomial_calc_recv_dist( + size - p->n_extra, + ucc_knomial_pattern_loop_rank(p, rank), + p->radix, 0); } task->allgather_kn.sbuf = PTR_OFFSET(args->dst.info.buffer, offset); diff --git a/src/components/tl/ucp/bcast/bcast.h b/src/components/tl/ucp/bcast/bcast.h index 97a6e08349..3ea567fb9c 100644 --- a/src/components/tl/ucp/bcast/bcast.h +++ b/src/components/tl/ucp/bcast/bcast.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -17,8 +17,9 @@ enum { extern ucc_base_coll_alg_info_t ucc_tl_ucp_bcast_algs[UCC_TL_UCP_BCAST_ALG_LAST + 1]; +/* SAG bcast supports team size 2, but Knomial is always better in this case */ #define UCC_TL_UCP_BCAST_DEFAULT_ALG_SELECT_STR \ - "bcast:0-32k:@0#bcast:32k-inf:@1" + "bcast:0-inf:[2-2]:@0#bcast:0-32k:[3-inf]:@0#bcast:32k-inf:[3-inf]:@1" static inline int ucc_tl_ucp_bcast_alg_from_str(const char *str) { diff --git a/src/components/tl/ucp/bcast/bcast_knomial.c b/src/components/tl/ucp/bcast/bcast_knomial.c index 0e30a61aae..fec7f83914 100644 --- a/src/components/tl/ucp/bcast/bcast_knomial.c +++ b/src/components/tl/ucp/bcast/bcast_knomial.c @@ -57,11 +57,15 @@ void ucc_tl_ucp_bcast_knomial_progress(ucc_coll_task_t *coll_task) } } dist /= radix; - if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) { + if (UCC_INPROGRESS == ucc_tl_ucp_test_recv(task)) { task->bcast_kn.dist = dist; return; } } + if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) { + task->bcast_kn.dist = dist; + return; + } ucc_assert(UCC_TL_UCP_TASK_P2P_COMPLETE(task)); task->super.status = UCC_OK; diff --git a/src/components/tl/ucp/scatter/scatter_knomial.c b/src/components/tl/ucp/scatter/scatter_knomial.c index 4fe8eb65bf..f406c02c6e 100644 --- a/src/components/tl/ucp/scatter/scatter_knomial.c +++ b/src/components/tl/ucp/scatter/scatter_knomial.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -14,48 +14,23 @@ #include "utils/ucc_math.h" #include "utils/ucc_coll_utils.h" -#define SAVE_STATE(_phase) \ - do { \ - task->scatter_kn.phase = _phase; \ +#define SAVE_STATE(_phase) \ + do { \ + task->scatter_kn.phase = _phase; \ } while (0) -#define GET_BASE_PEER(_radix, _rank, _dist, _peer) \ - do { \ - _peer = _rank - ((_rank / _dist) % _radix) * _dist; \ -} while (0) - enum { UCC_SCATTER_KN_PHASE_INIT, UCC_SCATTER_KN_PHASE_LOOP, /* main loop of recursive k-ing */ }; -/* Calculates for each rank at which distance it should recieve */ -ucc_rank_t calc_recv_dist(ucc_rank_t team_size, ucc_rank_t rank, - ucc_rank_t radix, ucc_rank_t root) -{ - if (rank == root) { - return 0; - } - ucc_rank_t root_base; - ucc_rank_t dist = 1; - GET_BASE_PEER(radix, root, dist, root_base); - while (dist <= team_size) { - if (rank >= root_base && rank < root_base + radix * dist) { - break; - } - dist *= radix; - GET_BASE_PEER(radix, root_base, dist, root_base); - } - return dist; -} - void ucc_tl_ucp_scatter_knomial_progress(ucc_coll_task_t *coll_task) { - ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t); - ucc_coll_args_t *args = &TASK_ARGS(task); - ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, + ucc_tl_ucp_task_t); + ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); ucc_kn_radix_t radix = task->scatter_kn.p.radix; - uint8_t node_type = task->scatter_kn.p.node_type; ucc_knomial_pattern_t *p = &task->scatter_kn.p; void *rbuf = args->dst.info.buffer; ucc_memory_type_t mem_type = args->src.info.mem_type; @@ -80,7 +55,7 @@ void ucc_tl_ucp_scatter_knomial_progress(ucc_coll_task_t *coll_task) goto UCC_SCATTER_KN_PHASE_LOOP; } - if (KN_NODE_EXTRA == node_type) { + if (KN_NODE_EXTRA == p->node_type) { goto out; } @@ -101,16 +76,17 @@ void ucc_tl_ucp_scatter_knomial_progress(ucc_coll_task_t *coll_task) Receive will only occur in the following iteration to that of it's parent's send. */ - if (rank != root && task->scatter_kn.recv_dist == p->radix_pow && - task->tagged.recv_posted == 0) { + if ((rank != root) && (task->scatter_kn.recv_dist == p->radix_pow)) { + ucc_assert(task->tagged.recv_posted == 0); for (loop_step = 1; loop_step < radix; loop_step++) { peer = ucc_knomial_pattern_get_loop_peer(p, rank, loop_step); if (peer == UCC_KN_PEER_NULL) continue; vpeer = ucc_knomial_pattern_loop_rank(p, peer); vroot = ucc_knomial_pattern_loop_rank(p, root); - peer_recv_dist = - calc_recv_dist(team_size, vpeer, radix, vroot); + peer_recv_dist = ucc_knomial_calc_recv_dist(team_size, vpeer, + radix, vroot); + task->scatter_kn.recv_size = local_seg_count * dt_size; if (peer_recv_dist < task->scatter_kn.recv_dist) { UCPCHECK_GOTO(ucc_tl_ucp_recv_nb(rbuf, local_seg_count * dt_size, mem_type, @@ -128,9 +104,8 @@ void ucc_tl_ucp_scatter_knomial_progress(ucc_coll_task_t *coll_task) Each rank's send (besides leaf ranks) happens only after it's recieve from previous iteration has completed. */ - if (root == rank || - (task->tagged.recv_posted > 0 && - task->tagged.recv_posted == task->tagged.recv_completed)) { + if ((root == rank) || (task->tagged.recv_posted > 0)) { + ucc_assert(UCC_TL_UCP_TASK_RECV_COMPLETE(task)); for (loop_step = 1; loop_step < radix; loop_step++) { peer = ucc_knomial_pattern_get_loop_peer(p, rank, loop_step); if (peer == UCC_KN_PEER_NULL) @@ -146,6 +121,7 @@ void ucc_tl_ucp_scatter_knomial_progress(ucc_coll_task_t *coll_task) peer_seg_count * dt_size, mem_type, INV_VRANK(peer, (ucc_rank_t)args->root, size), team, task), task, out); } + /*TODO: local_seg_index is always zero since rank that sends is base root? */ local_seg_index = ucc_kn_compute_seg_index(rank, p->radix_pow, p); offset = ucc_sra_kn_compute_seg_offset( @@ -165,8 +141,8 @@ void ucc_tl_ucp_scatter_knomial_progress(ucc_coll_task_t *coll_task) &offset, &local_seg_count); if (offset != 0) { status = ucc_mc_memcpy(PTR_OFFSET(args->dst.info.buffer, offset), - PTR_OFFSET(rbuf, task->scatter_kn.send_offset), - local_seg_count * dt_size, mem_type, mem_type); + rbuf, task->scatter_kn.recv_size, mem_type, + mem_type); if (ucc_unlikely(UCC_OK != status)) { task->super.status = status; return; @@ -195,15 +171,15 @@ ucc_status_t ucc_tl_ucp_scatter_knomial_start(ucc_coll_task_t *coll_task) task->scatter_kn.phase = UCC_SCATTER_KN_PHASE_INIT; vroot = ucc_knomial_pattern_loop_rank(p, VRANK(root, root, size)); vrank = ucc_knomial_pattern_loop_rank(p, VRANK(rank, root, size)); - task->scatter_kn.recv_dist = calc_recv_dist(size - p->n_extra, vrank, - p->radix, vroot); + task->scatter_kn.recv_dist = ucc_knomial_calc_recv_dist(size - p->n_extra, + vrank, p->radix, + vroot); task->scatter_kn.send_offset = 0; return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); } -ucc_status_t -ucc_tl_ucp_scatter_knomial_finalize(ucc_coll_task_t *coll_task) +ucc_status_t ucc_tl_ucp_scatter_knomial_finalize(ucc_coll_task_t *coll_task) { return ucc_tl_ucp_coll_finalize(coll_task); } @@ -212,29 +188,24 @@ ucc_status_t ucc_tl_ucp_scatter_knomial_init_r( ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task_h, ucc_kn_radix_t radix) { - ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); - ucc_rank_t size = UCC_TL_TEAM_SIZE(tl_team); - ucc_rank_t rank = UCC_TL_TEAM_RANK(tl_team); ucc_tl_ucp_task_t *task; - task = ucc_tl_ucp_init_task(coll_args, team); - task->super.post = ucc_tl_ucp_scatter_knomial_start; - task->super.progress = ucc_tl_ucp_scatter_knomial_progress; - task->super.finalize = ucc_tl_ucp_scatter_knomial_finalize; - ucc_assert(coll_args->args.src.info.mem_type == coll_args->args.dst.info.mem_type); - ucc_knomial_pattern_init(size, rank, radix, &task->scatter_kn.p); + task = ucc_tl_ucp_init_task(coll_args, team); + task->super.post = ucc_tl_ucp_scatter_knomial_start; + task->super.progress = ucc_tl_ucp_scatter_knomial_progress; + task->super.finalize = ucc_tl_ucp_scatter_knomial_finalize; + task->scatter_kn.p.radix = radix; *task_h = &task->super; return UCC_OK; } -ucc_status_t -ucc_tl_ucp_scatter_knomial_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * team, - ucc_coll_task_t ** task_h) +ucc_status_t ucc_tl_ucp_scatter_knomial_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) { ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); size_t count = coll_args->args.src.info.count; diff --git a/src/components/tl/ucp/tl_ucp_coll.h b/src/components/tl/ucp/tl_ucp_coll.h index fa67b90bfa..242248256a 100644 --- a/src/components/tl/ucp/tl_ucp_coll.h +++ b/src/components/tl/ucp/tl_ucp_coll.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) Meta Platforms, Inc. and affiliates. 2022. * * See file LICENSE for terms. @@ -136,12 +136,14 @@ typedef struct ucc_tl_ucp_task { ucc_knomial_pattern_t p; ucc_rank_t recv_dist; ptrdiff_t send_offset; + size_t recv_size; } scatter_kn; struct { int phase; ucc_knomial_pattern_t p; void *sbuf; ucc_ee_executor_task_t *etask; + ucc_rank_t recv_dist; } allgather_kn; struct { ucc_rank_t dist; @@ -319,6 +321,25 @@ static inline ucc_status_t ucc_tl_ucp_test(ucc_tl_ucp_task_t *task) return UCC_INPROGRESS; } +#define UCC_TL_UCP_TASK_RECV_COMPLETE(_task) \ + (((_task)->tagged.recv_posted == (_task)->tagged.recv_completed)) + +static inline ucc_status_t ucc_tl_ucp_test_recv(ucc_tl_ucp_task_t *task) +{ + int polls = 0; + + if (UCC_TL_UCP_TASK_RECV_COMPLETE(task)) { + return UCC_OK; + } + while (polls++ < task->n_polls) { + if (UCC_TL_UCP_TASK_RECV_COMPLETE(task)) { + return UCC_OK; + } + ucp_worker_progress(UCC_TL_UCP_TASK_TEAM(task)->worker->ucp_worker); + } + return UCC_INPROGRESS; +} + #define UCC_TL_UCP_TASK_RING_P2P_COMPLETE(_task) \ ((((_task)->tagged.send_posted - (_task)->tagged.send_completed) <= 1) && \ ((_task)->tagged.recv_posted == (_task)->tagged.recv_completed))