diff --git a/src/Makefile.am b/src/Makefile.am index 85496f83dd..47489c60b6 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -91,6 +91,7 @@ noinst_HEADERS = \ coll_patterns/recursive_knomial.h \ coll_patterns/sra_knomial.h \ coll_patterns/bruck_alltoall.h \ + coll_patterns/two_tree.h \ components/topo/ucc_topo.h \ components/topo/ucc_sbgp.h diff --git a/src/coll_patterns/two_tree.h b/src/coll_patterns/two_tree.h new file mode 100644 index 0000000000..97ad88f1eb --- /dev/null +++ b/src/coll_patterns/two_tree.h @@ -0,0 +1,210 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. + */ + +#ifndef TWO_TREE_H_ +#define TWO_TREE_H_ + +enum { + LEFT_CHILD, + RIGHT_CHILD +}; + +typedef struct ucc_dbt_single_tree { + ucc_rank_t rank; + ucc_rank_t size; + ucc_rank_t root; + ucc_rank_t parent; + ucc_rank_t children[2]; + int height; + int recv; +} ucc_dbt_single_tree_t; + +static inline ucc_rank_t get_root(ucc_rank_t size) +{ + ucc_rank_t r = 1; + + while (r <= size) { + r *= 2; + } + return r/2 - 1; +} + +static inline int get_height(ucc_rank_t rank) +{ + int h = 1; + + if (rank % 2 == 0) { + return 0; + } + + rank++; + while ((rank & (1 << h)) == 0) { + h++; + } + return h; +} + +static inline ucc_rank_t get_left_child(ucc_rank_t rank, int height) +{ + ucc_rank_t sub_height; + + if (height == 0) { + return -1; + } + + sub_height = 1 << (height - 1); + return rank - sub_height; +} + +static inline ucc_rank_t get_right_child(ucc_rank_t size, ucc_rank_t rank, + int height, ucc_rank_t root) +{ + ucc_rank_t sub_right_root, sub_height; + + if (rank == size - 1 || height == 0) { + return -1; + } + + sub_right_root = get_root(size - rank - 1) + 1; + sub_height = 1 << (height - 1); + + if (rank == root) { + return rank + sub_right_root; + } + return (rank + sub_height < size) ? rank + sub_height + : rank + sub_right_root; +} + +static inline void get_children(ucc_rank_t size, ucc_rank_t rank, int height, + ucc_rank_t root, ucc_rank_t *l_c, + ucc_rank_t *r_c) +{ + *l_c = get_left_child(rank, height); + *r_c = get_right_child(size, rank, height, root); +} + +static inline int get_parent(int vsize, int vrank, int height, int troot) +{ + if (vrank == troot) { + return -1; + } else if (height == 0) { + return ((((vrank/2) % 2 == 0) && (vrank + 1 != vsize))) ? vrank + 1 + : vrank - 1; + } else { + vrank++; + if ((((1<<(height+1)) & vrank) > 0) || (vrank + (1< vsize) { + return vrank - (1<children[LEFT_CHILD], + &t1->children[RIGHT_CHILD]); + t1->height = height; + t1->parent = parent; + t1->size = size; + t1->rank = rank; + t1->root = root; + t1->recv = 0; +} + +static inline ucc_rank_t ucc_two_tree_convert_rank_for_shift(ucc_rank_t rank, + ucc_rank_t size) +{ + ucc_rank_t i; + for (i = 0; i < size; i++) { + if (rank == (i + 1) % size) { + break; + } + } + return i; +} + +static inline ucc_rank_t ucc_two_tree_convert_rank_for_mirror(ucc_rank_t rank, + ucc_rank_t size) +{ + ucc_rank_t i; + for (i = 0; i < size; i++) { + if (rank == size - 1 - i) { + break; + } + } + return i; +} + +static inline void ucc_two_tree_build_t2(ucc_rank_t rank, ucc_rank_t size, + ucc_dbt_single_tree_t *t2) { + ucc_rank_t temp_rank = (size % 2) ? + ucc_two_tree_convert_rank_for_shift(rank, size) : + ucc_two_tree_convert_rank_for_mirror(rank, size); + ucc_dbt_single_tree_t t1_temp; + + ucc_two_tree_build_t1(temp_rank, size, &t1_temp); + if (size % 2) { + ucc_two_tree_build_t2_shift(t1_temp, t2); + } else { + ucc_two_tree_build_t2_mirror(t1_temp, t2); + } +} + +static inline void ucc_two_tree_build_trees(ucc_rank_t rank, ucc_rank_t size, + ucc_dbt_single_tree_t *t1, + ucc_dbt_single_tree_t *t2) +{ + ucc_two_tree_build_t1(rank, size, t1); + ucc_two_tree_build_t2(rank, size, t2); +} + +#endif diff --git a/src/components/tl/ucp/Makefile.am b/src/components/tl/ucp/Makefile.am index cc0a118c60..78c2dfd157 100644 --- a/src/components/tl/ucp/Makefile.am +++ b/src/components/tl/ucp/Makefile.am @@ -49,7 +49,8 @@ bcast = \ bcast/bcast.h \ bcast/bcast.c \ bcast/bcast_knomial.c \ - bcast/bcast_sag_knomial.c + bcast/bcast_sag_knomial.c \ + bcast/bcast_two_tree.c fanin = \ fanin/fanin.h \ diff --git a/src/components/tl/ucp/bcast/bcast.c b/src/components/tl/ucp/bcast/bcast.c index 6a1d5b7720..071194ba11 100644 --- a/src/components/tl/ucp/bcast/bcast.c +++ b/src/components/tl/ucp/bcast/bcast.c @@ -19,6 +19,11 @@ ucc_base_coll_alg_info_t .name = "sag_knomial", .desc = "recursive knomial scatter followed by knomial " "allgather (optimized for BW)"}, + [UCC_TL_UCP_BCAST_ALG_TWO_TREE] = + {.id = UCC_TL_UCP_BCAST_ALG_TWO_TREE, + .name = "two_tree", + .desc = "bcast over double binary tree where a leaf in one tree " + "will be intermediate in other (optimized for latency)"}, [UCC_TL_UCP_BCAST_ALG_LAST] = { .id = 0, .name = NULL, .desc = NULL}}; @@ -36,8 +41,8 @@ ucc_status_t ucc_tl_ucp_bcast_init(ucc_tl_ucp_task_t *task) } ucc_status_t ucc_tl_ucp_bcast_knomial_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * team, - ucc_coll_task_t ** task_h) + ucc_base_team_t *team, + ucc_coll_task_t **task_h) { ucc_tl_ucp_task_t *task; ucc_status_t status; diff --git a/src/components/tl/ucp/bcast/bcast.h b/src/components/tl/ucp/bcast/bcast.h index 3ea567fb9c..da3f77a9cb 100644 --- a/src/components/tl/ucp/bcast/bcast.h +++ b/src/components/tl/ucp/bcast/bcast.h @@ -11,6 +11,7 @@ enum { UCC_TL_UCP_BCAST_ALG_KNOMIAL, UCC_TL_UCP_BCAST_ALG_SAG_KNOMIAL, + UCC_TL_UCP_BCAST_ALG_TWO_TREE, UCC_TL_UCP_BCAST_ALG_LAST }; @@ -47,4 +48,8 @@ ucc_status_t ucc_tl_ucp_bcast_sag_knomial_init(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task_h); +ucc_status_t ucc_tl_ucp_bcast_two_tree_init( + ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, + ucc_coll_task_t **task_h); + #endif diff --git a/src/components/tl/ucp/bcast/bcast_two_tree.c b/src/components/tl/ucp/bcast/bcast_two_tree.c new file mode 100644 index 0000000000..ea118f072d --- /dev/null +++ b/src/components/tl/ucp/bcast/bcast_two_tree.c @@ -0,0 +1,231 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. + */ + +#include "config.h" +#include "tl_ucp.h" +#include "bcast.h" +#include "core/ucc_progress_queue.h" +#include "tl_ucp_sendrecv.h" + +enum { + RECV, + SEND_T1, + SEND_T2, + TEST, +}; + +#define UCC_BCAST_TWO_TREE_CHECK_STATE(_p) \ + case _p: \ + goto _p; + +#define UCC_BCAST_TWO_TREE_GOTO_STATE(_state) \ + do { \ + switch (_state) { \ + UCC_BCAST_TWO_TREE_CHECK_STATE(SEND_T1); \ + UCC_BCAST_TWO_TREE_CHECK_STATE(SEND_T2); \ + UCC_BCAST_TWO_TREE_CHECK_STATE(TEST); \ + }; \ + } while (0) + +static void recv_completion_common(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, /* NOLINT */ + void *user_data) +{ + ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data; + if (ucc_unlikely(UCS_OK != status)) { + tl_error(UCC_TASK_LIB(task), "failure in recv completion %s", + ucs_status_string(status)); + task->super.status = ucs_status_to_ucc_status(status); + } + task->tagged.recv_completed++; + if (request) { + ucp_request_free(request); + } +} + +static void recv_completion_1(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, /* NOLINT */ + void *user_data) +{ + ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data; + + task->bcast_two_tree.t1.recv++; + recv_completion_common(request, status, info, user_data); +} + +static void recv_completion_2(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, /* NOLINT */ + void *user_data) +{ + ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data; + + task->bcast_two_tree.t2.recv++; + recv_completion_common(request, status, info, user_data); + +} + +void ucc_tl_ucp_bcast_two_tree_progress(ucc_coll_task_t *coll_task) +{ + ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, + ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_rank_t rank = UCC_TL_TEAM_RANK(team); + ucc_dbt_single_tree_t t1 = task->bcast_two_tree.t1; + ucc_dbt_single_tree_t t2 = task->bcast_two_tree.t2; + void *buffer = TASK_ARGS(task).src.info.buffer; + ucc_memory_type_t mtype = TASK_ARGS(task).src.info.mem_type; + ucc_datatype_t dt = TASK_ARGS(task).src.info.datatype; + size_t count = TASK_ARGS(task).src.info.count; + size_t data_size = count * ucc_dt_size(dt) / 2; + ucc_rank_t coll_root = (ucc_rank_t)TASK_ARGS(task).root; + ucp_tag_recv_nbx_callback_t cb[2] = {recv_completion_1, + recv_completion_2}; + uint32_t i; + + UCC_BCAST_TWO_TREE_GOTO_STATE(task->bcast_two_tree.state); + + if (rank != t1.root && rank != coll_root) { + UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(buffer, data_size, mtype, t1.parent, + team, task, cb[0], (void *)task), + task, out); + } + + if (rank != t2.root && rank != coll_root) { + UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(PTR_OFFSET(buffer, data_size), + data_size, mtype, t2.parent, team, + task, cb[1], (void *)task), + task, out); + } + task->bcast_two_tree.state = SEND_T1; + +SEND_T1: + if ((coll_root == rank) || (task->bcast_two_tree.t1.recv > 0)) { + for (i = 0; i < 2; i++) { + if (t1.children[i] != -1 && t1.children[i] != coll_root) { + UCPCHECK_GOTO(ucc_tl_ucp_send_nb(buffer, data_size, mtype, + t1.children[i], team, task), + task, out); + } + } + } else { + goto out; + } + task->bcast_two_tree.state = SEND_T2; + +SEND_T2: + if ((coll_root == rank) || (task->bcast_two_tree.t2.recv > 0)) { + for (i = 0; i < 2; i++) { + if (t2.children[i] != -1 && t2.children[i] != coll_root) { + UCPCHECK_GOTO(ucc_tl_ucp_send_nb(PTR_OFFSET(buffer, data_size), + data_size, mtype, + t2.children[i], team, task), + task, out); + } + } + } else { + goto out; + } + +TEST: + if (UCC_INPROGRESS == ucc_tl_ucp_test_send(task)) { + task->bcast_two_tree.state = TEST; + return; + } + + task->super.status = UCC_OK; + UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_bcast_tow_tree_done", 0); + +out: + return; +} + +ucc_status_t ucc_tl_ucp_bcast_two_tree_start(ucc_coll_task_t *coll_task) +{ + ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, + ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_status_t status = UCC_OK; + ucc_rank_t rank = UCC_TL_TEAM_RANK(team); + void *buffer = TASK_ARGS(task).src.info.buffer; + ucc_memory_type_t mtype = TASK_ARGS(task).src.info.mem_type; + ucc_datatype_t dt = TASK_ARGS(task).src.info.datatype; + size_t count = TASK_ARGS(task).src.info.count; + size_t data_size = count * ucc_dt_size(dt) / 2; + ucc_rank_t coll_root = (ucc_rank_t)TASK_ARGS(task).root; + ucc_rank_t t1_root = task->bcast_two_tree.t1.root; + ucc_rank_t t2_root = task->bcast_two_tree.t2.root; + ucp_tag_recv_nbx_callback_t cb[2] = {recv_completion_1, + recv_completion_2}; + + task->bcast_two_tree.t1.recv = 0; + task->bcast_two_tree.t2.recv = 0; + + if (rank == coll_root && coll_root != t1_root) { + status = ucc_tl_ucp_send_nb(buffer, data_size, mtype, t1_root, team, + task); + if (UCC_OK != status) { + return status; + } + } + + if (rank == coll_root && coll_root != t2_root) { + status = ucc_tl_ucp_send_nb(PTR_OFFSET(buffer, data_size), data_size, + mtype, t2_root, team, task); + if (UCC_OK != status) { + return status; + } + } + + if (rank != coll_root && rank == t1_root) { + status = ucc_tl_ucp_recv_cb(buffer, data_size, mtype, coll_root, team, + task, cb[0], (void *)task); + if (UCC_OK != status) { + return status; + } + } + + if (rank != coll_root && rank == t2_root) { + status = ucc_tl_ucp_recv_cb(PTR_OFFSET(buffer, data_size), data_size, + mtype, coll_root, team, task, cb[1], + (void *)task); + if (UCC_OK != status) { + return status; + } + } + + task->bcast_two_tree.state = RECV; + UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_bcast_two_tree_start", 0); + ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); + + return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); +} + +ucc_status_t ucc_tl_ucp_bcast_two_tree_finalize(ucc_coll_task_t *coll_task) +{ + return ucc_tl_ucp_coll_finalize(coll_task); +} + +ucc_status_t ucc_tl_ucp_bcast_two_tree_init( + ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_team_t *tl_team; + ucc_tl_ucp_task_t *task; + ucc_rank_t rank, size; + + task = ucc_tl_ucp_init_task(coll_args, team); + task->super.post = ucc_tl_ucp_bcast_two_tree_start; + task->super.progress = ucc_tl_ucp_bcast_two_tree_progress; + task->super.finalize = ucc_tl_ucp_bcast_two_tree_finalize; + tl_team = TASK_TEAM(task); + rank = UCC_TL_TEAM_RANK(tl_team); + size = UCC_TL_TEAM_SIZE(tl_team); + ucc_two_tree_build_trees(rank, size, &task->bcast_two_tree.t1, + &task->bcast_two_tree.t2); + + *task_h = &task->super; + return UCC_OK; +} diff --git a/src/components/tl/ucp/tl_ucp_coll.c b/src/components/tl/ucp/tl_ucp_coll.c index bbbac03fc7..d477f5ae59 100644 --- a/src/components/tl/ucp/tl_ucp_coll.c +++ b/src/components/tl/ucp/tl_ucp_coll.c @@ -281,6 +281,9 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, case UCC_TL_UCP_BCAST_ALG_SAG_KNOMIAL: *init = ucc_tl_ucp_bcast_sag_knomial_init; break; + case UCC_TL_UCP_BCAST_ALG_TWO_TREE: + *init = ucc_tl_ucp_bcast_two_tree_init; + break; default: status = UCC_ERR_INVALID_PARAM; break; diff --git a/src/components/tl/ucp/tl_ucp_coll.h b/src/components/tl/ucp/tl_ucp_coll.h index c2a260b103..05fd4ff24d 100644 --- a/src/components/tl/ucp/tl_ucp_coll.h +++ b/src/components/tl/ucp/tl_ucp_coll.h @@ -11,6 +11,7 @@ #include "tl_ucp.h" #include "schedule/ucc_schedule_pipelined.h" #include "coll_patterns/recursive_knomial.h" +#include "coll_patterns/two_tree.h" #include "components/mc/base/ucc_mc_base.h" #include "components/ec/ucc_ec.h" #include "tl_ucp_tag.h" @@ -183,6 +184,11 @@ typedef struct ucc_tl_ucp_task { ucc_rank_t dist; uint32_t radix; } bcast_kn; + struct { + ucc_dbt_single_tree_t t1; + ucc_dbt_single_tree_t t2; + int state; + } bcast_two_tree; struct { ucc_rank_t dist; ucc_rank_t max_dist; @@ -194,6 +200,11 @@ typedef struct ucc_tl_ucp_task { ucc_ee_executor_task_t *etask; ucc_ee_executor_t *executor; } reduce_kn; + struct { + ucc_dbt_single_tree_t t1; + ucc_dbt_single_tree_t t2; + int state; + } reduce_two_tree; struct { ucc_rank_t dist; ucc_rank_t max_dist; @@ -364,6 +375,9 @@ static inline ucc_status_t ucc_tl_ucp_test(ucc_tl_ucp_task_t *task) #define UCC_TL_UCP_TASK_RECV_COMPLETE(_task) \ (((_task)->tagged.recv_posted == (_task)->tagged.recv_completed)) +#define UCC_TL_UCP_TASK_SEND_COMPLETE(_task) \ + (((_task)->tagged.send_posted == (_task)->tagged.send_completed)) + static inline ucc_status_t ucc_tl_ucp_test_recv(ucc_tl_ucp_task_t *task) { int polls = 0; @@ -380,6 +394,22 @@ static inline ucc_status_t ucc_tl_ucp_test_recv(ucc_tl_ucp_task_t *task) return UCC_INPROGRESS; } +static inline ucc_status_t ucc_tl_ucp_test_send(ucc_tl_ucp_task_t *task) +{ + int polls = 0; + + if (UCC_TL_UCP_TASK_SEND_COMPLETE(task)) { + return UCC_OK; + } + while (polls++ < task->n_polls) { + if (UCC_TL_UCP_TASK_SEND_COMPLETE(task)) { + return UCC_OK; + } + ucp_worker_progress(UCC_TL_UCP_TASK_TEAM(task)->worker->ucp_worker); + } + return UCC_INPROGRESS; +} + #define UCC_TL_UCP_TASK_RING_P2P_COMPLETE(_task) \ ((((_task)->tagged.send_posted - (_task)->tagged.send_completed) <= 1) && \ ((_task)->tagged.recv_posted == (_task)->tagged.recv_completed)) diff --git a/test/gtest/coll/test_bcast.cc b/test/gtest/coll/test_bcast.cc index ace5f50a9b..b66e416517 100644 --- a/test/gtest/coll/test_bcast.cc +++ b/test/gtest/coll/test_bcast.cc @@ -280,3 +280,77 @@ UCC_TEST_F(test_bcast_alg, 2step) { } } } + +UCC_TEST_F(test_bcast_alg, two_tree_odd_shift) { + int n_procs = 15; + ucc_job_env_t env = {{"UCC_TL_UCP_TUNE", "bcast:@two_tree:0-inf:inf"}, + {"UCC_CLS", "basic"}}; + UccJob job(n_procs, UccJob::UCC_JOB_CTX_GLOBAL, env); + UccTeam_h team = job.create_team(n_procs); + int repeat = 1; + UccCollCtxVec ctxs; + std::vector mt = {UCC_MEMORY_TYPE_HOST}; + + if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA)) { + mt.push_back(UCC_MEMORY_TYPE_CUDA); + } + if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA_MANAGED)) { + mt.push_back(UCC_MEMORY_TYPE_CUDA_MANAGED); + } + + for (auto count : {8, 65536}) { + for (int root = 0; root < n_procs; root++) { + for (auto m : mt) { + this->set_root(root); + SET_MEM_TYPE(m); + this->data_init(n_procs, UCC_DT_INT8, count, ctxs, false); + UccReq req(team, ctxs); + + for (auto i = 0; i < repeat; i++) { + req.start(); + req.wait(); + EXPECT_EQ(true, this->data_validate(ctxs)); + this->reset(ctxs); + } + this->data_fini(ctxs); + } + } + } +} + +UCC_TEST_F(test_bcast_alg, two_tree_even_mirror) { + int n_procs = 16; + ucc_job_env_t env = {{"UCC_TL_UCP_TUNE", "bcast:@two_tree:0-inf:inf"}, + {"UCC_CLS", "basic"}}; + UccJob job(n_procs, UccJob::UCC_JOB_CTX_GLOBAL, env); + UccTeam_h team = job.create_team(n_procs); + int repeat = 1; + UccCollCtxVec ctxs; + std::vector mt = {UCC_MEMORY_TYPE_HOST}; + + if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA)) { + mt.push_back(UCC_MEMORY_TYPE_CUDA); + } + if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA_MANAGED)) { + mt.push_back(UCC_MEMORY_TYPE_CUDA_MANAGED); + } + + for (auto count : {8, 65536}) { + for (int root = 0; root < n_procs; root++) { + for (auto m : mt) { + this->set_root(root); + SET_MEM_TYPE(m); + this->data_init(n_procs, UCC_DT_INT8, count, ctxs, false); + UccReq req(team, ctxs); + + for (auto i = 0; i < repeat; i++) { + req.start(); + req.wait(); + EXPECT_EQ(true, this->data_validate(ctxs)); + this->reset(ctxs); + } + this->data_fini(ctxs); + } + } + } +}