Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
TL/UCP: reduce dbt
Browse files Browse the repository at this point in the history
shimmybalsam committed Oct 3, 2023
1 parent 14dc0b9 commit 470ee92
Showing 7 changed files with 456 additions and 15 deletions.
46 changes: 33 additions & 13 deletions src/coll_patterns/double_binary_tree.h
Original file line number Diff line number Diff line change
@@ -13,13 +13,14 @@ enum {
};

typedef struct ucc_dbt_single_tree {
ucc_rank_t rank;
ucc_rank_t size;
ucc_rank_t root;
ucc_rank_t parent;
ucc_rank_t children[2];
int height;
int recv;
ucc_rank_t rank;
ucc_rank_t size;
ucc_rank_t root;
ucc_rank_t parent;
ucc_rank_t children[2];
int n_children;
int height;
int recv;
} ucc_dbt_single_tree_t;

static inline ucc_rank_t get_root(ucc_rank_t size)
@@ -86,6 +87,19 @@ static inline void get_children(ucc_rank_t size, ucc_rank_t rank, int height,
*r_c = get_right_child(size, rank, height, root);
}

/* Returns how many of the two child slots hold a real rank; a slot equal
 * to -1 marks a missing child. The result is 0, 1 or 2. */
static inline int get_n_children(ucc_rank_t l_c, ucc_rank_t r_c)
{
    int has_left  = (l_c != -1) ? 1 : 0;
    int has_right = (r_c != -1) ? 1 : 0;

    return has_left + has_right;
}

static inline int get_parent(int vsize, int vrank, int height, int troot)
{
if (vrank == troot) {
@@ -118,6 +132,8 @@ static inline void ucc_dbt_build_t2_mirror(ucc_dbt_single_tree_t t1,
size - 1 - t1.children[RIGHT_CHILD];
t.children[RIGHT_CHILD] = (t1.children[LEFT_CHILD] == -1) ? -1 :
size - 1 - t1.children[LEFT_CHILD];
t.n_children = get_n_children(t.children[LEFT_CHILD],
t.children[RIGHT_CHILD]);
t.recv = 0;

*t2 = t;
@@ -138,6 +154,8 @@ static inline void ucc_dbt_build_t2_shift(ucc_dbt_single_tree_t t1,
(t1.children[LEFT_CHILD] + 1) % size;
t.children[RIGHT_CHILD] = (t1.children[RIGHT_CHILD] == -1) ? -1 :
(t1.children[RIGHT_CHILD] + 1) % size;
t.n_children = get_n_children(t.children[LEFT_CHILD],
t.children[RIGHT_CHILD]);
t.recv = 0;

*t2 = t;
@@ -152,12 +170,14 @@ static inline void ucc_dbt_build_t1(ucc_rank_t rank, ucc_rank_t size,

get_children(size, rank, height, root, &t1->children[LEFT_CHILD],
&t1->children[RIGHT_CHILD]);
t1->height = height;
t1->parent = parent;
t1->size = size;
t1->rank = rank;
t1->root = root;
t1->recv = 0;
t1->n_children = get_n_children(t1->children[LEFT_CHILD],
t1->children[RIGHT_CHILD]);
t1->height = height;
t1->parent = parent;
t1->size = size;
t1->rank = rank;
t1->root = root;
t1->recv = 0;
}

static inline ucc_rank_t ucc_dbt_convert_rank_for_shift(ucc_rank_t rank,
3 changes: 2 additions & 1 deletion src/components/tl/ucp/Makefile.am
Original file line number Diff line number Diff line change
@@ -73,7 +73,8 @@ gatherv = \
reduce = \
reduce/reduce.h \
reduce/reduce.c \
reduce/reduce_knomial.c
reduce/reduce_knomial.c \
reduce/reduce_dbt.c

reduce_scatter = \
reduce_scatter/reduce_scatter.h \
18 changes: 18 additions & 0 deletions src/components/tl/ucp/reduce/reduce.c
Original file line number Diff line number Diff line change
@@ -13,6 +13,11 @@ ucc_base_coll_alg_info_t
.name = "knomial",
.desc = "reduce over knomial tree with arbitrary radix "
"(optimized for latency)"},
[UCC_TL_UCP_REDUCE_ALG_DBT] =
{.id = UCC_TL_UCP_REDUCE_ALG_DBT,
.name = "dbt",
.desc = "bcast over double binary tree where a leaf in one tree "
"will be intermediate in other (optimized for BW)"},
[UCC_TL_UCP_REDUCE_ALG_LAST] = {
.id = 0, .name = NULL, .desc = NULL}};

@@ -66,3 +71,16 @@ ucc_status_t ucc_tl_ucp_reduce_init(ucc_tl_ucp_task_t *task)

return status;
}

/* Algorithm-table entry point for the knomial reduce: creates a TL/UCP task
 * for the given collective arguments, runs the common reduce initialization
 * on it and hands the task back through task_h. The status of the reduce
 * initialization is returned as is. */
ucc_status_t ucc_tl_ucp_reduce_knomial_init(ucc_base_coll_args_t *coll_args,
                                            ucc_base_team_t      *team,
                                            ucc_coll_task_t     **task_h)
{
    ucc_tl_ucp_task_t *task   = ucc_tl_ucp_init_task(coll_args, team);
    ucc_status_t       status = ucc_tl_ucp_reduce_init(task);

    /* task_h is filled in regardless of the initialization status */
    *task_h = &task->super;
    return status;
}
24 changes: 24 additions & 0 deletions src/components/tl/ucp/reduce/reduce.h
Original file line number Diff line number Diff line change
@@ -9,12 +9,16 @@

enum {
UCC_TL_UCP_REDUCE_ALG_KNOMIAL,
UCC_TL_UCP_REDUCE_ALG_DBT,
UCC_TL_UCP_REDUCE_ALG_LAST
};

extern ucc_base_coll_alg_info_t
ucc_tl_ucp_reduce_algs[UCC_TL_UCP_REDUCE_ALG_LAST + 1];

#define UCC_TL_UCP_REDUCE_DEFAULT_ALG_SELECT_STR \
"reduce:0-inf:@0"

/* A set of convenience macros used to implement sw based progress
of the reduce algorithm that uses kn pattern */
enum {
@@ -36,12 +40,32 @@ enum {
}; \
} while (0)


/* Maps an algorithm name to its index in ucc_tl_ucp_reduce_algs using a
 * case-insensitive comparison. Returns UCC_TL_UCP_REDUCE_ALG_LAST when no
 * entry matches. */
static inline int ucc_tl_ucp_reduce_alg_from_str(const char *str)
{
    int alg = 0;

    while (alg < UCC_TL_UCP_REDUCE_ALG_LAST) {
        if (strcasecmp(str, ucc_tl_ucp_reduce_algs[alg].name) == 0) {
            return alg;
        }
        alg++;
    }
    return alg;
}

ucc_status_t ucc_tl_ucp_reduce_init(ucc_tl_ucp_task_t *task);

ucc_status_t ucc_tl_ucp_reduce_knomial_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *team,
ucc_coll_task_t **task_h);

ucc_status_t ucc_tl_ucp_reduce_knomial_start(ucc_coll_task_t *task);

void ucc_tl_ucp_reduce_knomial_progress(ucc_coll_task_t *task);

ucc_status_t ucc_tl_ucp_reduce_knomial_finalize(ucc_coll_task_t *task);

ucc_status_t ucc_tl_ucp_reduce_dbt_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *team,
ucc_coll_task_t **task_h);

#endif
346 changes: 346 additions & 0 deletions src/components/tl/ucp/reduce/reduce_dbt.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,346 @@
/**
* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/

#include "config.h"
#include "tl_ucp.h"
#include "reduce.h"
#include "core/ucc_progress_queue.h"
#include "tl_ucp_sendrecv.h"
#include "utils/ucc_dt_reduce.h"

/* Resume points of the reduce DBT progress function; the current value is
 * kept in task->reduce_dbt.state between progress calls. */
enum {
/* initial state set by the start function: receives from children are posted */
RECV,
/* local reductions and sends to the tree parents */
REDUCE,
/* waiting for the sends to the tree parents to complete */
TEST,
/* waiting for the final tree-root -> collective-root transfers */
TEST_ROOT,
};

/* Expands to a switch case that jumps to the goto label named like the
 * state constant _p. */
#define UCC_REDUCE_DBT_CHECK_STATE(_p) \
case _p: \
goto _p;

/* Resumes the progress function at the label matching _state. There is no
 * case for RECV (nor a default), so for that state the switch is a no-op and
 * execution continues with the code that follows the macro invocation. */
#define UCC_REDUCE_DBT_GOTO_STATE(_state) \
do { \
switch (_state) { \
UCC_REDUCE_DBT_CHECK_STATE(REDUCE); \
UCC_REDUCE_DBT_CHECK_STATE(TEST); \
UCC_REDUCE_DBT_CHECK_STATE(TEST_ROOT); \
}; \
} while (0)

/* Shared tail of the receive completion callbacks. user_data carries the
 * owning task. A failed receive is logged and its status is stored in the
 * task; in every case the completed-receive counter is advanced and the
 * request, when one was handed in, is released. */
static void recv_completion_common(void *request, ucs_status_t status,
                                   const ucp_tag_recv_info_t *info, /* NOLINT */
                                   void *user_data)
{
    ucc_tl_ucp_task_t *task = user_data;

    if (ucc_unlikely(status != UCS_OK)) {
        tl_error(UCC_TASK_LIB(task), "failure in recv completion %s",
                 ucs_status_string(status));
        task->super.status = ucs_status_to_ucc_status(status);
    }

    task->tagged.recv_completed += 1;

    if (request != NULL) {
        ucp_request_free(request);
    }
}

/* Receive completion callback for tree 1: bumps the tree's own receive
 * counter, then runs the common completion handling. */
static void reduce_recv_completion_1(void *request, ucs_status_t status,
                                     const ucp_tag_recv_info_t *info, /* NOLINT */
                                     void *user_data)
{
    ucc_tl_ucp_task_t *dbt_task = user_data;

    dbt_task->reduce_dbt.t1.recv += 1;
    recv_completion_common(request, status, info, user_data);
}

/* Receive completion callback for tree 2: bumps the tree's own receive
 * counter, then runs the common completion handling. */
static void reduce_recv_completion_2(void *request, ucs_status_t status,
                                     const ucp_tag_recv_info_t *info, /* NOLINT */
                                     void *user_data)
{
    ucc_tl_ucp_task_t *dbt_task = user_data;

    dbt_task->reduce_dbt.t2.recv += 1;
    recv_completion_common(request, status, info, user_data);
}

/* Runs the local reduction for one tree and waits for it to finish.
 *
 * sbuf is reduced together with n_children vectors stored in rbuf, spaced
 * data_size apart, and the result is written to the start of rbuf. When
 * is_avg is non-zero the reduction is flagged to be scaled by
 * AVG_ALPHA(task).
 *
 * Nothing is returned: if launching the reduction fails, the error is logged
 * and stored in task->super.status. */
static inline void single_tree_reduce(ucc_tl_ucp_task_t *task, void *sbuf,
                                      void *rbuf, int n_children, size_t count,
                                      size_t data_size, ucc_datatype_t dt,
                                      ucc_coll_args_t *args, int is_avg)
{
    ucc_status_t status;

    status = ucc_dt_reduce_strided(sbuf, rbuf, rbuf, n_children, count,
                                   data_size, dt, args,
                                   is_avg ? UCC_EEE_TASK_FLAG_REDUCE_WITH_ALPHA
                                          : 0,
                                   AVG_ALPHA(task), task->reduce_dbt.executor,
                                   &task->reduce_dbt.etask);
    if (ucc_unlikely(status != UCC_OK)) {
        tl_error(UCC_TASK_LIB(task),
                 "failed to perform dt reduction");
        task->super.status = status;
        return;
    }

    EXEC_TASK_WAIT(task->reduce_dbt.etask);
}

/**
 * Progress function of the double binary tree (DBT) reduce.
 *
 * The message is split in two parts: the first count - count / 2 elements
 * travel up tree t1, the remaining count / 2 elements travel up tree t2.
 * The function is a resumable state machine (task->reduce_dbt.state):
 *
 *   RECV      - post one receive per existing child of each tree into the
 *               scratch buffer;
 *   REDUCE    - once all children of a tree have arrived, reduce them with
 *               the local contribution and send the result to the parent;
 *   TEST      - wait for those sends, then move the results from the tree
 *               roots to the collective root;
 *   TEST_ROOT - wait for the final transfers and, when the collective root
 *               is itself a tree root, copy the result from scratch to dst.
 *
 * Scratch layout (bytes): [t1 child 0][t1 child 1][t2 child 0][t2 child 1],
 * followed at offset 2 * count * dt_size by the pre-scaled source when the
 * AVG pre-op is enabled.
 *
 * t1 and t2 are by-value snapshots taken on entry, so receive counters
 * advanced by callbacks during this call are observed on the next call.
 *
 * Errors are reported through task->super.status.
 */
void ucc_tl_ucp_reduce_dbt_progress(ucc_coll_task_t *coll_task)
{
    ucc_tl_ucp_task_t          *task      = ucc_derived_of(coll_task,
                                                           ucc_tl_ucp_task_t);
    ucc_tl_ucp_team_t          *team      = TASK_TEAM(task);
    ucc_coll_args_t            *args      = &TASK_ARGS(task);
    ucc_rank_t                  rank      = UCC_TL_TEAM_RANK(team);
    ucc_rank_t                  coll_root = (ucc_rank_t)args->root;
    int                         is_root   = (rank == coll_root);
    ucc_dbt_single_tree_t       t1        = task->reduce_dbt.t1;
    ucc_dbt_single_tree_t       t2        = task->reduce_dbt.t2;
    /* Same selection as the start and init functions: the root describes
     * the message through dst (src is not required to be filled in for an
     * in-place root), every other rank through src. */
    ucc_memory_type_t           mtype     = is_root ? args->dst.info.mem_type
                                                    : args->src.info.mem_type;
    ucc_datatype_t              dt        = is_root ? args->dst.info.datatype
                                                    : args->src.info.datatype;
    size_t                      count     = is_root ? args->dst.info.count
                                                    : args->src.info.count;
    size_t                      dt_size   = ucc_dt_size(dt);
    /* t1 takes the larger half when count is odd */
    size_t                      count_t1  = count - count / 2;
    size_t                      count_t2  = count / 2;
    size_t                      data_size_t1 = count_t1 * dt_size;
    /* Whole elements only: count * dt_size / 2 would be half an element
     * too large for odd counts. */
    size_t                      data_size_t2 = count_t2 * dt_size;
    ucp_tag_recv_nbx_callback_t cb[2]     = {reduce_recv_completion_1,
                                             reduce_recv_completion_2};
    int                         avg_pre_op =
        UCC_TL_UCP_TEAM_LIB(TASK_TEAM(task))->cfg.reduce_avg_pre_op;
    int                         is_avg    = ((args->op == UCC_OP_AVG) &&
                                             !avg_pre_op);
    void                       *t1_sbuf   =
        (avg_pre_op && args->op == UCC_OP_AVG)
            ? PTR_OFFSET(task->reduce_dbt.scratch, count * dt_size * 2)
            : args->src.info.buffer;
    void                       *t1_rbuf   = task->reduce_dbt.scratch;
    void                       *t2_sbuf   = PTR_OFFSET(t1_sbuf, data_size_t1);
    void                       *t2_rbuf   = PTR_OFFSET(t1_rbuf,
                                                       data_size_t1 * 2);
    uint32_t                    i, j;

    UCC_REDUCE_DBT_GOTO_STATE(task->reduce_dbt.state);

    /* RECV state: children are packed back to back in the scratch area */
    j = 0;
    for (i = 0; i < 2; i++) {
        if (t1.children[i] != -1) {
            UCPCHECK_GOTO(
                ucc_tl_ucp_recv_cb(PTR_OFFSET(t1_rbuf, data_size_t1 * j),
                                   data_size_t1, mtype, t1.children[i], team,
                                   task, cb[0], (void *)task),
                task, out);
            j++;
        }
    }

    j = 0;
    for (i = 0; i < 2; i++) {
        if (t2.children[i] != -1) {
            UCPCHECK_GOTO(
                ucc_tl_ucp_recv_cb(PTR_OFFSET(t2_rbuf, data_size_t2 * j),
                                   data_size_t2, mtype, t2.children[i], team,
                                   task, cb[1], (void *)task),
                task, out);
            j++;
        }
    }
    task->reduce_dbt.state = REDUCE;

REDUCE:
    if (t1.recv == t1.n_children && !task->reduce_dbt.t1_reduction_comp) {
        if (t1.n_children > 0) {
            single_tree_reduce(task, t1_sbuf, t1_rbuf, t1.n_children,
                               count_t1, data_size_t1, dt, args,
                               is_avg && t1.root == rank);
        }
        task->reduce_dbt.t1_reduction_comp = 1;
    }
    if (t2.recv == t2.n_children && !task->reduce_dbt.t2_reduction_comp) {
        if (t2.n_children > 0) {
            single_tree_reduce(task, t2_sbuf, t2_rbuf, t2.n_children,
                               count_t2, data_size_t2, dt, args,
                               is_avg && t2.root == rank);
        }
        task->reduce_dbt.t2_reduction_comp = 1;
    }

    /* single_tree_reduce and the receive callbacks report failures only
     * through task->super.status (error codes are negative); stop here
     * instead of forwarding invalid data. */
    if (ucc_unlikely(task->super.status < 0)) {
        goto out;
    }

    /* A leaf forwards its own contribution, an inner node the reduced data */
    if (rank != t1.root && task->reduce_dbt.t1_reduction_comp &&
        !task->reduce_dbt.t1_send_comp) {
        UCPCHECK_GOTO(
            ucc_tl_ucp_send_nb((t1.n_children > 0) ? t1_rbuf : t1_sbuf,
                               data_size_t1, mtype, t1.parent, team, task),
            task, out);
        task->reduce_dbt.t1_send_comp = 1;
    }
    if (rank != t2.root && task->reduce_dbt.t2_reduction_comp &&
        !task->reduce_dbt.t2_send_comp) {
        UCPCHECK_GOTO(
            ucc_tl_ucp_send_nb((t2.n_children > 0) ? t2_rbuf : t2_sbuf,
                               data_size_t2, mtype, t2.parent, team, task),
            task, out);
        task->reduce_dbt.t2_send_comp = 1;
    }

    if (!task->reduce_dbt.t1_reduction_comp ||
        !task->reduce_dbt.t2_reduction_comp) {
        /* state stays REDUCE: retry on the next progress call */
        return;
    }

TEST:
    if (UCC_INPROGRESS == ucc_tl_ucp_test_send(task)) {
        task->reduce_dbt.state = TEST;
        return;
    }

    /* Tree roots that are not the collective root ship their result to it */
    if (rank == t1.root && rank != coll_root) {
        UCPCHECK_GOTO(ucc_tl_ucp_send_nb(t1_rbuf, data_size_t1, mtype,
                                         coll_root, team, task),
                      task, out);
    }
    if (rank == t2.root && rank != coll_root) {
        UCPCHECK_GOTO(ucc_tl_ucp_send_nb(t2_rbuf, data_size_t2, mtype,
                                         coll_root, team, task),
                      task, out);
    }

    /* From here on the *_reduction_comp fields hold the number of receive
     * completions expected per tree before the collective may finish. */
    task->reduce_dbt.t1_reduction_comp = t1.recv;
    task->reduce_dbt.t2_reduction_comp = t2.recv;

    if (rank == coll_root && rank != t1.root) {
        UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(args->dst.info.buffer, data_size_t1,
                                         mtype, t1.root, team, task, cb[0],
                                         (void *)task),
                      task, out);
        task->reduce_dbt.t1_reduction_comp++;
    }
    if (rank == coll_root && rank != t2.root) {
        UCPCHECK_GOTO(
            ucc_tl_ucp_recv_cb(PTR_OFFSET(args->dst.info.buffer, data_size_t1),
                               data_size_t2, mtype, t2.root, team, task, cb[1],
                               (void *)task),
            task, out);
        task->reduce_dbt.t2_reduction_comp++;
    }

TEST_ROOT:
    if (UCC_INPROGRESS == ucc_tl_ucp_test_send(task) ||
        task->reduce_dbt.t1_reduction_comp != t1.recv ||
        task->reduce_dbt.t2_reduction_comp != t2.recv) {
        task->reduce_dbt.state = TEST_ROOT;
        return;
    }

    /* Do not overwrite an error recorded by a receive callback with UCC_OK */
    if (ucc_unlikely(task->super.status < 0)) {
        goto out;
    }

    if (rank == coll_root && rank == t1.root) {
        UCPCHECK_GOTO(ucc_mc_memcpy(args->dst.info.buffer, t1_rbuf,
                                    data_size_t1, mtype, mtype),
                      task, out);
    }
    if (rank == coll_root && rank == t2.root) {
        UCPCHECK_GOTO(
            ucc_mc_memcpy(PTR_OFFSET(args->dst.info.buffer, data_size_t1),
                          t2_rbuf, data_size_t2, mtype, mtype),
            task, out);
    }
    task->super.status = UCC_OK;
    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_reduce_dbt_done", 0);

out:
    return;
}

/* Post function of the DBT reduce: resets the per-run counters, acquires the
 * executor, redirects the source buffer to the destination buffer for an
 * in-place root, optionally pre-scales the local contribution for AVG, and
 * finally enqueues the task on the progress queue in the RECV state.
 *
 * Returns the status of the first failing step, otherwise the status of the
 * enqueue. */
ucc_status_t ucc_tl_ucp_reduce_dbt_start(ucc_coll_task_t *coll_task)
{
    ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
    ucc_tl_ucp_team_t *team = TASK_TEAM(task);
    ucc_coll_args_t   *args = &TASK_ARGS(task);
    ucc_rank_t         rank = UCC_TL_TEAM_RANK(team);
    int                avg_pre_op =
        UCC_TL_UCP_TEAM_LIB(TASK_TEAM(task))->cfg.reduce_avg_pre_op;
    ucc_status_t       status;
    ucc_datatype_t     dt;
    size_t             count;
    size_t             data_size;

    /* fresh bookkeeping for this run of the collective */
    task->reduce_dbt.t1.recv           = 0;
    task->reduce_dbt.t2.recv           = 0;
    task->reduce_dbt.t1_reduction_comp = 0;
    task->reduce_dbt.t2_reduction_comp = 0;
    task->reduce_dbt.t1_send_comp      = 0;
    task->reduce_dbt.t2_send_comp      = 0;
    ucc_tl_ucp_task_reset(task, UCC_INPROGRESS);

    /* the root describes the message through dst, everyone else through src */
    if (args->root == rank) {
        count = args->dst.info.count;
        dt    = args->dst.info.datatype;
    } else {
        count = args->src.info.count;
        dt    = args->src.info.datatype;
    }
    data_size = count * ucc_dt_size(dt);

    status = ucc_coll_task_get_executor(&task->super,
                                        &task->reduce_dbt.executor);
    if (ucc_unlikely(status != UCC_OK)) {
        return status;
    }

    if (UCC_IS_INPLACE(*args) && (rank == args->root)) {
        args->src.info.buffer = args->dst.info.buffer;
    }

    if (avg_pre_op && args->op == UCC_OP_AVG) {
        /* With the AVG pre-op every process scales its own contribution by
         * 1 / team_size up front. The source is passed as both operands, so
         * the factor used is 1 / (2 * team_size). The result is placed in
         * the scratch buffer at offset 2 * data_size. */
        status = ucc_dt_reduce(
            args->src.info.buffer, args->src.info.buffer,
            PTR_OFFSET(task->reduce_dbt.scratch, data_size * 2), count, dt,
            args, UCC_EEE_TASK_FLAG_REDUCE_WITH_ALPHA,
            1.0 / (double)(UCC_TL_TEAM_SIZE(TASK_TEAM(task)) * 2),
            task->reduce_dbt.executor, &task->reduce_dbt.etask);
        if (ucc_unlikely(UCC_OK != status)) {
            tl_error(UCC_TASK_LIB(task),
                     "failed to perform dt reduction");
            return status;
        }
        EXEC_TASK_WAIT(task->reduce_dbt.etask, status);
    }

    task->reduce_dbt.state = RECV;
    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_reduce_dbt_start", 0);
    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
}

/* Finalize function of the DBT reduce: frees the scratch allocation if one
 * exists, then delegates to the generic TL/UCP task finalization and returns
 * its status. */
ucc_status_t ucc_tl_ucp_reduce_dbt_finalize(ucc_coll_task_t *coll_task)
{
    ucc_tl_ucp_task_t *dbt_task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);

    if (NULL != dbt_task->reduce_dbt.scratch_mc_header) {
        ucc_mc_free(dbt_task->reduce_dbt.scratch_mc_header);
    }

    return ucc_tl_ucp_coll_finalize(coll_task);
}

/**
 * Init function of the DBT reduce.
 *
 * Creates the task, installs the DBT post/progress/finalize callbacks,
 * builds both trees for the calling rank and allocates a scratch buffer of
 * three times the message size in the memory type of the message (taken from
 * dst on the root, from src elsewhere).
 *
 * @param coll_args collective arguments
 * @param team      team the collective runs on
 * @param task_h    receives the new task on success; left untouched on error
 *
 * @return UCC_OK on success, otherwise the status of the failed scratch
 *         allocation. On failure the task is released here, because the
 *         caller never receives a handle it could finalize.
 */
ucc_status_t ucc_tl_ucp_reduce_dbt_init(ucc_base_coll_args_t *coll_args,
                                        ucc_base_team_t      *team,
                                        ucc_coll_task_t     **task_h)
{
    ucc_tl_ucp_team_t *tl_team;
    ucc_tl_ucp_task_t *task;
    ucc_rank_t         rank, size;
    ucc_memory_type_t  mtype;
    ucc_datatype_t     dt;
    size_t             count;
    size_t             data_size;
    ucc_status_t       status;

    task                 = ucc_tl_ucp_init_task(coll_args, team);
    task->super.flags   |= UCC_COLL_TASK_FLAG_EXECUTOR;
    task->super.post     = ucc_tl_ucp_reduce_dbt_start;
    task->super.progress = ucc_tl_ucp_reduce_dbt_progress;
    task->super.finalize = ucc_tl_ucp_reduce_dbt_finalize;
    tl_team              = TASK_TEAM(task);
    rank                 = UCC_TL_TEAM_RANK(tl_team);
    size                 = UCC_TL_TEAM_SIZE(tl_team);
    ucc_dbt_build_trees(rank, size, &task->reduce_dbt.t1,
                        &task->reduce_dbt.t2);

    if (coll_args->args.root == rank) {
        count = coll_args->args.dst.info.count;
        dt    = coll_args->args.dst.info.datatype;
        mtype = coll_args->args.dst.info.mem_type;
    } else {
        count = coll_args->args.src.info.count;
        dt    = coll_args->args.src.info.datatype;
        mtype = coll_args->args.src.info.mem_type;
    }
    data_size = count * ucc_dt_size(dt);

    /* NULL marks "no scratch allocated" for the finalize function */
    task->reduce_dbt.scratch_mc_header = NULL;
    status = ucc_mc_alloc(&task->reduce_dbt.scratch_mc_header, 3 * data_size,
                          mtype);
    if (ucc_unlikely(status != UCC_OK)) {
        tl_error(UCC_TASK_LIB(task),
                 "failed to allocate scratch buffer for reduce dbt");
        /* Release the task: finalize skips the NULL scratch header and
         * hands the task back through the generic finalization. */
        ucc_tl_ucp_reduce_dbt_finalize(&task->super);
        return status;
    }
    task->reduce_dbt.scratch = task->reduce_dbt.scratch_mc_header->addr;
    *task_h                  = &task->super;
    return UCC_OK;
}
19 changes: 19 additions & 0 deletions src/components/tl/ucp/tl_ucp_coll.c
Original file line number Diff line number Diff line change
@@ -42,6 +42,10 @@ const ucc_tl_ucp_default_alg_desc_t
.select_str = UCC_TL_UCP_BCAST_DEFAULT_ALG_SELECT_STR,
.str_get_fn = NULL
},
{
.select_str = UCC_TL_UCP_REDUCE_DEFAULT_ALG_SELECT_STR,
.str_get_fn = NULL
},
{
.select_str = UCC_TL_UCP_REDUCE_SCATTER_DEFAULT_ALG_SELECT_STR,
.str_get_fn = NULL
@@ -223,6 +227,8 @@ static inline int alg_id_from_str(ucc_coll_type_t coll_type, const char *str)
return ucc_tl_ucp_alltoallv_alg_from_str(str);
case UCC_COLL_TYPE_BCAST:
return ucc_tl_ucp_bcast_alg_from_str(str);
case UCC_COLL_TYPE_REDUCE:
return ucc_tl_ucp_reduce_alg_from_str(str);
case UCC_COLL_TYPE_REDUCE_SCATTER:
return ucc_tl_ucp_reduce_scatter_alg_from_str(str);
case UCC_COLL_TYPE_REDUCE_SCATTERV:
@@ -318,6 +324,19 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str,
break;
};
break;
case UCC_COLL_TYPE_REDUCE:
switch (alg_id) {
case UCC_TL_UCP_REDUCE_ALG_KNOMIAL:
*init = ucc_tl_ucp_reduce_knomial_init;
break;
case UCC_TL_UCP_REDUCE_ALG_DBT:
*init = ucc_tl_ucp_reduce_dbt_init;
break;
default:
status = UCC_ERR_INVALID_PARAM;
break;
};
break;
case UCC_COLL_TYPE_REDUCE_SCATTER:
switch (alg_id) {
case UCC_TL_UCP_REDUCE_SCATTER_ALG_RING:
15 changes: 14 additions & 1 deletion src/components/tl/ucp/tl_ucp_coll.h
Original file line number Diff line number Diff line change
@@ -17,7 +17,7 @@
#include "tl_ucp_tag.h"

#define UCC_UUNITS_AUTO_RADIX 4
#define UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR 7
#define UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR 8

ucc_status_t ucc_tl_ucp_team_default_score_str_alloc(ucc_tl_ucp_team_t *team,
char *default_select_str[UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR]);
@@ -200,6 +200,19 @@ typedef struct ucc_tl_ucp_task {
ucc_ee_executor_task_t *etask;
ucc_ee_executor_t *executor;
} reduce_kn;
/* Per-task state of the double binary tree (DBT) reduce. */
struct {
/* the two trees, built for the calling rank by the DBT reduce init */
ucc_dbt_single_tree_t t1;
ucc_dbt_single_tree_t t2;
/* resume point of the DBT reduce progress function */
int state;
/* set to 1 once the local reduction of the tree is done; the DBT progress
 * function later reuses these fields as the number of receive completions
 * it expects per tree */
int t1_reduction_comp;
int t2_reduction_comp;
/* set to 1 once the send to the tree parent has been posted */
int t1_send_comp;
int t2_send_comp;
/* address of the scratch allocation described by scratch_mc_header */
void *scratch;
/* scratch allocation handle; NULL when nothing is allocated */
ucc_mc_buffer_header_t *scratch_mc_header;
/* executor task of the reduction in flight and the executor running it */
ucc_ee_executor_task_t *etask;
ucc_ee_executor_t *executor;
} reduce_dbt;
struct {
ucc_rank_t dist;
ucc_rank_t max_dist;

0 comments on commit 470ee92

Please sign in to comment.