From d891b011eac5e8a1ff5decd73fde9aabb0be6e1a Mon Sep 17 00:00:00 2001 From: Ching-Hsiang Chu Date: Tue, 14 Jun 2022 04:37:18 -0700 Subject: [PATCH] TL/SELF: support team size 1 (#511) * TL/SELF: support team size 1 * TL/SELF: cleanup * TL/SELF: fix lib name and formatting * CODESTYLE: add TL/SELF * TL/SELF: fix configure file and TL/CUDA creation * TL/SELF: fix compilation errors * CORE: skip service team for size 1 * TL/UCP: only skip team size 1 * REVIEW: add Meta copyright * TL/SELF: fix team creation * TEST: enable gtest for team size 1 * TL/CUDA: skip team size 1 * TL/SELF: address review comments * CODESTYLE: code clean-up and formatting * TL/SELF: use executor * CODESTYLE: code formatting * DOCS: update author email * TL/SELF: address comments * TL/SELF: fix compilation error * TL/SELF: address comments * CODESTYLE: address comments * TEST: add more team size 1 tests * CODESTYLE: address comments * TEST: fix team size 1 gather/bcast gtest --- .github/workflows/codestyle.yaml | 2 +- AUTHORS | 2 +- src/components/tl/cuda/tl_cuda_team.c | 4 + src/components/tl/self/Makefile.am | 25 +++ src/components/tl/self/configure.m4 | 21 ++ src/components/tl/self/tl_self.c | 56 +++++ src/components/tl/self/tl_self.h | 98 +++++++++ src/components/tl/self/tl_self_coll.c | 257 +++++++++++++++++++++++ src/components/tl/self/tl_self_context.c | 48 +++++ src/components/tl/self/tl_self_lib.c | 39 ++++ src/components/tl/self/tl_self_team.c | 92 ++++++++ src/components/tl/ucp/tl_ucp_team.c | 8 + src/core/ucc_context.c | 6 +- src/core/ucc_team.c | 16 +- test/gtest/coll/test_bcast.cc | 5 + test/gtest/coll/test_gather.cc | 5 + test/gtest/common/test_ucc.h | 4 +- test/gtest/core/test_context.cc | 3 + test/gtest/core/test_team.cc | 3 +- 19 files changed, 680 insertions(+), 14 deletions(-) create mode 100644 src/components/tl/self/Makefile.am create mode 100644 src/components/tl/self/configure.m4 create mode 100644 src/components/tl/self/tl_self.c create mode 100644 
src/components/tl/self/tl_self.h create mode 100644 src/components/tl/self/tl_self_coll.c create mode 100644 src/components/tl/self/tl_self_context.c create mode 100644 src/components/tl/self/tl_self_lib.c create mode 100644 src/components/tl/self/tl_self_team.c diff --git a/.github/workflows/codestyle.yaml b/.github/workflows/codestyle.yaml index 46e5be7375..e2011ed933 100644 --- a/.github/workflows/codestyle.yaml +++ b/.github/workflows/codestyle.yaml @@ -37,7 +37,7 @@ jobs: fi fi H1="CODESTYLE|REVIEW|CORE|UTIL|TEST|API|DOCS|TOOLS|BUILD|MC|EC|SCHEDULE|TOPO" - H2="CL/|TL/|MC/|EC/|UCP|NCCL|SHARP|BASIC|HIER|CUDA|CPU|EE|RCCL|ROCM" + H2="CL/|TL/|MC/|EC/|UCP|NCCL|SHARP|BASIC|HIER|CUDA|CPU|EE|RCCL|ROCM|SELF" if ! echo $msg | grep -qP '^Merge |^'"(($H1)|($H2))"'+: \w' then echo "Wrong header" diff --git a/AUTHORS b/AUTHORS index 86eed113fe..96405d3401 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,7 +1,7 @@ Alex Margolin alex.margolin@huawei.com Anatoly Vildemanov anatolyv@nvidia.com Boris Karasev boriska@nvidia.com -Ching-Hsiang Chu king770120@gmail.com +Ching-Hsiang Chu chchu@fb.com Devendar Bureddy devendar@nvidia.com Ferrol Aderholdt faderholdt@nvidia.com Geoffroy Vallee geoffroy@nvidia.com diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c index d47fe4acf4..43ae9397f7 100644 --- a/src/components/tl/cuda/tl_cuda_team.c +++ b/src/components/tl/cuda/tl_cuda_team.c @@ -32,6 +32,10 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, self->stream = NULL; self->topo = NULL; self->scratch.loc = NULL; + if (UCC_TL_TEAM_SIZE(self) < 2) { + tl_trace(tl_context->lib, "team size is too small, min supported 2"); + return UCC_ERR_NOT_SUPPORTED; + } if (UCC_TL_TEAM_SIZE(self) > UCC_TL_CUDA_MAX_PEERS) { tl_info(tl_context->lib, "team size is too large, max supported %d", UCC_TL_CUDA_MAX_PEERS); diff --git a/src/components/tl/self/Makefile.am b/src/components/tl/self/Makefile.am new file mode 100644 index 0000000000..78971e6303 
--- /dev/null +++ b/src/components/tl/self/Makefile.am @@ -0,0 +1,25 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2021-2022. ALL RIGHTS RESERVED. +# Copyright (c) Meta Platforms, Inc. and affiliates. 2022. +# + +if TL_SELF_ENABLED +sources = \ + tl_self.h \ + tl_self.c \ + tl_self_coll.c \ + tl_self_context.c \ + tl_self_lib.c \ + tl_self_team.c + + +module_LTLIBRARIES = libucc_tl_self.la +libucc_tl_self_la_SOURCES = $(sources) +libucc_tl_self_la_CPPFLAGS = $(AM_CPPFLAGS) $(BASE_CPPFLAGS) +libucc_tl_self_la_CFLAGS = $(BASE_CFLAGS) +libucc_tl_self_la_LDFLAGS = -version-info $(SOVERSION) --as-needed +libucc_tl_self_la_LIBADD = $(UCC_TOP_BUILDDIR)/src/libucc.la + +include $(top_srcdir)/config/module.am + +endif diff --git a/src/components/tl/self/configure.m4 b/src/components/tl/self/configure.m4 new file mode 100644 index 0000000000..1fe129e715 --- /dev/null +++ b/src/components/tl/self/configure.m4 @@ -0,0 +1,21 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2022. ALL RIGHTS RESERVED. +# Copyright (c) Meta Platforms, Inc. and affiliates. 2022. +# + +tl_self_enabled=n +CHECK_TLS_REQUIRED(["self"]) +AS_IF([test "$CHECKED_TL_REQUIRED" = "y"], +[ + tl_modules="${tl_modules}:self" + tl_self_enabled=y + CHECK_NEED_TL_PROFILING(["tl_self"]) + AS_IF([test "$TL_PROFILING_REQUIRED" = "y"], + [ + AC_DEFINE([HAVE_PROFILING_TL_SELF], [1], [Enable profiling for TL SELF]) + prof_modules="${prof_modules}:tl_self" + ], []) +], []) + +AM_CONDITIONAL([TL_SELF_ENABLED], [test "$tl_self_enabled" = "y"]) +AC_CONFIG_FILES([src/components/tl/self/Makefile]) diff --git a/src/components/tl/self/tl_self.c b/src/components/tl/self/tl_self.c new file mode 100644 index 0000000000..ea78862085 --- /dev/null +++ b/src/components/tl/self/tl_self.c @@ -0,0 +1,56 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2021-2022. ALL RIGHTS RESERVED. + * Copyright (c) Meta Platforms, Inc. and affiliates. 2022. + * + * See file LICENSE for terms. 
+ */ + +#include "tl_self.h" +#include "utils/ucc_malloc.h" +#include "components/mc/ucc_mc.h" +#include "components/mc/base/ucc_mc_base.h" + +ucc_status_t ucc_tl_self_get_lib_attr(const ucc_base_lib_t *lib, + ucc_base_lib_attr_t *base_attr); +ucc_status_t ucc_tl_self_get_context_attr(const ucc_base_context_t *context, + ucc_base_ctx_attr_t *base_attr); + +static ucc_config_field_t ucc_tl_self_lib_config_table[] = { + {"", "", NULL, ucc_offsetof(ucc_tl_self_lib_config_t, super), + UCC_CONFIG_TYPE_TABLE(ucc_tl_lib_config_table)}, + + {NULL}}; + +static ucs_config_field_t ucc_tl_self_context_config_table[] = { + {"", "", NULL, ucc_offsetof(ucc_tl_self_context_config_t, super), + UCC_CONFIG_TYPE_TABLE(ucc_tl_context_config_table)}, + + {NULL}}; + +UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_self_lib_t, ucc_base_lib_t, + const ucc_base_lib_params_t *, + const ucc_base_config_t *); + +UCC_CLASS_DEFINE_DELETE_FUNC(ucc_tl_self_lib_t, ucc_base_lib_t); + +UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_self_context_t, ucc_base_context_t, + const ucc_base_context_params_t *, + const ucc_base_config_t *); + +UCC_CLASS_DEFINE_DELETE_FUNC(ucc_tl_self_context_t, ucc_base_context_t); + +UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_self_team_t, ucc_base_team_t, + ucc_base_context_t *, const ucc_base_team_params_t *); + +ucc_status_t ucc_tl_self_team_create_test(ucc_base_team_t *tl_team); + +ucc_status_t ucc_tl_self_team_destroy(ucc_base_team_t *tl_team); + +ucc_status_t ucc_tl_self_coll_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task); + +ucc_status_t ucc_tl_self_team_get_scores(ucc_base_team_t *tl_team, + ucc_coll_score_t **score); + +UCC_TL_IFACE_DECLARE(self, SELF); diff --git a/src/components/tl/self/tl_self.h b/src/components/tl/self/tl_self.h new file mode 100644 index 0000000000..c35faa5f83 --- /dev/null +++ b/src/components/tl/self/tl_self.h @@ -0,0 +1,98 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2021-2022. ALL RIGHTS RESERVED. 
+ * Copyright (c) Meta Platforms, Inc. and affiliates. 2022. + * + * See file LICENSE for terms. + */ + +#ifndef UCC_TL_SELF_H_ +#define UCC_TL_SELF_H_ +#include +#include "components/tl/ucc_tl.h" +#include "components/tl/ucc_tl_log.h" +#include "core/ucc_ee.h" +#include "utils/ucc_mpool.h" + +#ifndef UCC_TL_SELF_DEFAULT_SCORE +#define UCC_TL_SELF_DEFAULT_SCORE 50 +#endif + +#ifdef HAVE_PROFILING_TL_SELF +#include "utils/profile/ucc_profile.h" +#else +#include "utils/profile/ucc_profile_off.h" +#endif + +#define UCC_TL_SELF_PROFILE_FUNC UCC_PROFILE_FUNC +#define UCC_TL_SELF_PROFILE_FUNC_VOID UCC_PROFILE_FUNC_VOID +#define UCC_TL_SELF_PROFILE_REQUEST_NEW UCC_PROFILE_REQUEST_NEW +#define UCC_TL_SELF_PROFILE_REQUEST_EVENT UCC_PROFILE_REQUEST_EVENT +#define UCC_TL_SELF_PROFILE_REQUEST_FREE UCC_PROFILE_REQUEST_FREE + +typedef struct ucc_tl_self_iface { + ucc_tl_iface_t super; +} ucc_tl_self_iface_t; +/* Extern iface should follow the pattern: ucc_tl_ */ +extern ucc_tl_self_iface_t ucc_tl_self; + +typedef struct ucc_tl_self_lib_config { + ucc_tl_lib_config_t super; +} ucc_tl_self_lib_config_t; + +typedef struct ucc_tl_self_context_config { + ucc_tl_context_config_t super; +} ucc_tl_self_context_config_t; + +typedef struct ucc_tl_self_lib { + ucc_tl_lib_t super; + ucc_tl_self_lib_config_t cfg; +} ucc_tl_self_lib_t; +UCC_CLASS_DECLARE(ucc_tl_self_lib_t, const ucc_base_lib_params_t *, + const ucc_base_config_t *); + +typedef struct ucc_tl_self_context { + ucc_tl_context_t super; + ucc_tl_self_context_config_t cfg; + ucc_mpool_t req_mp; +} ucc_tl_self_context_t; +UCC_CLASS_DECLARE(ucc_tl_self_context_t, const ucc_base_context_params_t *, + const ucc_base_config_t *); + +typedef struct ucc_tl_self_task { + ucc_coll_task_t super; + void *src; + void *dst; + size_t size; + ucc_memory_type_t src_memtype; + ucc_memory_type_t dst_memtype; + ucc_ee_executor_task_t *etask; +} ucc_tl_self_task_t; + +typedef struct ucc_tl_self_team { + ucc_tl_team_t super; + ucc_status_t status; +} 
ucc_tl_self_team_t;
+UCC_CLASS_DECLARE(ucc_tl_self_team_t, ucc_base_context_t *,
+                  const ucc_base_team_params_t *);
+
+#define UCC_TL_SELF_SUPPORTED_COLLS \
+    (UCC_COLL_TYPE_ALLTOALL | UCC_COLL_TYPE_ALLTOALLV | \
+     UCC_COLL_TYPE_ALLGATHER | UCC_COLL_TYPE_ALLGATHERV | \
+     UCC_COLL_TYPE_ALLREDUCE | UCC_COLL_TYPE_BCAST | UCC_COLL_TYPE_BARRIER | \
+     UCC_COLL_TYPE_REDUCE | UCC_COLL_TYPE_FANIN | UCC_COLL_TYPE_FANOUT | \
+     UCC_COLL_TYPE_GATHER | UCC_COLL_TYPE_GATHERV | UCC_COLL_TYPE_SCATTER | \
+     UCC_COLL_TYPE_SCATTERV | UCC_COLL_TYPE_REDUCE_SCATTER | \
+     UCC_COLL_TYPE_REDUCE_SCATTERV)
+
+#define UCC_TL_SELF_TEAM_LIB(_team) \
+    (ucc_derived_of((_team)->super.super.context->lib, ucc_tl_self_lib_t))
+
+#define UCC_TL_SELF_TEAM_CTX(_team) \
+    (ucc_derived_of((_team)->super.super.context, ucc_tl_self_context_t))
+
+ucc_status_t ucc_tl_self_coll_init(ucc_base_coll_args_t *coll_args,
+                                   ucc_base_team_t *team,
+                                   ucc_coll_task_t **task_h);
+ucc_status_t ucc_tl_self_coll_finalize(ucc_coll_task_t *coll_task);
+
+#endif
diff --git a/src/components/tl/self/tl_self_coll.c b/src/components/tl/self/tl_self_coll.c
new file mode 100644
index 0000000000..7b93dfcc5e
--- /dev/null
+++ b/src/components/tl/self/tl_self_coll.c
@@ -0,0 +1,287 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2021-2022. ALL RIGHTS RESERVED.
+ * Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "tl_self.h"
+#include "utils/ucc_coll_utils.h"
+#include "utils/ucc_malloc.h"
+
+/* Take a task from the context request pool and reset its copy state.
+ * Returns NULL when the pool has no task to give. */
+static inline ucc_tl_self_task_t *
+ucc_tl_self_coll_init_task(ucc_base_coll_args_t *coll_args,
+                           ucc_base_team_t *team)
+{
+    ucc_tl_self_team_t *tl_team = ucc_derived_of(team, ucc_tl_self_team_t);
+    ucc_tl_self_context_t *ctx = UCC_TL_SELF_TEAM_CTX(tl_team);
+    ucc_tl_self_task_t *task = ucc_mpool_get(&ctx->req_mp);
+
+    if (ucc_unlikely(!task)) {
+        return NULL;
+    }
+
+    ucc_coll_task_init(&task->super, coll_args, team);
+    UCC_TL_SELF_PROFILE_REQUEST_NEW(task, "tl_self_task", 0);
+    task->super.finalize = ucc_tl_self_coll_finalize;
+    task->super.triggered_post = ucc_triggered_post;
+    task->src = NULL;
+    task->dst = NULL;
+    task->size = 0;
+    task->etask = NULL;
+    return task;
+}
+
+/* Hand a task back to the request pool. */
+static inline void ucc_tl_self_put_task(ucc_tl_self_task_t *task)
+{
+    UCC_TL_SELF_PROFILE_REQUEST_FREE(task);
+    ucc_mpool_put(task);
+}
+
+/* Finalize callback installed on every task: recycles the task. */
+ucc_status_t ucc_tl_self_coll_finalize(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_self_task_t *task = ucc_derived_of(coll_task, ucc_tl_self_task_t);
+
+    tl_trace(UCC_TASK_LIB(task), "finalizing task %p", task);
+    ucc_tl_self_put_task(task);
+    return UCC_OK;
+}
+
+/* Progress callback for collectives with no work: completes at once. */
+void ucc_tl_self_noop_progress(ucc_coll_task_t *task)
+{
+    task->status = UCC_OK;
+}
+
+/* Progress callback for copy collectives: polls the executor task and, once
+ * it is no longer pending, finalizes it and publishes its final status. */
+void ucc_tl_self_copy_progress(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_self_task_t *task = ucc_derived_of(coll_task, ucc_tl_self_task_t);
+    ucc_status_t status;
+
+    if (task->etask != NULL) {
+        status = ucc_ee_executor_task_test(task->etask);
+        if (status == UCC_OPERATION_INITIALIZED || status == UCC_INPROGRESS) {
+            task->super.status = UCC_INPROGRESS;
+            return;
+        }
+        ucc_ee_executor_task_finalize(task->etask);
+        task->etask = NULL;
+        task->super.status = status;
+    }
+}
+
+/* Post callback for copy collectives: posts one executor copy task with
+ * bufs[0] = task->dst, bufs[1] = task->src and count = task->size, then
+ * places the collective on the progress queue. */
+ucc_status_t ucc_tl_self_copy_start(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_self_task_t *task = ucc_derived_of(coll_task, ucc_tl_self_task_t);
+    ucc_ee_executor_t *exec;
+    ucc_ee_executor_task_args_t exec_args;
+    ucc_status_t status;
+
+    status = ucc_coll_task_get_executor(&task->super, &exec);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
+
+    exec_args.task_type = UCC_EE_EXECUTOR_TASK_TYPE_COPY;
+    exec_args.bufs[0] = task->dst;
+    exec_args.bufs[1] = task->src;
+    exec_args.count = task->size;
+    /* keep the post result in status: that is what the check below reads */
+    status = ucc_ee_executor_task_post(exec, &exec_args, &task->etask);
+    task->super.status = status;
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
+
+    return ucc_progress_queue_enqueue(UCC_TASK_CORE_CTX(coll_task)->pq,
+                                      coll_task);
+}
+
+/* Post callback for no-op collectives: only queues the task. */
+ucc_status_t ucc_tl_self_coll_start(ucc_coll_task_t *task)
+{
+    return ucc_progress_queue_enqueue(UCC_TASK_CORE_CTX(task)->pq, task);
+}
+
+/* Set up a task that completes without moving any data. */
+ucc_status_t ucc_tl_self_coll_noop_init(ucc_tl_self_task_t *task)
+{
+    task->super.post = ucc_tl_self_coll_start;
+    task->super.progress = ucc_tl_self_noop_progress;
+    return UCC_OK;
+}
+
+/* Set up a collective described by src.info/dst.info: a no-op when in-place,
+ * otherwise one copy of the whole source buffer into the destination. */
+ucc_status_t ucc_tl_self_coll_copy_init(ucc_tl_self_task_t *task)
+{
+    ucc_coll_args_t *args = &(task->super.bargs.args);
+
+    if (UCC_IS_INPLACE(*args)) {
+        /* no copy is required for in-place */
+        task->super.post = ucc_tl_self_coll_start;
+        task->super.progress = ucc_tl_self_noop_progress;
+    } else {
+        task->dst = args->dst.info.buffer;
+        task->src = args->src.info.buffer;
+        task->size =
+            args->src.info.count * ucc_dt_size(args->src.info.datatype);
+        task->dst_memtype = args->dst.info.mem_type;
+        task->src_memtype = args->src.info.mem_type;
+        task->super.post = ucc_tl_self_copy_start;
+        task->super.progress = ucc_tl_self_copy_progress;
+        task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
+    }
+    return UCC_OK;
+}
+
+/* Set up alltoallv: a no-op when in-place, otherwise one copy of the block
+ * given by entry 0 of the source counts/displacements to the entry 0
+ * displacement of the destination.
+ * NOTE(review): displ is applied as a byte offset; if displacements are
+ * expressed in elements it must be scaled by the datatype size - confirm. */
+ucc_status_t ucc_tl_self_alltoallv_init(ucc_tl_self_task_t *task)
+{
+    ucc_coll_args_t *args = &(task->super.bargs.args);
+
+    if (UCC_IS_INPLACE(*args)) {
+        /* no copy is required for in-place */
+        task->super.post = ucc_tl_self_coll_start;
+        task->super.progress = ucc_tl_self_noop_progress;
+    } else {
+        size_t displ = (size_t)ucc_coll_args_get_displacement(
+            args, args->dst.info_v.displacements, 0);
+        task->dst = PTR_OFFSET(args->dst.info_v.buffer, displ);
+        displ = (size_t)ucc_coll_args_get_displacement(
+            args, args->src.info_v.displacements, 0);
+        task->src = PTR_OFFSET(args->src.info_v.buffer, displ);
+        task->size = ucc_coll_args_get_count(args, args->src.info_v.counts, 0) *
+                     ucc_dt_size(args->src.info_v.datatype);
+        task->dst_memtype = args->dst.info_v.mem_type;
+        task->src_memtype = args->src.info_v.mem_type;
+        task->super.post = ucc_tl_self_copy_start;
+        task->super.progress = ucc_tl_self_copy_progress;
+        task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
+    }
+    return UCC_OK;
+}
+
+/* Set up gatherv/allgatherv/reduce_scatterv: a no-op when in-place, otherwise
+ * one copy of the source buffer to the entry 0 displacement (0 when no
+ * displacements are given) of the destination vector. */
+ucc_status_t ucc_tl_self_coll_copyv_init(ucc_tl_self_task_t *task)
+{
+    ucc_coll_args_t *args = &(task->super.bargs.args);
+
+    if (UCC_IS_INPLACE(*args)) {
+        /* no copy is required for in-place */
+        task->super.post = ucc_tl_self_coll_start;
+        task->super.progress = ucc_tl_self_noop_progress;
+    } else {
+        size_t displ = 0;
+        /* reduce_scatterv may not provide displacements */
+        if (args->dst.info_v.displacements) {
+            displ = (size_t)ucc_coll_args_get_displacement(
+                args, args->dst.info_v.displacements, 0);
+        }
+        task->dst = PTR_OFFSET(args->dst.info_v.buffer, displ);
+        task->src = args->src.info.buffer;
+        task->size =
+            args->src.info.count * ucc_dt_size(args->src.info.datatype);
+        task->dst_memtype = args->dst.info_v.mem_type;
+        task->src_memtype = args->src.info.mem_type;
+        task->super.post = ucc_tl_self_copy_start;
+        task->super.progress = ucc_tl_self_copy_progress;
+        task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
+    }
+    return UCC_OK;
+}
+
+/* Set up scatterv: a no-op when in-place, otherwise one copy of dst.info.count
+ * elements from the entry 0 displacement of the source vector. */
+ucc_status_t ucc_tl_self_scatterv_init(ucc_tl_self_task_t *task)
+{
+    ucc_coll_args_t *args = &(task->super.bargs.args);
+
+    if (UCC_IS_INPLACE(*args)) {
+        /* no copy is required for in-place */
+        task->super.post = ucc_tl_self_coll_start;
+        task->super.progress = ucc_tl_self_noop_progress;
+    } else {
+        size_t displ = (size_t)ucc_coll_args_get_displacement(
+            args, args->src.info_v.displacements, 0);
+        task->src = PTR_OFFSET(args->src.info_v.buffer, displ);
+        task->dst = args->dst.info.buffer;
+        task->size =
+            args->dst.info.count * ucc_dt_size(args->dst.info.datatype);
+        task->dst_memtype = args->dst.info.mem_type;
+        task->src_memtype = args->src.info_v.mem_type;
+        task->super.post = ucc_tl_self_copy_start;
+        task->super.progress = ucc_tl_self_copy_progress;
+        task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
+    }
+    return UCC_OK;
+}
+
+/* Collective init entry point: allocates a task and selects the no-op or
+ * copy setup matching the collective type; the task is returned in *task_h. */
+ucc_status_t ucc_tl_self_coll_init(ucc_base_coll_args_t *coll_args,
+                                   ucc_base_team_t *team,
+                                   ucc_coll_task_t **task_h)
+{
+    ucc_status_t status;
+    ucc_tl_self_task_t *task = ucc_tl_self_coll_init_task(coll_args, team);
+
+    if (ucc_unlikely(!task)) {
+        return UCC_ERR_NO_MEMORY;
+    }
+
+    switch (coll_args->args.coll_type) {
+    case UCC_COLL_TYPE_BARRIER:
+    case UCC_COLL_TYPE_BCAST:
+    case UCC_COLL_TYPE_FANIN:
+    case UCC_COLL_TYPE_FANOUT:
+        status = ucc_tl_self_coll_noop_init(task);
+        break;
+    /* single-rank collectives whose result is a plain src -> dst copy;
+     * SCATTER is listed in UCC_TL_SELF_SUPPORTED_COLLS and belongs here */
+    case UCC_COLL_TYPE_REDUCE:
+    case UCC_COLL_TYPE_GATHER:
+    case UCC_COLL_TYPE_SCATTER:
+    case UCC_COLL_TYPE_ALLTOALL:
+    case UCC_COLL_TYPE_ALLREDUCE:
+    case UCC_COLL_TYPE_ALLGATHER:
+    case UCC_COLL_TYPE_REDUCE_SCATTER:
+        status = ucc_tl_self_coll_copy_init(task);
+        break;
+    case UCC_COLL_TYPE_GATHERV:
+    case UCC_COLL_TYPE_ALLGATHERV:
+    case UCC_COLL_TYPE_REDUCE_SCATTERV:
+        status = ucc_tl_self_coll_copyv_init(task);
+        break;
+    case UCC_COLL_TYPE_ALLTOALLV:
+        status = ucc_tl_self_alltoallv_init(task);
+        break;
+    case UCC_COLL_TYPE_SCATTERV:
+        status = ucc_tl_self_scatterv_init(task);
+        break;
+    default:
+        status = UCC_ERR_NOT_SUPPORTED;
+    }
+    if (ucc_unlikely(status != UCC_OK)) {
+        ucc_tl_self_put_task(task);
+        return status;
+    }
+    tl_trace(team->context->lib, "init coll req %p", task);
+    *task_h = &task->super;
+    return status;
+}
diff --git a/src/components/tl/self/tl_self_context.c b/src/components/tl/self/tl_self_context.c
new file mode 100644
index 0000000000..2271105643
--- /dev/null
+++ b/src/components/tl/self/tl_self_context.c
@@ -0,0 +1,48 @@
+/**
+ * Copyright (C) Mellanox
Technologies Ltd. 2021-2022. ALL RIGHTS RESERVED.
+ * Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "tl_self.h"
+#include "utils/arch/cpu.h"
+#include <limits.h>
+
+UCC_CLASS_INIT_FUNC(ucc_tl_self_context_t,
+                    const ucc_base_context_params_t *params,
+                    const ucc_base_config_t *config)
+{
+    ucc_tl_self_context_config_t *tl_self_config =
+        ucc_derived_of(config, ucc_tl_self_context_config_t);
+    ucc_status_t status;
+
+    UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_self_config->super,
+                              params->context);
+    memcpy(&self->cfg, tl_self_config, sizeof(*tl_self_config));
+
+    status = ucc_mpool_init(&self->req_mp, 0, sizeof(ucc_tl_self_task_t), 0,
+                            UCC_CACHE_LINE_SIZE, 8, UINT_MAX, NULL,
+                            params->thread_mode, "tl_self_req_mp");
+    if (status != UCC_OK) {
+        tl_error(self->super.super.lib,
+                 "failed to initialize tl_self_req mpool");
+    }
+    return status;
+}
+
+UCC_CLASS_CLEANUP_FUNC(ucc_tl_self_context_t)
+{
+    tl_info(self->super.super.lib, "finalizing tl context: %p", self);
+    /* release the task pool created by the init function */
+    ucc_mpool_cleanup(&self->req_mp, 1);
+}
+
+UCC_CLASS_DEFINE(ucc_tl_self_context_t, ucc_tl_context_t);
+
+ucc_status_t
+ucc_tl_self_get_context_attr(const ucc_base_context_t *context, /* NOLINT */
+                             ucc_base_ctx_attr_t *attr)
+{
+    return UCC_OK;
+}
diff --git a/src/components/tl/self/tl_self_lib.c b/src/components/tl/self/tl_self_lib.c
new file mode 100644
index 0000000000..c1f2b43046
--- /dev/null
+++ b/src/components/tl/self/tl_self_lib.c
@@ -0,0 +1,39 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2021-2022. ALL RIGHTS RESERVED.
+ * Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "tl_self.h"
+
+/* NOLINTNEXTLINE params is not used*/
+UCC_CLASS_INIT_FUNC(ucc_tl_self_lib_t, const ucc_base_lib_params_t *params,
+                    const ucc_base_config_t *config)
+{
+    const ucc_tl_self_lib_config_t *tl_config =
+        ucc_derived_of(config, ucc_tl_self_lib_config_t);
+
+    UCC_CLASS_CALL_SUPER_INIT(ucc_tl_lib_t, &ucc_tl_self.super,
+                              &tl_config->super);
+    memcpy(&self->cfg, tl_config, sizeof(*tl_config));
+    tl_info(&self->super, "initialized lib object: %p", self);
+    return UCC_OK;
+}
+
+UCC_CLASS_CLEANUP_FUNC(ucc_tl_self_lib_t)
+{
+    tl_info(&self->super, "finalizing lib object: %p", self);
+}
+
+UCC_CLASS_DEFINE(ucc_tl_self_lib_t, ucc_tl_lib_t);
+
+ucc_status_t ucc_tl_self_get_lib_attr(const ucc_base_lib_t *lib, /* NOLINT */
+                                      ucc_base_lib_attr_t *base_attr)
+{
+    ucc_tl_lib_attr_t *attr = ucc_derived_of(base_attr, ucc_tl_lib_attr_t);
+
+    attr->super.attr.thread_mode = UCC_THREAD_MULTIPLE;
+    attr->super.attr.coll_types = UCC_TL_SELF_SUPPORTED_COLLS;
+    return UCC_OK;
+}
diff --git a/src/components/tl/self/tl_self_team.c b/src/components/tl/self/tl_self_team.c
new file mode 100644
index 0000000000..33222250a5
--- /dev/null
+++ b/src/components/tl/self/tl_self_team.c
@@ -0,0 +1,101 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2021-2022. ALL RIGHTS RESERVED.
+ * Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "coll_score/ucc_coll_score.h"
+#include "core/ucc_team.h"
+#include "tl_self.h"
+
+/* Team constructor: teams with more than one rank are declined with
+ * UCC_ERR_NOT_SUPPORTED, anything else is accepted immediately. */
+UCC_CLASS_INIT_FUNC(ucc_tl_self_team_t, ucc_base_context_t *tl_context,
+                    const ucc_base_team_params_t *params)
+{
+    ucc_tl_self_context_t *self_ctx =
+        ucc_derived_of(tl_context, ucc_tl_self_context_t);
+
+    UCC_CLASS_CALL_SUPER_INIT(ucc_tl_team_t, &self_ctx->super, params);
+
+    if (UCC_TL_TEAM_SIZE(self) <= 1) {
+        tl_info(tl_context->lib, "posted tl team: %p", self);
+        return UCC_OK;
+    }
+
+    tl_trace(tl_context->lib,
+             "team size %d is too large, max supported 1, skip",
+             UCC_TL_TEAM_SIZE(self));
+    return UCC_ERR_NOT_SUPPORTED;
+}
+
+/* Team destructor: nothing to release, only logs. */
+UCC_CLASS_CLEANUP_FUNC(ucc_tl_self_team_t)
+{
+    tl_info(self->super.super.context->lib, "finalizing tl team: %p", self);
+}
+
+UCC_CLASS_DEFINE_DELETE_FUNC(ucc_tl_self_team_t, ucc_base_team_t);
+
+UCC_CLASS_DEFINE(ucc_tl_self_team_t, ucc_tl_team_t);
+
+/* Destroy the team through the class delete function; always UCC_OK. */
+ucc_status_t ucc_tl_self_team_destroy(ucc_base_team_t *tl_team)
+{
+    UCC_CLASS_DELETE_FUNC_NAME(ucc_tl_self_team_t)(tl_team);
+    return UCC_OK;
+}
+
+/* Team creation needs no further steps: logs and reports UCC_OK. */
+ucc_status_t ucc_tl_self_team_create_test(ucc_base_team_t *tl_team)
+{
+    ucc_tl_self_team_t *self_team =
+        ucc_derived_of(tl_team, ucc_tl_self_team_t);
+
+    tl_info(tl_team->context->lib, "initialized tl team: %p", self_team);
+    return UCC_OK;
+}
+
+/* Build the score table of the team: default score for every supported
+ * collective and memory type, then the context score string, if any.
+ * On success *score_p receives the table. */
+ucc_status_t ucc_tl_self_team_get_scores(ucc_base_team_t *tl_team,
+                                         ucc_coll_score_t **score_p)
+{
+    ucc_tl_self_team_t *team = ucc_derived_of(tl_team, ucc_tl_self_team_t);
+    ucc_base_context_t *ctx = UCC_TL_TEAM_CTX(team);
+    ucc_memory_type_t mem_types[UCC_MEMORY_TYPE_LAST];
+    ucc_coll_score_t *score;
+    ucc_status_t status;
+    int n_types;
+
+    /* list every memory type below UCC_MEMORY_TYPE_LAST */
+    for (n_types = 0; n_types < UCC_MEMORY_TYPE_LAST; n_types++) {
+        mem_types[n_types] = (ucc_memory_type_t)n_types;
+    }
+
+    status = ucc_coll_score_build_default(
+        tl_team, UCC_TL_SELF_DEFAULT_SCORE, ucc_tl_self_coll_init,
+        UCC_TL_SELF_SUPPORTED_COLLS, mem_types, n_types, &score);
+    if (status != UCC_OK) {
+        return status;
+    }
+
+    if (strlen(ctx->score_str) > 0) {
+        status = ucc_coll_score_update_from_str(
+            ctx->score_str, score, UCC_TL_TEAM_SIZE(team),
+            ucc_tl_self_coll_init, &team->super.super,
+            UCC_TL_SELF_DEFAULT_SCORE, NULL);
+        /* invalid or unsupported strings are tolerated, other errors
+         * discard the table */
+        if ((status < 0) && (status != UCC_ERR_INVALID_PARAM) &&
+            (status != UCC_ERR_NOT_SUPPORTED)) {
+            ucc_coll_score_free(score);
+            return status;
+        }
+    }
+
+    *score_p = score;
+    return UCC_OK;
+}
diff --git a/src/components/tl/ucp/tl_ucp_team.c b/src/components/tl/ucp/tl_ucp_team.c index e385628b1b..d81cdb9bb5 100644 --- a/src/components/tl/ucp/tl_ucp_team.c +++ b/src/components/tl/ucp/tl_ucp_team.c @@ -20,6 +20,14 @@ UCC_CLASS_INIT_FUNC(ucc_tl_ucp_team_t, ucc_base_context_t *tl_context, UCC_CLASS_CALL_SUPER_INIT(ucc_tl_team_t, &ctx->super, params); /* TODO: init based on ctx settings and on params: need to check if all the necessary ranks mappings are provided */ + + if (UCC_TL_TEAM_SIZE(self) < 2) { + tl_trace(tl_context->lib, + "team size %d is too small, minimal size is 2", + UCC_TL_TEAM_SIZE(self)); + return UCC_ERR_NOT_SUPPORTED; + } + self->preconnect_task = NULL; self->seq_num = 0; self->status = UCC_INPROGRESS; diff --git a/src/core/ucc_context.c b/src/core/ucc_context.c index 544ead93fd..763205be6d 100644 --- a/src/core/ucc_context.c +++ b/src/core/ucc_context.c @@ -656,7 +656,8 @@ ucc_status_t ucc_context_create(ucc_lib_h lib, } ctx->id.pi = ucc_local_proc; ctx->id.seq_num = ucc_atomic_fadd32(&ucc_context_seq_num, 1); - if (params->mask & UCC_CONTEXT_PARAM_FIELD_OOB) { + if (params->mask & UCC_CONTEXT_PARAM_FIELD_OOB && + params->oob.n_oob_eps > 1) { do { /* UCC context create is blocking fn, so we can wait here for the completion of addr exchange */ @@ -680,7 +681,8 @@ ucc_status_t ucc_context_create(ucc_lib_h lib, ucc_assert(ctx->addr_storage.rank == params->oob.oob_ep); } if (config->internal_oob) { - if (params->mask & UCC_CONTEXT_PARAM_FIELD_OOB) { + if (params->mask & UCC_CONTEXT_PARAM_FIELD_OOB && + params->oob.n_oob_eps > 1) { ucc_base_team_params_t t_params;
ucc_base_team_t * b_team; status = ucc_tl_context_get(ctx, "ucp", &ctx->service_ctx); diff --git a/src/core/ucc_team.c b/src/core/ucc_team.c index f72f4020c9..1f07d1eb4b 100644 --- a/src/core/ucc_team.c +++ b/src/core/ucc_team.c @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * Copyright (c) Meta Platforms, Inc. and affiliates. 2022. * See file LICENSE for terms. */ @@ -38,8 +39,8 @@ static ucc_status_t ucc_team_create_post_single(ucc_context_t *context, { ucc_status_t status; - if (context->service_team) { - /* User internal service team for OOB */ + if (context->service_team && team->size > 1) { + /* User internal service team for OOB, skip OOB if team size is 1 */ ucc_subset_t subset = {.myrank = team->rank, .map.ep_num = team->size, .map.type = UCC_EP_MAP_FULL}; @@ -60,7 +61,8 @@ static ucc_status_t ucc_team_create_post_single(ucc_context_t *context, team->bp.team = team; team->bp.map.type = UCC_EP_MAP_FULL; team->bp.map.ep_num = team->size; - team->state = UCC_TEAM_ADDR_EXCHANGE; + team->state = (team->size > 1) ? 
UCC_TEAM_ADDR_EXCHANGE + : UCC_TEAM_CL_CREATE; team->last_team_create_posted = -1; team->status = UCC_INPROGRESS; return UCC_OK; @@ -110,8 +112,8 @@ ucc_status_t ucc_team_create_post(ucc_context_h *contexts, uint32_t num_contexts } team_size = params->ep_map.ep_num; } - if (team_size < 2) { - ucc_warn("minimal size of UCC team is 2, provided %llu", + if (team_size < 1) { + ucc_warn("minimal size of UCC team is 1, provided %llu", (unsigned long long)team_size); return UCC_ERR_INVALID_PARAM; } @@ -240,7 +242,7 @@ ucc_team_create_cls(ucc_context_t *context, ucc_team_t *team) ucc_base_team_t *b_team; ucc_status_t status; - if (context->topo && !team->topo) { + if (context->topo && !team->topo && team->size > 1) { ucc_subset_t subset; /* Context->topo is not NULL if any of the enabled CLs reported topo_required through the lib_attr */ @@ -487,7 +489,7 @@ static ucc_status_t ucc_team_destroy_single(ucc_team_h team) ucc_topo_cleanup(team->topo); - if (team->contexts[0]->service_team) { + if (team->contexts[0]->service_team && team->size > 1) { ucc_internal_oob_finalize(&team->bp.params.oob); } diff --git a/test/gtest/coll/test_bcast.cc b/test/gtest/coll/test_bcast.cc index 2821a1a437..dbbcfa65f4 100644 --- a/test/gtest/coll/test_bcast.cc +++ b/test/gtest/coll/test_bcast.cc @@ -200,6 +200,11 @@ UCC_TEST_P(test_bcast_1, multiple) int size = team->procs.size(); UccCollCtxVec ctx; + if (size == 1 && root > 0) { + /* skip team size 1 and root > 0, which are invalid */ + continue; + } + SET_MEM_TYPE(mem_type); set_root(root); diff --git a/test/gtest/coll/test_gather.cc b/test/gtest/coll/test_gather.cc index 8171732d33..8a958c55f5 100644 --- a/test/gtest/coll/test_gather.cc +++ b/test/gtest/coll/test_gather.cc @@ -242,6 +242,11 @@ UCC_TEST_P(test_gather_1, multiple_host) int size = team->procs.size(); UccCollCtxVec ctx; + if (size == 1 && root > 0) { + /* skip team size 1 and root > 0, which are invalid */ + continue; + } + this->set_inplace(inplace); SET_MEM_TYPE(mem_type); 
set_root(root); diff --git a/test/gtest/common/test_ucc.h b/test/gtest/common/test_ucc.h index c4c46c00ce..31fd612c68 100644 --- a/test/gtest/common/test_ucc.h +++ b/test/gtest/common/test_ucc.h @@ -202,9 +202,9 @@ class UccJob { UCC_JOB_CTX_GLOBAL, /*< ucc ctx create with OOB */ UCC_JOB_CTX_GLOBAL_ONESIDED } ucc_job_ctx_mode_t; - static const int nStaticTeams = 3; + static const int nStaticTeams = 4; static const int staticUccJobSize = 16; - static constexpr int staticTeamSizes[nStaticTeams] = {2, 11, staticUccJobSize}; + static constexpr int staticTeamSizes[nStaticTeams] = {1, 2, 11, staticUccJobSize}; static void cleanup(); static UccJob* getStaticJob(); static const std::vector &getStaticTeams(); diff --git a/test/gtest/core/test_context.cc b/test/gtest/core/test_context.cc index 981a986b27..2ded95b8e7 100644 --- a/test/gtest/core/test_context.cc +++ b/test/gtest/core/test_context.cc @@ -87,6 +87,9 @@ UCC_TEST_F(test_context_get_attr, work_buffer_size) UCC_TEST_F(test_context, global) { /* Create and cleanup several Jobs (ucc contextss) with OOB */ + UccJob job1(1, UccJob::UCC_JOB_CTX_GLOBAL); + job1.cleanup(); + UccJob job3(3, UccJob::UCC_JOB_CTX_GLOBAL); job3.cleanup(); diff --git a/test/gtest/core/test_team.cc b/test/gtest/core/test_team.cc index fd2937c573..7790d8c0e6 100644 --- a/test/gtest/core/test_team.cc +++ b/test/gtest/core/test_team.cc @@ -27,7 +27,8 @@ UCC_TEST_P(test_team, team_create_destroy_ctx_local) INSTANTIATE_TEST_CASE_P(, test_team, ::testing::Values( - 2, /* Minimal team size */ + 1, /* Minimal team size */ + 2, /* Minimal power of 2 */ 8, /* Some power of 2 */ 7 /* Some non-power of 2 */ ));