From 7f3c2f6e9717b5b43297b1bc57928eb6147f84f9 Mon Sep 17 00:00:00 2001 From: Sergey Lebedev Date: Wed, 13 Nov 2024 11:57:12 +0100 Subject: [PATCH] CORE: fix coll trace for service team (#1046) --- src/components/tl/mlx5/tl_mlx5.h | 3 --- src/components/tl/mlx5/tl_mlx5_context.c | 2 +- src/components/tl/sharp/tl_sharp_context.c | 7 ++++--- src/components/tl/ucc_tl.h | 6 +++++- src/components/tl/ucp/tl_ucp.h | 5 +---- src/components/tl/ucp/tl_ucp_ep.c | 2 +- src/components/tl/ucp/tl_ucp_ep.h | 2 +- src/components/tl/ucp/tl_ucp_team.c | 5 +++-- src/core/ucc_service_coll.c | 4 +--- src/core/ucc_service_coll.h | 5 ++++- 10 files changed, 21 insertions(+), 20 deletions(-) diff --git a/src/components/tl/mlx5/tl_mlx5.h b/src/components/tl/mlx5/tl_mlx5.h index 1b6404e6bd..159ecda8ed 100644 --- a/src/components/tl/mlx5/tl_mlx5.h +++ b/src/components/tl/mlx5/tl_mlx5.h @@ -180,9 +180,6 @@ typedef struct ucc_tl_mlx5_rcache_region { #define UCC_TL_CTX_LIB(_ctx) \ (ucc_derived_of((_ctx)->super.super.lib, ucc_tl_mlx5_lib_t)) -#define IS_SERVICE_TEAM(_team) \ - ((_team)->super.super.params.scope == UCC_CL_LAST + 1) - #define SQUARED(_num) ((_num) * (_num)) ucc_status_t tl_mlx5_create_rcache(ucc_tl_mlx5_context_t *ctx); diff --git a/src/components/tl/mlx5/tl_mlx5_context.c b/src/components/tl/mlx5/tl_mlx5_context.c index d0539e83c9..1011fe72c0 100644 --- a/src/components/tl/mlx5/tl_mlx5_context.c +++ b/src/components/tl/mlx5/tl_mlx5_context.c @@ -251,7 +251,7 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context) while (UCC_INPROGRESS == (status = ucc_collective_test(&req->super))) { ucc_context_progress(core_ctx); } - ucc_collective_finalize(&req->super); + ucc_collective_finalize_internal(req); if (UCC_OK != status) { tl_debug(context->lib, "failure during mlx5 ctx bcast"); diff --git a/src/components/tl/sharp/tl_sharp_context.c b/src/components/tl/sharp/tl_sharp_context.c index 8a58d25f5e..42d10f8d87 100644 --- a/src/components/tl/sharp/tl_sharp_context.c +++ b/src/components/tl/sharp/tl_sharp_context.c @@ -7,6 +7,7 @@ #include #include "tl_sharp.h" #include "utils/arch/cpu.h" +#include "core/ucc_service_coll.h" static int ucc_tl_sharp_oob_barrier(void *arg) { @@ -141,7 +142,7 @@ static int ucc_tl_sharp_service_barrier(void *arg) ucc_context_progress(ctx->super.super.ucc_context); status = ucc_collective_test(&req->super); } while (status == UCC_INPROGRESS); - ucc_collective_finalize(&req->super); + ucc_collective_finalize_internal(req); return status; } @@ -179,7 +180,7 @@ static int ucc_tl_sharp_service_gather(void *arg, int root, void *sbuf, ucc_context_progress(ctx->super.super.ucc_context); status = ucc_collective_test(&req->super); } while (status == UCC_INPROGRESS); - ucc_collective_finalize(&req->super); + ucc_collective_finalize_internal(req); if (subset.myrank != root) { ucc_free(rbuf); @@ -208,7 +209,7 @@ static int ucc_tl_sharp_service_bcast(void *arg, void *buf, int size, int root) status = ucc_collective_test(&req->super); } while (status == UCC_INPROGRESS); - ucc_collective_finalize(&req->super); + ucc_collective_finalize_internal(req); return status; } diff --git a/src/components/tl/ucc_tl.h b/src/components/tl/ucc_tl.h index 75a5e3e1a0..bd3633db27 100644 --- a/src/components/tl/ucc_tl.h +++ b/src/components/tl/ucc_tl.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -165,4 +165,8 @@ typedef struct ucc_tl_lib_attr { #define UCC_TL_TEAM_MAP(_tl_team) (_tl_team)->super.super.params.map #define UCC_TL_TEAM_OOB(_tl_team) (_tl_team)->super.super.params.params.oob + +#define UCC_TL_IS_SERVICE_TEAM(_tl_team) \ + ((_tl_team)->super.super.params.scope == UCC_CL_LAST + 1) + #endif diff --git a/src/components/tl/ucp/tl_ucp.h b/src/components/tl/ucp/tl_ucp.h index 894a33bc6f..3c439f4ae5 100644 --- a/src/components/tl/ucp/tl_ucp.h +++ b/src/components/tl/ucp/tl_ucp.h @@ -175,11 +175,8 @@ extern ucc_config_field_t ucc_tl_ucp_lib_config_table[]; #define UCC_TL_UCP_TEAM_CTX(_team) \ (ucc_derived_of((_team)->super.super.context, ucc_tl_ucp_context_t)) -#define IS_SERVICE_TEAM(_team) \ - ((_team)->super.super.params.scope == UCC_CL_LAST + 1) - #define USE_SERVICE_WORKER(_team) \ - (IS_SERVICE_TEAM(_team) && UCC_TL_UCP_TEAM_CTX(_team)->cfg.service_worker) + (UCC_TL_IS_SERVICE_TEAM(_team) && UCC_TL_UCP_TEAM_CTX(_team)->cfg.service_worker) #define UCC_TL_UCP_TASK_TEAM(_task) \ (ucc_derived_of((_task)->super.team, ucc_tl_ucp_team_t)) diff --git a/src/components/tl/ucp/tl_ucp_ep.c b/src/components/tl/ucp/tl_ucp_ep.c index 096564b7b3..8cfdaa1276 100644 --- a/src/components/tl/ucp/tl_ucp_ep.c +++ b/src/components/tl/ucp/tl_ucp_ep.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ diff --git a/src/components/tl/ucp/tl_ucp_ep.h b/src/components/tl/ucp/tl_ucp_ep.h index 6874ef6ccf..d5f46e457c 100644 --- a/src/components/tl/ucp/tl_ucp_ep.h +++ b/src/components/tl/ucp/tl_ucp_ep.h @@ -64,7 +64,7 @@ static inline ucc_status_t ucc_tl_ucp_get_ep(ucc_tl_ucp_team_t *team, ucc_team_t *core_team = UCC_TL_CORE_TEAM(team); /* Core super.super.team ptr is NULL for service_team which has scope == UCC_CL_LAST + 1*/ - ucc_assert((NULL != core_team) || IS_SERVICE_TEAM(team)); + ucc_assert((NULL != core_team) || UCC_TL_IS_SERVICE_TEAM(team)); ctx_rank = core_team ? ucc_get_ctx_rank(core_team, core_rank) : core_rank; *ep = team->worker->eps[ctx_rank]; diff --git a/src/components/tl/ucp/tl_ucp_team.c b/src/components/tl/ucp/tl_ucp_team.c index 4bde47430c..b6db1654e4 100644 --- a/src/components/tl/ucp/tl_ucp_team.c +++ b/src/components/tl/ucp/tl_ucp_team.c @@ -73,7 +73,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_ucp_team_t, ucc_base_context_t *tl_context, } } - if (ucc_global_config.file_cfg && !IS_SERVICE_TEAM(self) && + if (ucc_global_config.file_cfg && !UCC_TL_IS_SERVICE_TEAM(self) && ctx->topo_required && tl_context->lib->use_tuning) { status = ucc_add_team_sections(&self->cfg, ucc_tl_ucp_lib_config_table, self->topo, &self->tuning_str, @@ -91,7 +91,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_ucp_team_t, ucc_base_context_t *tl_context, self->cfg.use_reordering = 0; } - if (self->topo && !IS_SERVICE_TEAM(self) && self->topo->topo->sock_bound) { + if (self->topo && !UCC_TL_IS_SERVICE_TEAM(self) && + self->topo->topo->sock_bound) { tsize = UCC_TL_TEAM_SIZE(self); max_radix = (ucc_topo_max_ppn(self->topo) == 1) ? tsize : ucc_min(tsize, ucc_topo_min_socket_size(self->topo)); diff --git a/src/core/ucc_service_coll.c b/src/core/ucc_service_coll.c index 711a61cdde..c5d51fb1f2 100644 --- a/src/core/ucc_service_coll.c +++ b/src/core/ucc_service_coll.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -139,8 +139,6 @@ ucc_status_t ucc_service_coll_test(ucc_service_coll_req_t *req) return status; } -ucc_status_t ucc_collective_finalize_internal(ucc_coll_task_t *task); - ucc_status_t ucc_service_coll_finalize(ucc_service_coll_req_t *req) { ucc_status_t status; diff --git a/src/core/ucc_service_coll.h b/src/core/ucc_service_coll.h index 9fc8733419..09e6088b78 100644 --- a/src/core/ucc_service_coll.h +++ b/src/core/ucc_service_coll.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * See file LICENSE for terms. */ @@ -37,4 +37,7 @@ ucc_status_t ucc_internal_oob_init(ucc_team_t *team, ucc_subset_t subset, ucc_team_oob_coll_t *oob); void ucc_internal_oob_finalize(ucc_team_oob_coll_t *oob); + +ucc_status_t ucc_collective_finalize_internal(ucc_coll_task_t *task); + #endif