diff --git a/.ci/Dockerfile.centos8 b/.ci/Dockerfile.centos8
index 05859cb55f..fc20124ef3 100644
--- a/.ci/Dockerfile.centos8
+++ b/.ci/Dockerfile.centos8
@@ -4,13 +4,20 @@ FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base
 RUN rm -rf ${SRC_DIR}/ucc
 COPY . ${SRC_DIR}/ucc
+RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
+    sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
+RUN yum install -y sudo && \
+    echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
 #==============================================================================
 # Build UCC
 RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
 #==============================================================================
 # Install torch_ucc (UCC version) python module and build a wheel package
 RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch_ucc.sh
+RUN chown -R 6213:11429 /opt/nvidia
 #==============================================================================
 RUN groupadd -g 11429 swx-jenkins
 RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
 #==============================================================================
+USER swx-jenkins
+
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
index ed5fa2528b..af23b10578 100644
--- a/.ci/job_matrix.yaml
+++ b/.ci/job_matrix.yaml
@@ -20,13 +20,13 @@ volumes:
   }
 
 env:
-  CUDA_VER: "11.4.2"
+  CUDA_VER: '11.4.2'
   UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
   UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}"
   NVIDIA_ROOT_DIR: "/opt/nvidia"
   SRC_DIR: "${NVIDIA_ROOT_DIR}/src"
   BIN_DIR: "${NVIDIA_ROOT_DIR}/bin"
-  DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all --user root"
+  DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all"
 
 docker_opt: "$DOCKER_OPT"
 
@@ -54,6 +54,7 @@ runs_on_dockers:
 # bare metal
 runs_on_agents:
   - nodeLabel: "swx-clx01"
+  - nodeLabel: "ml-test-node-gpu"
 
 timeout_minutes: 360
 
@@ -72,7 +73,7 @@ steps:
   #============================================================================
   - name: Run Coverity
     credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0"
-    agentSelector: "{nodeLabel: 'swx-clx01'}"
+    agentSelector: "{nodeLabel: 'ml-test-node-gpu'}"
     run: |
       export UCC_PASSWORD=$UCC_PASSWORD
       export UCC_USERNAME=$UCC_USERNAME
diff --git a/.ci/scripts/coverity.sh b/.ci/scripts/coverity.sh
index 61f124eccb..63122fbbd5 100755
--- a/.ci/scripts/coverity.sh
+++ b/.ci/scripts/coverity.sh
@@ -24,9 +24,6 @@ module load hpcx-gcc
 module load dev/cuda12.1.1
 module load dev/nccl_2.18.3-1_cuda12.1.1_"$(uname -i)"
 module load tools/cov-2021.12
-previous_date=$(date -d "yesterday" +'%Y-%m-%d')
-HPCX_UCX_DIR=/hpc/local/benchmarks/daily/next/$previous_date/hpcx-gcc-redhat7/ucx
-HPCX_SHARP_DIR=/hpc/local/benchmarks/daily/next/$previous_date/hpcx-gcc-redhat7/sharp
 ./autogen.sh
 ./configure --with-nccl --with-tls=cuda,nccl,self,sharp,shm,ucp,mlx5 --with-ucx="${HPCX_UCX_DIR}" --with-sharp="${HPCX_SHARP_DIR}"
 make_opt="-j$(($(nproc) / 2 + 1))"
diff --git a/.ci/scripts/run_dlrm.sh b/.ci/scripts/run_dlrm.sh
index 46e71542bc..da9ebc2e8b 100755
--- a/.ci/scripts/run_dlrm.sh
+++ b/.ci/scripts/run_dlrm.sh
@@ -32,7 +32,6 @@ mpirun \
     -np $NP \
     --hostfile ${HOSTFILE} \
     --map-by node \
-    --allow-run-as-root \
     --mca plm_rsh_args '-p 12345' \
     -x PATH \
    -x LD_LIBRARY_PATH \
@@ -43,7 +42,6 @@ mpirun \
     -np $NP \
     --hostfile ${HOSTFILE} \
     --map-by node \
-    --allow-run-as-root \
     --mca plm_rsh_args '-p 12345' \
     -x PATH \
     -x LD_LIBRARY_PATH \
@@ -54,7 +52,6 @@ mpirun \
     -np $NP \
     --hostfile ${HOSTFILE} \
     --map-by node \
-    --allow-run-as-root \
     --mca plm_rsh_args '-p 12345' \
     -x PATH \
     -x LD_LIBRARY_PATH \
diff --git a/.ci/scripts/run_dlrm_docker.sh b/.ci/scripts/run_dlrm_docker.sh
index 7936a3c509..39fb74e92b 100755
--- a/.ci/scripts/run_dlrm_docker.sh
+++ b/.ci/scripts/run_dlrm_docker.sh
@@ -19,4 +19,4 @@ HEAD_NODE=$(head -1 "$HOSTFILE")
 export HEAD_NODE
 
 #sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh cpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
-sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh gpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
+ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh gpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
diff --git a/.ci/scripts/run_docker.sh b/.ci/scripts/run_docker.sh
index 10f16bc352..7f141d65c9 100755
--- a/.ci/scripts/run_docker.sh
+++ b/.ci/scripts/run_docker.sh
@@ -41,13 +41,11 @@ DOCKER_RUN_ARGS="\
 --cap-add=SYS_ADMIN \
 --device=/dev/infiniband/ \
 --gpus all \
---user root \
 -it \
 -d \
 --rm \
 --name=${DOCKER_CONTAINER_NAME} \
 -v /labhome:/labhome \
--v /root/.ssh:/root/.ssh \
 "
 
 # shellcheck disable=SC2013
@@ -78,16 +76,16 @@ pdsh -w "${HOST_LIST}" -R ssh docker pull "${DOCKER_IMAGE_NAME}"
 
 for HOST in $(cat "$HOSTFILE"); do
     echo "INFO: start docker container on $HOST ..."
     # shellcheck disable=SC2029
-    sudo ssh "$HOST" "docker run \
+    ssh "$HOST" "docker run \
         ${DOCKER_RUN_ARGS} \
         ${DOCKER_IMAGE_NAME} \
-        bash -c '/usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'"
+        bash -c 'sudo /usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'"
     echo "INFO: start docker container on $HOST ... DONE"
     sleep 5
 
     echo "INFO: verify docker container on $HOST ..."
-    sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname
-    sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup
+    ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname
+    ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup
     echo "INFO: verify docker container on $HOST ... DONE"
 done
diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh
index 2eb26ea526..4701a7c04e 100755
--- a/.ci/scripts/run_tests_ucc_mpi.sh
+++ b/.ci/scripts/run_tests_ucc_mpi.sh
@@ -46,7 +46,7 @@ function mpi_params {
         nnodes=$NNODES
     fi
     echo "-np $((nnodes*ppn)) --oversubscribe --hostfile ${HOSTFILE} \
---map-by ppr:$ppn:node --bind-to socket --allow-run-as-root \
+--map-by ppr:$ppn:node --bind-to socket \
 -x PATH -x LD_LIBRARY_PATH --mca opal_common_ucx_opal_mem_hooks 1 --mca plm_rsh_args -p12345 \
 --mca coll ^ucc,hcoll \
 -x UCX_NET_DEVICES=$DEV:1"
diff --git a/.ci/scripts/run_tests_ucc_mpi_docker.sh b/.ci/scripts/run_tests_ucc_mpi_docker.sh
index c3e6736c4a..c1929e7cc3 100755
--- a/.ci/scripts/run_tests_ucc_mpi_docker.sh
+++ b/.ci/scripts/run_tests_ucc_mpi_docker.sh
@@ -18,4 +18,4 @@ fi
 
 HEAD_NODE=$(head -1 "$HOSTFILE")
 export HEAD_NODE
-sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_mpi.sh "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
+ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_mpi.sh "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
diff --git a/src/components/tl/mlx5/tl_mlx5_context.c b/src/components/tl/mlx5/tl_mlx5_context.c
index 1a65c1d690..0c56ff9390 100644
--- a/src/components/tl/mlx5/tl_mlx5_context.c
+++ b/src/components/tl/mlx5/tl_mlx5_context.c
@@ -26,6 +26,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t,
     UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_mlx5_config->super,
                               params->context);
     memcpy(&self->cfg, tl_mlx5_config, sizeof(*tl_mlx5_config));
+    self->sock       = 0;
     self->rcache     = NULL;
     self->shared_pd  = NULL;
     self->shared_ctx = NULL;
@@ -73,8 +74,11 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_mlx5_context_t)
         tl_debug(self->super.super.lib, "failed to free ib ctx and pd");
     };
 
+    if (self->sock) {
+        close(self->sock);
+    }
+
     ucc_mpool_cleanup(&self->req_mp, 1);
-    close(self->sock);
 }
 
 UCC_CLASS_DEFINE(ucc_tl_mlx5_context_t, ucc_tl_context_t);
diff --git a/src/components/tl/mlx5/tl_mlx5_pd.c b/src/components/tl/mlx5/tl_mlx5_pd.c
index 2e4e4ff676..a553dbc5f5 100644
--- a/src/components/tl/mlx5/tl_mlx5_pd.c
+++ b/src/components/tl/mlx5/tl_mlx5_pd.c
@@ -127,7 +127,7 @@ ucc_status_t ucc_tl_mlx5_socket_init(ucc_tl_mlx5_context_t *ctx,
 
 static ucc_status_t client_recv_data(int *shared_cmd_fd,
                                      uint32_t *shared_pd_handle,
-                                     const char *sock_path,
+                                     const char *sock_path, int *sock_p,
                                      ucc_tl_mlx5_lib_t *lib)
 {
     struct sockaddr_storage sockaddr = {};
@@ -159,7 +159,8 @@ static ucc_status_t client_recv_data(int *shared_cmd_fd,
         goto out;
     }
 
-    return status;
+    *sock_p = sock;
+    return UCC_OK;
 
 out:
     if (close(sock) == -1) {
@@ -229,7 +230,8 @@ ucc_status_t ucc_tl_mlx5_share_ctx_pd(ucc_tl_mlx5_context_t *ctx,
     ucc_status_t status;
 
     if (!is_ctx_owner) {
-        status = client_recv_data(&ctx_fd, &pd_handle, sock_path, lib);
+        status =
+            client_recv_data(&ctx_fd, &pd_handle, sock_path, &ctx->sock, lib);
         if (UCC_OK != status) {
             tl_debug(lib, "failed to share ctx & pd from client side");
             return status;
diff --git a/src/components/tl/ucp/Makefile.am b/src/components/tl/ucp/Makefile.am
index b7f93b881e..cc0a118c60 100644
--- a/src/components/tl/ucp/Makefile.am
+++ b/src/components/tl/ucp/Makefile.am
@@ -9,10 +9,11 @@ SUBDIRS = .
 include makefile.coll_plugins.am
 endif
 
-allgather =                    \
-	allgather/allgather.h      \
-	allgather/allgather.c      \
-	allgather/allgather_ring.c \
+allgather =                        \
+	allgather/allgather.h          \
+	allgather/allgather.c          \
+	allgather/allgather_ring.c     \
+	allgather/allgather_neighbor.c \
 	allgather/allgather_knomial.c
 
 allgatherv =                 \
diff --git a/src/components/tl/ucp/allgather/allgather.c b/src/components/tl/ucp/allgather/allgather.c
index e0ef976c01..90b06e99ee 100644
--- a/src/components/tl/ucp/allgather/allgather.c
+++ b/src/components/tl/ucp/allgather/allgather.c
@@ -7,6 +7,8 @@
 #include "tl_ucp.h"
 #include "allgather.h"
 
+#define ALLGATHER_MAX_PATTERN_SIZE (sizeof(UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR))
+
 ucc_base_coll_alg_info_t
     ucc_tl_ucp_allgather_algs[UCC_TL_UCP_ALLGATHER_ALG_LAST + 1] = {
         [UCC_TL_UCP_ALLGATHER_ALG_KNOMIAL] =
@@ -17,6 +19,10 @@ ucc_base_coll_alg_info_t
             {.id   = UCC_TL_UCP_ALLGATHER_ALG_RING,
              .name = "ring",
              .desc = "O(N) Ring"},
+        [UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR] =
+            {.id   = UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR,
+             .name = "neighbor",
+             .desc = "O(N) Neighbor Exchange N/2 steps"},
         [UCC_TL_UCP_ALLGATHER_ALG_LAST] = {
             .id = 0, .name = NULL, .desc = NULL}};
 
@@ -24,3 +30,16 @@ ucc_status_t ucc_tl_ucp_allgather_init(ucc_tl_ucp_task_t *task)
 {
     return ucc_tl_ucp_allgather_ring_init_common(task);
 }
+
+char *ucc_tl_ucp_allgather_score_str_get(ucc_tl_ucp_team_t *team)
+{
+    int   max_size = ALLGATHER_MAX_PATTERN_SIZE;
+    int   algo_num = UCC_TL_TEAM_SIZE(team) % 2
+                         ? UCC_TL_UCP_ALLGATHER_ALG_RING
+                         : UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR;
+    char *str      = ucc_malloc(max_size * sizeof(char));
+
+    ucc_snprintf_safe(str, max_size,
+                      UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR, algo_num);
+    return str;
+}
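Reviewer note: ALLGATHER_MAX_PATTERN_SIZE reuses sizeof() of the template string, which is sufficient because the "%d" placeholder (two characters) is replaced by a single-digit algorithm id (knomial = 0, ring = 1, neighbor = 2, per the enum in allgather.h). A standalone sketch of the expansion that ucc_tl_ucp_allgather_score_str_get() performs, with the team size assumed for the example:

```c
#include <stdio.h>

#define SELECT_STR "allgather:0-4k:@0#allgather:4k-inf:@%d"

int main(void)
{
    char str[sizeof(SELECT_STR)]; /* "%d" shrinks to one digit, so it fits */
    int  team_size = 8;           /* assumed even team */
    int  algo_num  = team_size % 2 ? 1 /* ring */ : 2 /* neighbor */;

    snprintf(str, sizeof(str), SELECT_STR, algo_num);
    puts(str); /* -> allgather:0-4k:@0#allgather:4k-inf:@2 */
    return 0;
}
```

The parity check mirrors the init-time fallback: odd teams keep the ring default, even teams get neighbor exchange for messages above 4k.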
diff --git a/src/components/tl/ucp/allgather/allgather.h b/src/components/tl/ucp/allgather/allgather.h
index d7d9c2ea8d..b68ab00e95 100644
--- a/src/components/tl/ucp/allgather/allgather.h
+++ b/src/components/tl/ucp/allgather/allgather.h
@@ -11,14 +11,17 @@
 enum {
     UCC_TL_UCP_ALLGATHER_ALG_KNOMIAL,
     UCC_TL_UCP_ALLGATHER_ALG_RING,
+    UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR,
     UCC_TL_UCP_ALLGATHER_ALG_LAST
 };
 
 extern ucc_base_coll_alg_info_t
-             ucc_tl_ucp_allgather_algs[UCC_TL_UCP_ALLGATHER_ALG_LAST + 1];
+    ucc_tl_ucp_allgather_algs[UCC_TL_UCP_ALLGATHER_ALG_LAST + 1];
 
 #define UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR \
-    "allgather:0-4k:@0#allgather:4k-inf:@1"
+    "allgather:0-4k:@0#allgather:4k-inf:@%d"
+
+char *ucc_tl_ucp_allgather_score_str_get(ucc_tl_ucp_team_t *team);
 
 static inline int ucc_tl_ucp_allgather_alg_from_str(const char *str)
 {
@@ -33,20 +36,30 @@ static inline int ucc_tl_ucp_allgather_alg_from_str(const char *str)
 
 ucc_status_t ucc_tl_ucp_allgather_init(ucc_tl_ucp_task_t *task);
 
+/* Ring */
 ucc_status_t ucc_tl_ucp_allgather_ring_init(ucc_base_coll_args_t *coll_args,
-                                            ucc_base_team_t *     team,
-                                            ucc_coll_task_t **    task_h);
+                                            ucc_base_team_t      *team,
+                                            ucc_coll_task_t     **task_h);
 
 ucc_status_t ucc_tl_ucp_allgather_ring_init_common(ucc_tl_ucp_task_t *task);
 
-void         ucc_tl_ucp_allgather_ring_progress(ucc_coll_task_t *task);
+void ucc_tl_ucp_allgather_ring_progress(ucc_coll_task_t *task);
 
 ucc_status_t ucc_tl_ucp_allgather_ring_start(ucc_coll_task_t *task);
 
+/* Neighbor Exchange */
+ucc_status_t ucc_tl_ucp_allgather_neighbor_init(ucc_base_coll_args_t *coll_args,
+                                                ucc_base_team_t      *team,
+                                                ucc_coll_task_t     **task_h);
+
+void ucc_tl_ucp_allgather_neighbor_progress(ucc_coll_task_t *task);
+
+ucc_status_t ucc_tl_ucp_allgather_neighbor_start(ucc_coll_task_t *task);
+
 /* Uses allgather_kn_radix from config */
 ucc_status_t
 ucc_tl_ucp_allgather_knomial_init(ucc_base_coll_args_t *coll_args,
-                                  ucc_base_team_t *     team,
-                                  ucc_coll_task_t **    task_h);
+                                  ucc_base_team_t      *team,
+                                  ucc_coll_task_t     **task_h);
 
 /* Internal interface with custom radix */
 ucc_status_t ucc_tl_ucp_allgather_knomial_init_r(
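Reviewer note on the schedule implemented in the new file below: ranks first pair up with an adjacent partner (0 with 1, 2 with 3, and so on) and trade single blocks; on each later step a rank alternates between its two neighbors and forwards the two contiguous blocks it received on the previous step, so the whole exchange finishes in tsize/2 steps. For example, with tsize = 6, rank 0 exchanges with rank 1 (blocks 0 and 1), then with rank 5 (receiving blocks 4 and 5), then with rank 1 again (receiving blocks 2 and 3). A simulation sketch that checks this bookkeeping follows the new file's diff.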
diff --git a/src/components/tl/ucp/allgather/allgather_neighbor.c b/src/components/tl/ucp/allgather/allgather_neighbor.c
new file mode 100644
index 0000000000..771ba2d3b8
--- /dev/null
+++ b/src/components/tl/ucp/allgather/allgather_neighbor.c
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+#include "config.h"
+#include "tl_ucp.h"
+#include "allgather.h"
+#include "core/ucc_progress_queue.h"
+#include "tl_ucp_sendrecv.h"
+#include "utils/ucc_math.h"
+#include "utils/ucc_coll_utils.h"
+#include "components/mc/ucc_mc.h"
+
+static ucc_rank_t get_recv_from_rank(ucc_rank_t rank, ucc_rank_t size, int i)
+{
+    const int  i_parity = i % 2;
+    ucc_rank_t offset_at_step[2], recv_data_from;
+
+    if (rank % 2) {
+        recv_data_from    = (rank - 1 + size) % size;
+        offset_at_step[0] = (-2);
+        offset_at_step[1] = (+2);
+    } else {
+        recv_data_from    = rank;
+        offset_at_step[0] = (+2);
+        offset_at_step[1] = (-2);
+    }
+
+    return (recv_data_from + offset_at_step[i_parity] * ucc_div_round_up(i, 2) +
+            size) %
+           size;
+}
+
+ucc_status_t ucc_tl_ucp_allgather_neighbor_init(ucc_base_coll_args_t *coll_args,
+                                                ucc_base_team_t      *team,
+                                                ucc_coll_task_t     **task_h)
+{
+    ucc_status_t       status = UCC_OK;
+    ucc_tl_ucp_task_t *task;
+    ucc_tl_ucp_team_t *ucp_team;
+
+    task     = ucc_tl_ucp_init_task(coll_args, team);
+    ucp_team = TASK_TEAM(task);
+
+    if (!ucc_coll_args_is_predefined_dt(&TASK_ARGS(task), UCC_RANK_INVALID)) {
+        tl_error(UCC_TASK_LIB(task), "user defined datatype is not supported");
+        status = UCC_ERR_NOT_SUPPORTED;
+        goto out;
+    }
+
+    if (UCC_TL_TEAM_SIZE(ucp_team) % 2) {
+        tl_debug(UCC_TASK_LIB(task),
+                 "odd team size is not supported, switching to ring");
+        status = ucc_tl_ucp_allgather_ring_init_common(task);
+    } else {
+        task->super.post     = ucc_tl_ucp_allgather_neighbor_start;
+        task->super.progress = ucc_tl_ucp_allgather_neighbor_progress;
+    }
+
+out:
+    if (status != UCC_OK) {
+        ucc_tl_ucp_put_task(task);
+        return status;
+    }
+
+    *task_h = &task->super;
+    return status;
+}
+
+/* Original implementation: https://github.com/open-mpi/ompi/blob/main/ompi/mca/coll/base/coll_base_allgather.c */
+void ucc_tl_ucp_allgather_neighbor_progress(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_ucp_task_t *task  = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
+    ucc_tl_ucp_team_t *team  = TASK_TEAM(task);
+    ucc_rank_t         trank = UCC_TL_TEAM_RANK(team);
+    ucc_rank_t         tsize = UCC_TL_TEAM_SIZE(team);
+    void              *rbuf  = TASK_ARGS(task).dst.info.buffer;
+    ucc_memory_type_t  rmem  = TASK_ARGS(task).dst.info.mem_type;
+    ucc_datatype_t     dt    = TASK_ARGS(task).dst.info.datatype;
+    size_t             count = TASK_ARGS(task).dst.info.count;
+    size_t             data_size = (count / tsize) * ucc_dt_size(dt);
+    ucc_rank_t         neighbors[2], i;
+    int                i_parity, even_rank;
+    void              *tmprecv, *tmpsend;
+
+    if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) {
+        return;
+    }
+
+    even_rank = !(trank % 2);
+    if (even_rank) {
+        neighbors[0] = (trank + 1) % tsize;
+        neighbors[1] = (trank - 1 + tsize) % tsize;
+    } else {
+        neighbors[0] = (trank - 1 + tsize) % tsize;
+        neighbors[1] = (trank + 1) % tsize;
+    }
+
+    while (task->tagged.send_posted < (tsize / 2)) {
+        i        = task->tagged.send_posted;
+        i_parity = i % 2;
+
+        tmprecv =
+            PTR_OFFSET(rbuf, get_recv_from_rank(trank, tsize, i) * data_size);
+        tmpsend = PTR_OFFSET(rbuf, get_recv_from_rank(trank, tsize, i - 1) *
+                                       data_size);
+
+        /* Sendreceive */
+        UCPCHECK_GOTO(ucc_tl_ucp_send_nb(tmpsend, 2 * data_size, rmem,
+                                         neighbors[i_parity], team, task),
+                      task, out);
+        UCPCHECK_GOTO(ucc_tl_ucp_recv_nb(tmprecv, 2 * data_size, rmem,
+                                         neighbors[i_parity], team, task),
+                      task, out);
+
+        if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) {
+            return;
+        }
+    }
+
+    ucc_assert(UCC_TL_UCP_TASK_P2P_COMPLETE(task));
+    task->super.status = UCC_OK;
+
+out:
+    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_neighbor_done",
+                                     0);
+}
+
+ucc_status_t ucc_tl_ucp_allgather_neighbor_start(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_ucp_task_t *task  = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
+    ucc_tl_ucp_team_t *team  = TASK_TEAM(task);
+    size_t             count = TASK_ARGS(task).dst.info.count;
+    void              *sbuf  = TASK_ARGS(task).src.info.buffer;
+    void              *rbuf  = TASK_ARGS(task).dst.info.buffer;
+    ucc_memory_type_t  smem  = TASK_ARGS(task).src.info.mem_type;
+    ucc_memory_type_t  rmem  = TASK_ARGS(task).dst.info.mem_type;
+    ucc_datatype_t     dt    = TASK_ARGS(task).dst.info.datatype;
+    ucc_rank_t         trank = UCC_TL_TEAM_RANK(team);
+    ucc_rank_t         tsize = UCC_TL_TEAM_SIZE(team);
+    size_t             data_size = (count / tsize) * ucc_dt_size(dt);
+    ucc_status_t       status;
+    ucc_rank_t         neighbor;
+    void              *tmprecv, *tmpsend;
+
+    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_neighbor_start",
+                                     0);
+    ucc_tl_ucp_task_reset(task, UCC_INPROGRESS);
+
+    if (!UCC_IS_INPLACE(TASK_ARGS(task))) {
+        status = ucc_mc_memcpy(PTR_OFFSET(rbuf, data_size * trank), sbuf,
+                               data_size, rmem, smem);
+        if (ucc_unlikely(UCC_OK != status)) {
+            return status;
+        }
+    }
+
+    if (trank % 2) {
+        neighbor = (trank - 1 + tsize) % tsize;
+    } else {
+        neighbor = (trank + 1) % tsize;
+    }
+
+    tmprecv = PTR_OFFSET(rbuf, neighbor * data_size);
+    tmpsend = PTR_OFFSET(rbuf, trank * data_size);
+
+    /* Sendreceive */
+    UCPCHECK_GOTO(
+        ucc_tl_ucp_send_nb(tmpsend, data_size, rmem, neighbor, team, task),
+        task, out);
+    UCPCHECK_GOTO(
+        ucc_tl_ucp_recv_nb(tmprecv, data_size, rmem, neighbor, team, task),
+        task, out);
+out:
+    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
+}
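Reviewer note: a host-only model of the block bookkeeping in allgather_neighbor.c above, useful for convincing oneself the schedule terminates with full coverage. It mirrors get_recv_from_rank() in plain int arithmetic and asserts two invariants the kernel relies on: the partner always sends exactly the region it received on the previous step, and that region starts at an even index, so the 2*data_size transfer never wraps past the end of the buffer. Illustration only; TSIZE and all names are local to this sketch:

```c
#include <assert.h>
#include <stdio.h>

#define TSIZE 8 /* any even team size */

/* Same schedule as get_recv_from_rank() in the patch. */
static int recv_from(int rank, int size, int i)
{
    int base = (rank % 2) ? (rank - 1 + size) % size : rank;
    int dir  = (rank % 2) ? ((i % 2) ? +2 : -2) : ((i % 2) ? -2 : +2);

    return (base + dir * ((i + 1) / 2) + size) % size;
}

int main(void)
{
    int have[TSIZE][TSIZE] = {{0}}; /* have[r][b]: rank r holds block b */
    int r, i, b;

    for (r = 0; r < TSIZE; r++) {
        int first = (r % 2) ? (r - 1 + TSIZE) % TSIZE : (r + 1) % TSIZE;

        have[r][r]     = 1; /* start(): own block copied into rbuf */
        have[r][first] = 1; /* start(): single-block exchange with partner */
    }

    for (i = 1; i < TSIZE / 2; i++) { /* progress(): two blocks per step */
        for (r = 0; r < TSIZE; r++) {
            int nbrs[2], n, b0;

            if (r % 2) {
                nbrs[0] = (r - 1 + TSIZE) % TSIZE;
                nbrs[1] = (r + 1) % TSIZE;
            } else {
                nbrs[0] = (r + 1) % TSIZE;
                nbrs[1] = (r - 1 + TSIZE) % TSIZE;
            }
            n  = nbrs[i % 2];
            b0 = recv_from(r, TSIZE, i);

            /* Partner sends exactly what it received on the previous step,
             * and the 2-block region starts even, so it never wraps. */
            assert(b0 == recv_from(n, TSIZE, i - 1));
            assert(b0 % 2 == 0 && b0 + 1 < TSIZE);
            assert(have[n][b0] && have[n][b0 + 1]);
            have[r][b0] = have[r][b0 + 1] = 1;
        }
    }

    for (r = 0; r < TSIZE; r++) {
        for (b = 0; b < TSIZE; b++) {
            assert(have[r][b]);
        }
    }
    printf("all %d ranks hold all %d blocks after %d exchange steps\n",
           TSIZE, TSIZE, TSIZE / 2);
    return 0;
}
```

Compared to the ring (tsize - 1 steps of one block), this halves the step count at the cost of doubling the per-step message size, which is the usual bandwidth-oriented trade-off for large allgathers.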
diff --git a/src/components/tl/ucp/tl_ucp_coll.c b/src/components/tl/ucp/tl_ucp_coll.c
index b75e47ff4c..bbbac03fc7 100644
--- a/src/components/tl/ucp/tl_ucp_coll.c
+++ b/src/components/tl/ucp/tl_ucp_coll.c
@@ -27,8 +27,8 @@ const ucc_tl_ucp_default_alg_desc_t
     ucc_tl_ucp_default_alg_descs[UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR] = {
         {
-            .select_str = UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR,
-            .str_get_fn = NULL
+            .select_str = NULL,
+            .str_get_fn = ucc_tl_ucp_allgather_score_str_get
         },
         {
             .select_str = NULL,
@@ -252,6 +252,9 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str,
         case UCC_TL_UCP_ALLGATHER_ALG_RING:
             *init = ucc_tl_ucp_allgather_ring_init;
             break;
+        case UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR:
+            *init = ucc_tl_ucp_allgather_neighbor_init;
+            break;
         default:
             status = UCC_ERR_INVALID_PARAM;
             break;
diff --git a/test/gtest/coll/test_allgather.cc b/test/gtest/coll/test_allgather.cc
index 1cdf17be77..e1cacac5ac 100644
--- a/test/gtest/coll/test_allgather.cc
+++ b/test/gtest/coll/test_allgather.cc
@@ -296,7 +296,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif
         ::testing::Values(1,3,8192), // count
         ::testing::Values(TEST_INPLACE, TEST_NO_INPLACE),
-        ::testing::Values("knomial", "ring")),
+        ::testing::Values("knomial", "ring", "neighbor")),
     [](const testing::TestParamInfo& info) {
         std::string name;
         name += ucc_datatype_str(std::get<0>(info.param));
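Reviewer note: besides the gtest coverage added above, the new path can be forced for manual runs through the usual TL/UCP tuning string, e.g. UCC_TL_UCP_TUNE=allgather:@neighbor (or by id, @2, matching the enum in allgather.h), assuming the standard UCC score-string syntax; the name lookup is what ucc_tl_ucp_allgather_alg_from_str() provides. With an odd team size ucc_tl_ucp_allgather_neighbor_init() falls back to the ring implementation, so forcing "neighbor" is safe either way.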