diff --git a/.ci/Dockerfile.ngc_pytorch b/.ci/Dockerfile.ngc_pytorch new file mode 100644 index 0000000000..91111aa488 --- /dev/null +++ b/.ci/Dockerfile.ngc_pytorch @@ -0,0 +1,20 @@ +ARG CUDA_VER='12.1.1' +FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base +RUN rm -rf ${SRC_DIR}/ucc +COPY . ${SRC_DIR}/ucc + +RUN apt update && apt install -y sudo && \ + echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers +RUN pip install 'protobuf<=3.19.0' +#============================================================================== +# Build UCC +RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh +#============================================================================== +# Give the non-root swx-jenkins user ownership of /opt/nvidia +RUN chown -R 6213:11429 /opt/nvidia +#============================================================================== +RUN groupadd -g 11429 swx-jenkins +RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins +#============================================================================== +USER swx-jenkins + diff --git a/.ci/build_base_docker/Dockerfile.ngc_pytorch.base b/.ci/build_base_docker/Dockerfile.ngc_pytorch.base new file mode 100644 index 0000000000..891e6bc833 --- /dev/null +++ b/.ci/build_base_docker/Dockerfile.ngc_pytorch.base @@ -0,0 +1,69 @@ +ARG CUDA_VER='12.1.1' +FROM nvcr.io/nvidia/pytorch:23.11-py3 +#============================================================================== +ARG NVIDIA_ROOT_DIR=/opt/nvidia +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +ENV SRC_DIR=${NVIDIA_ROOT_DIR}/src +ENV PKG_DIR=${NVIDIA_ROOT_DIR}/pkg +ENV BIN_DIR=${NVIDIA_ROOT_DIR}/bin +ENV WORKLOADS_DIR=${NVIDIA_ROOT_DIR}/workloads +ENV CUDA_HOME=/usr/local/cuda +ENV UCX_GITHUB_URL=https://github.com/openucx/ucx.git +ENV UCX_BRANCH=master +ENV UCX_BUILD_TYPE=release-mt +ENV UCX_INSTALL_DIR=${BIN_DIR}/ucx/build-${UCX_BUILD_TYPE} +ENV UCC_INSTALL_DIR=${BIN_DIR}/ucc/build +ENV OFED_PKG='lsof kmod udev swig libelf1 libfuse2 pciutils tk gfortran libpci3 libusb-1.0-0 libltdl-dev libmnl0 bison tcl flex chrpath debhelper ethtool graphviz' +ENV PACKAGES='numactl openssh-server protobuf-compiler rdma-core vim libevent-dev build-essential git make autoconf libtool' +ENV OS_VERSION=ubuntu22.04 +ENV PLATFORM=x86_64 +ENV MOFED_VERSION=23.10-0.5.5.0 +ENV MOFED_URL="https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${OS_VERSION}-${PLATFORM}.tgz" +ENV OMPI_PATH="/opt/hpcx/ompi" +#============================================================================== +RUN apt update && apt install -y ${OFED_PKG} && \ + mkdir -p /tmp/ofed && wget --quiet -O /tmp/ofed/ofed.tgz ${MOFED_URL} && \ + tar -xvf /tmp/ofed/ofed.tgz --strip-components=2 -C /tmp/ofed && \ + /tmp/ofed/mlnxofedinstall --user-space-only --without-fw-update -q --distro ${OS_VERSION} --basic && \ + rm -rf /tmp/ofed + +RUN apt install -y ${PACKAGES} + +# Remove old UCX +RUN rm -rf /opt/hpcx/uc?
+ENV PATH=${OMPI_PATH}/bin:$PATH +RUN echo "export PATH=\"\$OMPI_PATH/bin:\$PATH\"" >> /etc/bashrc && \ + echo "export LD_LIBRARY_PATH=\"\$OMPI_PATH/lib64:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc +#============================================================================== +# Configure SSH +RUN mkdir -p /var/run/sshd && \ + cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ + mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \ + ssh-keygen -A && \ + rm -f /run/nologin +#============================================================================== + +#============================================================================== +RUN mkdir -p ${SRC_DIR} ${PKG_DIR} ${BIN_DIR} ${WORKLOADS_DIR} && \ + cd ${SRC_DIR} && \ + mkdir -p ${SRC_DIR}/ucx && \ + git clone --recursive ${UCX_GITHUB_URL} ${SRC_DIR}/ucx && \ + cd ${SRC_DIR}/ucx && \ + git checkout ${UCX_BRANCH} + +COPY . ${SRC_DIR}/ucc +#============================================================================== +# Build UCX +RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucx.sh +ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH} +#============================================================================== +# Install workloads +WORKDIR ${WORKLOADS_DIR} +RUN git clone https://github.com/facebookresearch/dlrm.git && \ + cd ${WORKLOADS_DIR}/dlrm && \ + pip3 install -r ${WORKLOADS_DIR}/dlrm/requirements.txt && \ + pip3 install tensorboard +RUN git clone https://github.com/facebookresearch/param.git && \ + pip3 install -r ${WORKLOADS_DIR}/param/requirements.txt diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml index af23b10578..da1f4f65c5 100644 --- a/.ci/job_matrix.yaml +++ b/.ci/job_matrix.yaml @@ -20,7 +20,7 @@ volumes: } env: - CUDA_VER: '11.4.2' + CUDA_VER: '12.1.1' UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}" UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}" NVIDIA_ROOT_DIR: "/opt/nvidia" @@ -42,8 +42,8 @@ credentials: runs_on_dockers: - { - file: ".ci/Dockerfile.centos8", - name: "centos8", + file: ".ci/Dockerfile.ngc_pytorch", + name: "ngc_pytorch", tag: "${BUILD_NUMBER}", arch: "x86_64", uri: "${UCC_URI_SUFFIX}", @@ -69,7 +69,6 @@ steps: docker pull ${DOCKER_IMAGE_NAME} docker create -ti --rm $DOCKER_OPT ${DOCKER_IMAGE_NAME} /bin/bash > ${WORKSPACE}/ucc_docker.id docker start $(cat ${WORKSPACE}/ucc_docker.id) - #============================================================================ - name: Run Coverity credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0" @@ -80,7 +79,6 @@ steps: echo "Running coverity" ${WORKSPACE}/.ci/scripts/coverity.sh archiveArtifacts: .ci/scripts/cov-build/* - #============================================================================ - name: Run UCC / Torch-UCC tests agentSelector: "{nodeLabel: 'swx-clx01'}" @@ -88,9 +86,6 @@ steps: echo "INFO: Run UCC tests" hostname docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_ucc.sh" - - echo "INFO: Run Torch-UCC tests (UCC)" - docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_torch_ucc.sh" always: | docker rm --force $(cat ${WORKSPACE}/ucc_docker.id) #============================================================================ diff --git a/.ci/scripts/env.sh b/.ci/scripts/env.sh index 649acaa53d..b5fc5da29a 100755 --- a/.ci/scripts/env.sh +++ b/.ci/scripts/env.sh @@ -1,5 +1,9 @@ #!/bin/bash -eEx +export PATH="/opt/hpcx/ompi/bin:$PATH" +export
LD_LIBRARY_PATH="/opt/hpcx/ompi/lib:${LD_LIBRARY_PATH}" +export OPAL_PREFIX=/opt/hpcx/ompi + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)" # shellcheck disable=SC2034 diff --git a/.ci/scripts/run_docker.sh b/.ci/scripts/run_docker.sh index 7f141d65c9..9535298bb2 100755 --- a/.ci/scripts/run_docker.sh +++ b/.ci/scripts/run_docker.sh @@ -45,7 +45,7 @@ DOCKER_RUN_ARGS="\ -d \ --rm \ --name=${DOCKER_CONTAINER_NAME} \ --v /labhome:/labhome \ +-v /labhome/swx-jenkins:/labhome/swx-jenkins \ " # shellcheck disable=SC2013 diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 4701a7c04e..73a4eaca6a 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -15,9 +15,6 @@ if [ -z "$HOSTFILE" ]; then exit 1 fi -export PATH="/usr/lib64/openmpi/bin:$PATH" -export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}" - HEAD_NODE=$(head -1 "$HOSTFILE") export HEAD_NODE export MASTER_ADDR=${HEAD_NODE} diff --git a/.github/workflows/clang-tidy-nvidia.yaml b/.github/workflows/clang-tidy-nvidia.yaml index 408f145f83..ae2cde7580 100644 --- a/.github/workflows/clang-tidy-nvidia.yaml +++ b/.github/workflows/clang-tidy-nvidia.yaml @@ -5,7 +5,7 @@ on: [push, pull_request] env: OPEN_UCX_LINK: https://github.com/openucx/ucx OPEN_UCX_BRANCH: master - HPCX_LINK: http://content.mellanox.com/hpc/hpc-x/v2.13/hpcx-v2.13-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.12-x86_64.tbz + HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.17.1rc2/hpcx-v2.17.1-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64.tbz CLANG_VER: 12 MLNX_OFED_VER: 5.9-0.5.6.0 CUDA_VER: 11-4 @@ -33,7 +33,7 @@ jobs: wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb sudo dpkg -i cuda-keyring_1.0-1_all.deb sudo apt-get update - sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER} + sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER} - name: Get UCX run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx - name: Build UCX @@ -45,8 +45,8 @@ jobs: run: | cd /tmp wget ${HPCX_LINK} - tar xjf hpcx-v2.13-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.12-x86_64.tbz - mv hpcx-v2.13-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.12-x86_64 hpcx + tar xjf hpcx-v2.17.1-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64.tbz + mv hpcx-v2.17.1-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64 hpcx - uses: actions/checkout@v1 - name: Build UCC run: | diff --git a/.github/workflows/hpcsdk.yaml b/.github/workflows/hpcsdk.yaml new file mode 100644 index 0000000000..77188cd96a --- /dev/null +++ b/.github/workflows/hpcsdk.yaml @@ -0,0 +1,25 @@ +name: HPC_SDK + +on: [push, pull_request] + +env: + HPCXDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/hpcx/latest/ + NCCLDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/nccl/ + CUDADIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/cuda/12.2/ + +jobs: + build: + runs-on: ubuntu-20.04 + container: + image: nvcr.io/nvidia/nvhpc:23.9-devel-cuda12.2-ubuntu22.04 + steps: + - name: Install dependencies + run: | + apt-get update + apt-get install -y --no-install-recommends libiberty-dev + - uses: actions/checkout@v1 + - name: Build UCC + run: | + ./autogen.sh + CC=nvc CXX=nvc++ ./configure --with-tls=ucp,mlx5,cuda,self,nccl,sharp --with-mpi=${HPCXDIR}/ompi --with-sharp=${HPCXDIR}/sharp --with-ucx=${HPCXDIR}/ucx --with-cuda=${CUDADIR} --with-nccl=${NCCLDIR} 
--with-nvcc-gencode="-gencode=arch=compute_80,code=sm_80" + make -j`nproc` install diff --git a/config/m4/sharp.m4 b/config/m4/sharp.m4 index bedc550476..45bcfd04e3 100644 --- a/config/m4/sharp.m4 +++ b/config/m4/sharp.m4 @@ -44,6 +44,7 @@ AS_IF([test "x$with_sharp" != "xno"], AC_SUBST(SHARP_LDFLAGS, "-lsharp_coll -L$check_sharp_dir/lib") AC_CHECK_DECLS([SHARP_COLL_HIDE_ERRORS], [], [], [[#include <sharp/api/sharp_coll.h>]]) AC_CHECK_DECLS([SHARP_COLL_DISABLE_LAZY_GROUP_RESOURCE_ALLOC], [], [], [[#include <sharp/api/sharp_coll.h>]]) + AC_CHECK_DECLS([sharp_coll_do_reduce_scatter], [], [], [[#include <sharp/api/sharp_coll.h>]]) ], [ AS_IF([test "x$with_sharp" != "xguess"], diff --git a/config/m4/ucx.m4 b/config/m4/ucx.m4 index b3a3b871c3..ba57dae303 100644 --- a/config/m4/ucx.m4 +++ b/config/m4/ucx.m4 @@ -1,5 +1,5 @@ # -# Copyright (c) 2001-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2001-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # See file LICENSE for terms. # @@ -113,6 +113,25 @@ AS_IF([test "x$ucx_checked" != "xyes"],[ [AC_DEFINE([UCS_HAVE_CONFIG_GLOBAL_LIST_ENTRY_FLAGS], [1], [flags for config table])], [], [#include <ucs/config/parser.h>]) + + AC_CHECK_MEMBER(ucs_rcache_region_t.alignment, + [AC_DEFINE([UCS_HAVE_RCACHE_REGION_ALIGNMENT], [1], [flags for ucs_rcache_get])], + [], + [#include <ucs/memory/rcache.h>]) + + + AC_COMPILE_IFELSE([AC_LANG_SOURCE([[#include <ucs/config/parser.h> + int main(int argc, char** argv) { + ucs_config_parser_set_value(NULL, NULL, NULL, NULL, NULL); + return 0; + } ]])], + [AC_DEFINE([UCS_HAVE_PARSER_SET_VALUE_TABLE_PREFIX], [1], [flags for ucs_rcache_get])], + []) + + AC_CHECK_MEMBER(ucs_config_parser_t.doc, + [AC_DEFINE([UCS_HAVE_PARSER_CONFIG_DOC], [1], [flags for ucs_rcache_get])], + [], + [#include <ucs/config/parser.h>]) ], [ AS_IF([test "x$with_ucx" != "xguess"], diff --git a/src/Makefile.am b/src/Makefile.am index 85496f83dd..c505c31344 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -36,62 +36,63 @@ nobase_dist_libucc_la_HEADERS = \ ucc/api/ucc_version.h \ ucc/api/ucc_status.h -noinst_HEADERS = \ - core/ucc_global_opts.h \ - core/ucc_lib.h \ - core/ucc_context.h \ - core/ucc_team.h \ - core/ucc_ee.h \ - core/ucc_progress_queue.h \ - core/ucc_service_coll.h \ - core/ucc_dt.h \ - schedule/ucc_schedule.h \ - schedule/ucc_schedule_pipelined.h \ - coll_score/ucc_coll_score.h \ - utils/arch/aarch64/cpu.h \ - utils/arch/ppc64/cpu.h \ - utils/arch/riscv64/cpu.h \ - utils/arch/x86_64/cpu.h \ - utils/arch/cpu.h \ - utils/arch/cuda_def.h \ - utils/ucc_compiler_def.h \ - utils/ucc_log.h \ - utils/ucc_parser.h \ - utils/ucc_component.h \ - utils/ucc_datastruct.h \ - utils/ucc_math.h \ - utils/ucc_coll_utils.h \ - utils/ucc_list.h \ - utils/ucc_string.h \ - utils/ucc_queue.h \ - utils/ucc_proc_info.h \ - utils/khash.h \ - utils/ini.h \ - utils/ucc_spinlock.h \ - utils/ucc_mpool.h \ - utils/ucc_rcache.h \ - utils/profile/ucc_profile.h \ - utils/profile/ucc_profile_on.h \ - utils/profile/ucc_profile_off.h \ - utils/ucc_time.h \ - utils/ucc_sys.h \ - utils/ucc_assert.h \ - components/base/ucc_base_iface.h \ - components/cl/ucc_cl.h \ - components/cl/ucc_cl_log.h \ - components/cl/ucc_cl_type.h \ - components/tl/ucc_tl.h \ - components/tl/ucc_tl_log.h \ - components/mc/ucc_mc.h \ - components/mc/base/ucc_mc_base.h \ - components/mc/ucc_mc_log.h \ - components/ec/ucc_ec.h \ - components/ec/base/ucc_ec_base.h \ - components/ec/ucc_ec_log.h \ - coll_patterns/recursive_knomial.h \ - coll_patterns/sra_knomial.h \ - coll_patterns/bruck_alltoall.h \ - components/topo/ucc_topo.h \ +noinst_HEADERS = \ + core/ucc_global_opts.h \ + core/ucc_lib.h \ + core/ucc_context.h \ +
core/ucc_team.h \ + core/ucc_ee.h \ + core/ucc_progress_queue.h \ + core/ucc_service_coll.h \ + core/ucc_dt.h \ + schedule/ucc_schedule.h \ + schedule/ucc_schedule_pipelined.h \ + coll_score/ucc_coll_score.h \ + utils/arch/aarch64/cpu.h \ + utils/arch/ppc64/cpu.h \ + utils/arch/riscv64/cpu.h \ + utils/arch/x86_64/cpu.h \ + utils/arch/cpu.h \ + utils/arch/cuda_def.h \ + utils/ucc_compiler_def.h \ + utils/ucc_log.h \ + utils/ucc_parser.h \ + utils/ucc_component.h \ + utils/ucc_datastruct.h \ + utils/ucc_math.h \ + utils/ucc_coll_utils.h \ + utils/ucc_list.h \ + utils/ucc_string.h \ + utils/ucc_queue.h \ + utils/ucc_proc_info.h \ + utils/khash.h \ + utils/ini.h \ + utils/ucc_spinlock.h \ + utils/ucc_mpool.h \ + utils/ucc_rcache.h \ + utils/profile/ucc_profile.h \ + utils/profile/ucc_profile_on.h \ + utils/profile/ucc_profile_off.h \ + utils/ucc_time.h \ + utils/ucc_sys.h \ + utils/ucc_assert.h \ + components/base/ucc_base_iface.h \ + components/cl/ucc_cl.h \ + components/cl/ucc_cl_log.h \ + components/cl/ucc_cl_type.h \ + components/tl/ucc_tl.h \ + components/tl/ucc_tl_log.h \ + components/mc/ucc_mc.h \ + components/mc/base/ucc_mc_base.h \ + components/mc/ucc_mc_log.h \ + components/ec/ucc_ec.h \ + components/ec/base/ucc_ec_base.h \ + components/ec/ucc_ec_log.h \ + coll_patterns/recursive_knomial.h \ + coll_patterns/sra_knomial.h \ + coll_patterns/bruck_alltoall.h \ + coll_patterns/double_binary_tree.h \ + components/topo/ucc_topo.h \ components/topo/ucc_sbgp.h libucc_la_SOURCES = \ diff --git a/src/coll_patterns/double_binary_tree.h b/src/coll_patterns/double_binary_tree.h new file mode 100644 index 0000000000..baab72936a --- /dev/null +++ b/src/coll_patterns/double_binary_tree.h @@ -0,0 +1,238 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. + */ + +#ifndef DOUBLE_BINARY_TREE_H_ +#define DOUBLE_BINARY_TREE_H_ + +enum { + LEFT_CHILD, + RIGHT_CHILD +}; + +typedef struct ucc_dbt_single_tree { + ucc_rank_t rank; + ucc_rank_t size; + ucc_rank_t root; + ucc_rank_t parent; + ucc_rank_t children[2]; + int n_children; + int height; + int recv; +} ucc_dbt_single_tree_t; + +static inline ucc_rank_t get_root(ucc_rank_t size) +{ + ucc_rank_t r = 1; + + while (r <= size) { + r *= 2; + } + return r/2 - 1; +} + +static inline int get_height(ucc_rank_t rank) +{ + int h = 1; + + if (rank % 2 == 0) { + return 0; + } + + rank++; + while ((rank & (1 << h)) == 0) { + h++; + } + return h; +} + +static inline ucc_rank_t get_left_child(ucc_rank_t rank, int height) +{ + ucc_rank_t sub_height; + + if (height == 0) { + return UCC_RANK_INVALID; + } + + sub_height = 1 << (height - 1); + return rank - sub_height; +} + +static inline ucc_rank_t get_right_child(ucc_rank_t size, ucc_rank_t rank, + int height, ucc_rank_t root) +{ + ucc_rank_t sub_right_root, sub_height; + + if (rank == size - 1 || height == 0) { + return UCC_RANK_INVALID; + } + + sub_right_root = get_root(size - rank - 1) + 1; + sub_height = 1 << (height - 1); + + if (rank == root) { + return rank + sub_right_root; + } + return (rank + sub_height < size) ? 
rank + sub_height + : rank + sub_right_root; +} + +static inline void get_children(ucc_rank_t size, ucc_rank_t rank, int height, + ucc_rank_t root, ucc_rank_t *l_c, + ucc_rank_t *r_c) +{ + *l_c = get_left_child(rank, height); + *r_c = get_right_child(size, rank, height, root); +} + +static inline int get_n_children(ucc_rank_t l_c, ucc_rank_t r_c) +{ + int n_children = 0; + + if (l_c != UCC_RANK_INVALID) { + n_children++; + } + + if (r_c != UCC_RANK_INVALID) { + n_children++; + } + + return n_children; +} + +static inline ucc_rank_t get_parent(int vsize, int vrank, int height, int troot) +{ + if (vrank == troot) { + return UCC_RANK_INVALID; + } else if (height == 0) { + return ((((vrank/2) % 2 == 0) && (vrank + 1 != vsize))) ? vrank + 1 + : vrank - 1; + } else { + vrank++; + if ((((1<<(height+1)) & vrank) > 0) || (vrank + (1<<height) > vsize)) { + return vrank - (1<<height) - 1; + } + return vrank + (1<<height) - 1; + } +} + +static inline void ucc_dbt_build_t2_mirror(ucc_dbt_single_tree_t t1, + ucc_dbt_single_tree_t *t2) +{ + ucc_rank_t size = t1.size; + ucc_dbt_single_tree_t t; + + t.size = size; + t.height = t1.height; + t.rank = size - 1 - t1.rank; + t.root = size - 1 - t1.root; + t.parent = (t1.parent == UCC_RANK_INVALID) ? UCC_RANK_INVALID + : size - 1 - t1.parent; + t.children[LEFT_CHILD] = (t1.children[RIGHT_CHILD] == UCC_RANK_INVALID) + ? UCC_RANK_INVALID + : size - 1 - t1.children[RIGHT_CHILD]; + t.children[RIGHT_CHILD] = (t1.children[LEFT_CHILD] == UCC_RANK_INVALID) + ? UCC_RANK_INVALID + : size - 1 - t1.children[LEFT_CHILD]; + t.n_children = t1.n_children; + t.recv = 0; + + *t2 = t; +} + +static inline void ucc_dbt_build_t2_shift(ucc_dbt_single_tree_t t1, + ucc_dbt_single_tree_t *t2) +{ + ucc_rank_t size = t1.size; + ucc_dbt_single_tree_t t; + + t.size = size; + t.height = t1.height; + t.rank = (t1.rank + 1) % size; + t.root = (t1.root + 1) % size; + t.parent = (t1.parent == UCC_RANK_INVALID) ? UCC_RANK_INVALID + : (t1.parent + 1) % size; + t.children[LEFT_CHILD] = (t1.children[LEFT_CHILD] == UCC_RANK_INVALID) + ? UCC_RANK_INVALID + : (t1.children[LEFT_CHILD] + 1) % size; + t.children[RIGHT_CHILD] = (t1.children[RIGHT_CHILD] == UCC_RANK_INVALID) + ? UCC_RANK_INVALID + : (t1.children[RIGHT_CHILD] + 1) % size; + t.n_children = t1.n_children; + t.recv = 0; + + *t2 = t; +} + +static inline void ucc_dbt_build_t1(ucc_rank_t rank, ucc_rank_t size, + ucc_dbt_single_tree_t *t1) +{ + int height = get_height(rank); + ucc_rank_t root = get_root(size); + ucc_rank_t parent = get_parent(size, rank, height, root); + + get_children(size, rank, height, root, &t1->children[LEFT_CHILD], + &t1->children[RIGHT_CHILD]); + t1->n_children = get_n_children(t1->children[LEFT_CHILD], + t1->children[RIGHT_CHILD]); + t1->height = height; + t1->parent = parent; + t1->size = size; + t1->rank = rank; + t1->root = root; + t1->recv = 0; +} + +static inline ucc_rank_t ucc_dbt_convert_rank_for_shift(ucc_rank_t rank, + ucc_rank_t size) +{ + ucc_rank_t i; + for (i = 0; i < size; i++) { + if (rank == (i + 1) % size) { + break; + } + } + return i; +} + +static inline ucc_rank_t ucc_dbt_convert_rank_for_mirror(ucc_rank_t rank, + ucc_rank_t size) +{ + ucc_rank_t i; + for (i = 0; i < size; i++) { + if (rank == size - 1 - i) { + break; + } + } + return i; +} + +static inline void ucc_dbt_build_t2(ucc_rank_t rank, ucc_rank_t size, + ucc_dbt_single_tree_t *t2) { + ucc_rank_t temp_rank = (size % 2) ? + ucc_dbt_convert_rank_for_shift(rank, size) : + ucc_dbt_convert_rank_for_mirror(rank, size); + ucc_dbt_single_tree_t t1_temp; + + ucc_dbt_build_t1(temp_rank, size, &t1_temp); + if (size % 2) { + ucc_dbt_build_t2_shift(t1_temp, t2); + } else { + ucc_dbt_build_t2_mirror(t1_temp, t2); + } +} + +static inline void ucc_dbt_build_trees(ucc_rank_t rank, ucc_rank_t size, + ucc_dbt_single_tree_t *t1, + ucc_dbt_single_tree_t *t2) +{ + ucc_dbt_build_t1(rank, size, t1); + ucc_dbt_build_t2(rank, size, t2); +} + +#endif diff --git a/src/coll_patterns/recursive_knomial.h b/src/coll_patterns/recursive_knomial.h index 4f8981957c..ebf9a0981b 100644 --- a/src/coll_patterns/recursive_knomial.h +++ b/src/coll_patterns/recursive_knomial.h @@ -50,7 +50,7 @@ typedef struct ucc_knomial_pattern { size_t block_size_counts; size_t count; /* collective buffer size */ ucc_rank_t block_size; - size_t block_offset; + ptrdiff_t block_offset; } ucc_knomial_pattern_t; /** diff --git a/src/coll_patterns/sra_knomial.h b/src/coll_patterns/sra_knomial.h index 1574389632..2f63a243f2 100644 --- a/src/coll_patterns/sra_knomial.h +++ b/src/coll_patterns/sra_knomial.h @@ -159,7 +159,7 @@ ucc_kn_seg_desc_compute(ucc_knomial_pattern_t *p, ucc_kn_seg_desc_t *seg, static inline void ucc_knx_block(ucc_rank_t rank, ucc_rank_t size, ucc_kn_radix_t radix, - size_t count, int iter, size_t *b_count, size_t *b_offset) + size_t count, int iter, size_t *b_count, ptrdiff_t *b_offset) { ucc_rank_t offset = 0; ucc_rank_t block_count; @@ -213,7 +213,7 @@ ucc_kn_agx_pattern_init(ucc_rank_t size, ucc_rank_t rank, ucc_kn_radix_t radix, static inline void ucc_kn_ag_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p, - size_t *seg_count, size_t *seg_offset) + size_t *seg_count, ptrdiff_t *seg_offset) { ucc_rank_t step_radix, seg_index; ucc_kn_seg_desc_t s; @@ -278,7 +278,7 @@ static
inline void ucc_kn_rsx_pattern_init(ucc_rank_t size, ucc_rank_t rank, static inline void ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p, - size_t *peer_seg_count, size_t *peer_seg_offset) + size_t *peer_seg_count, ptrdiff_t *peer_seg_offset) { ucc_rank_t step_radix, seg_index; @@ -305,7 +305,8 @@ ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p, static inline void ucc_kn_rs_pattern_next_iter(ucc_knomial_pattern_t *p) { - size_t offset, bs; + size_t bs; + ptrdiff_t offset; ucc_kn_rs_pattern_peer_seg(p->rank, p, &bs, &offset); p->block_size_counts = bs; diff --git a/src/components/cl/hier/allreduce/allreduce.c b/src/components/cl/hier/allreduce/allreduce.c index c69cc4db36..ba93f0789d 100644 --- a/src/components/cl/hier/allreduce/allreduce.c +++ b/src/components/cl/hier/allreduce/allreduce.c @@ -13,7 +13,7 @@ ucc_base_coll_alg_info_t {.id = UCC_CL_HIER_ALLREDUCE_ALG_RAB, .name = "rab", .desc = "intra-node reduce, followed by inter-node allreduce," - " followed by innode broadcast"}, + " followed by intra-node broadcast"}, [UCC_CL_HIER_ALLREDUCE_ALG_SPLIT_RAIL] = {.id = UCC_CL_HIER_ALLREDUCE_ALG_SPLIT_RAIL, .name = "split_rail", diff --git a/src/components/cl/hier/alltoallv/alltoallv.c b/src/components/cl/hier/alltoallv/alltoallv.c index c60bdf84fe..b73af2c82b 100644 --- a/src/components/cl/hier/alltoallv/alltoallv.c +++ b/src/components/cl/hier/alltoallv/alltoallv.c @@ -144,6 +144,11 @@ UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_alltoallv_init, return UCC_ERR_NOT_SUPPORTED; } + if (coll_args->args.mask & UCC_COLL_ARGS_FIELD_GLOBAL_WORK_BUFFER) { + cl_debug(team->context->lib, "onesided alltoallv is not supported"); + return UCC_ERR_NOT_SUPPORTED; + } + if (!SBGP_ENABLED(cl_team, FULL)) { cl_debug(team->context->lib, "alltoallv requires FULL sbgp"); return UCC_ERR_NOT_SUPPORTED; diff --git a/src/components/ec/cuda/Makefile.am b/src/components/ec/cuda/Makefile.am index 3d7a862ef4..83f478d797 100644 --- a/src/components/ec/cuda/Makefile.am +++ b/src/components/ec/cuda/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# if HAVE_CUDA @@ -12,7 +12,9 @@ sources = \ ec_cuda_executor.c \ ec_cuda_executor_interruptible.c \ ec_cuda_executor_persistent.c \ - ec_cuda_executor_persistent_wait.c + ec_cuda_executor_persistent_wait.c \ + ec_cuda_resources.c \ + ec_cuda_resources.h module_LTLIBRARIES = libucc_ec_cuda.la libucc_ec_cuda_la_SOURCES = $(sources) diff --git a/src/components/ec/cuda/ec_cuda.c b/src/components/ec/cuda/ec_cuda.c index 2357023fed..dd721e1f50 100644 --- a/src/components/ec/cuda/ec_cuda.c +++ b/src/components/ec/cuda/ec_cuda.c @@ -75,116 +75,6 @@ static ucc_config_field_t ucc_ec_cuda_config_table[] = { }; -static ucc_status_t ucc_ec_cuda_ee_executor_mpool_chunk_malloc(ucc_mpool_t *mp, //NOLINT: mp is unused - size_t *size_p, - void ** chunk_p) -{ - return CUDA_FUNC(cudaHostAlloc((void**)chunk_p, *size_p, - cudaHostAllocMapped)); -} - -static void ucc_ec_cuda_ee_executor_mpool_chunk_free(ucc_mpool_t *mp, //NOLINT: mp is unused - void *chunk) -{ - CUDA_FUNC(cudaFreeHost(chunk)); -} - -static void ucc_ec_cuda_executor_chunk_init(ucc_mpool_t *mp, void *obj, //NOLINT: mp is unused - void *chunk) //NOLINT: chunk is unused -{ - ucc_ec_cuda_executor_t *eee = (ucc_ec_cuda_executor_t*) obj; - int max_tasks = EC_CUDA_CONFIG->exec_max_tasks; - - CUDA_FUNC(cudaHostGetDevicePointer( - (void**)(&eee->dev_state), (void *)&eee->state, 0)); - CUDA_FUNC(cudaHostGetDevicePointer( - (void**)(&eee->dev_pidx), (void *)&eee->pidx, 0)); - CUDA_FUNC(cudaMalloc((void**)&eee->dev_cidx, sizeof(*eee->dev_cidx))); - CUDA_FUNC(cudaHostAlloc((void**)&eee->tasks, - max_tasks * MAX_SUBTASKS * - sizeof(ucc_ee_executor_task_args_t), - cudaHostAllocMapped)); - CUDA_FUNC(cudaHostGetDevicePointer( - (void**)(&eee->dev_tasks), (void *)eee->tasks, 0)); - if (ucc_ec_cuda.thread_mode == UCC_THREAD_MULTIPLE) { - ucc_spinlock_init(&eee->tasks_lock, 0); - } -} - -static void ucc_ec_cuda_executor_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused -{ - ucc_ec_cuda_executor_t *eee = (ucc_ec_cuda_executor_t*) obj; - - CUDA_FUNC(cudaFree((void*)eee->dev_cidx)); - CUDA_FUNC(cudaFreeHost((void*)eee->tasks)); - if (ucc_ec_cuda.thread_mode == UCC_THREAD_MULTIPLE) { - ucc_spinlock_destroy(&eee->tasks_lock); - } -} - - -static ucc_mpool_ops_t ucc_ec_cuda_ee_executor_mpool_ops = { - .chunk_alloc = ucc_ec_cuda_ee_executor_mpool_chunk_malloc, - .chunk_release = ucc_ec_cuda_ee_executor_mpool_chunk_free, - .obj_init = ucc_ec_cuda_executor_chunk_init, - .obj_cleanup = ucc_ec_cuda_executor_chunk_cleanup, -}; - -static void ucc_ec_cuda_event_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused -{ - ucc_ec_cuda_event_t *base = (ucc_ec_cuda_event_t *) obj; - - CUDA_FUNC(cudaEventCreateWithFlags(&base->event, cudaEventDisableTiming)); -} - -static void ucc_ec_cuda_event_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused -{ - ucc_ec_cuda_event_t *base = (ucc_ec_cuda_event_t *) obj; - - CUDA_FUNC(cudaEventDestroy(base->event)); -} - -static ucc_mpool_ops_t ucc_ec_cuda_event_mpool_ops = { - .chunk_alloc = ucc_mpool_hugetlb_malloc, - .chunk_release = ucc_mpool_hugetlb_free, - .obj_init = ucc_ec_cuda_event_init, - .obj_cleanup = ucc_ec_cuda_event_cleanup, -}; - -static void ucc_ec_cuda_graph_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused -{ - ucc_ec_cuda_executor_interruptible_task_t *task = - (ucc_ec_cuda_executor_interruptible_task_t *) obj; - cudaGraphNode_t memcpy_node; - int i; - - CUDA_FUNC(cudaGraphCreate(&task->graph, 0)); - for (i = 0; i < UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; i++) { - CUDA_FUNC( - 
cudaGraphAddMemcpyNode1D(&memcpy_node, task->graph, NULL, 0, - (void*)1, (void*)1, 1, cudaMemcpyDefault)); - } - - CUDA_FUNC( - cudaGraphInstantiateWithFlags(&task->graph_exec, task->graph, 0)); -} - -static void ucc_ec_cuda_graph_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused -{ - ucc_ec_cuda_executor_interruptible_task_t *task = - (ucc_ec_cuda_executor_interruptible_task_t *) obj; - - CUDA_FUNC(cudaGraphExecDestroy(task->graph_exec)); - CUDA_FUNC(cudaGraphDestroy(task->graph)); -} - -static ucc_mpool_ops_t ucc_ec_cuda_interruptible_task_mpool_ops = { - .chunk_alloc = ucc_mpool_hugetlb_malloc, - .chunk_release = ucc_mpool_hugetlb_free, - .obj_init = ucc_ec_cuda_graph_init, - .obj_cleanup = ucc_ec_cuda_graph_cleanup, -}; - static inline void ucc_ec_cuda_set_threads_nbr(int *nt, int maxThreadsPerBlock) { if (*nt != UCC_ULUNITS_AUTO) { @@ -208,15 +98,14 @@ static inline void ucc_ec_cuda_set_threads_nbr(int *nt, int maxThreadsPerBlock) static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) { - ucc_ec_cuda_config_t *cfg = EC_CUDA_CONFIG; - ucc_status_t status; + ucc_ec_cuda_config_t *cfg = EC_CUDA_CONFIG; + int supports_coop_launch = 0; int device, num_devices; cudaError_t cuda_st; struct cudaDeviceProp prop; - int supportsCoopLaunch = 0; - ucc_ec_cuda.stream = NULL; - ucc_ec_cuda.stream_initialized = 0; + ucc_ec_cuda_config = ucc_derived_of(ucc_ec_cuda.super.config, + ucc_ec_cuda_config_t); ucc_ec_cuda.exec_streams_initialized = 0; ucc_strncpy_safe(ucc_ec_cuda.super.config->log_component.name, ucc_ec_cuda.super.super.name, @@ -228,9 +117,7 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) return UCC_ERR_NO_RESOURCE; } CUDA_CHECK(cudaGetDevice(&device)); - CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - ucc_ec_cuda_set_threads_nbr((int *)&cfg->exec_num_threads, prop.maxThreadsPerBlock); ucc_ec_cuda_set_threads_nbr(&cfg->reduce_num_threads, @@ -253,52 +140,6 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) cfg->exec_num_streams = 1; } - /*create event pool */ - ucc_ec_cuda.exec_streams = ucc_calloc(cfg->exec_num_streams, - sizeof(cudaStream_t), - "ec cuda streams"); - if (!ucc_ec_cuda.exec_streams) { - ec_error(&ucc_ec_cuda.super, "failed to allocate streams array"); - return UCC_ERR_NO_MEMORY; - } - status = ucc_mpool_init(&ucc_ec_cuda.events, 0, sizeof(ucc_ec_cuda_event_t), - 0, UCC_CACHE_LINE_SIZE, 16, UINT_MAX, - &ucc_ec_cuda_event_mpool_ops, UCC_THREAD_MULTIPLE, - "CUDA Event Objects"); - if (status != UCC_OK) { - ec_error(&ucc_ec_cuda.super, "failed to create event pool"); - return status; - } - - status = ucc_mpool_init( - &ucc_ec_cuda.executors, 0, sizeof(ucc_ec_cuda_executor_t), 0, - UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_cuda_ee_executor_mpool_ops, - UCC_THREAD_MULTIPLE, "EE executor Objects"); - if (status != UCC_OK) { - ec_error(&ucc_ec_cuda.super, "failed to create executors pool"); - return status; - } - - status = ucc_mpool_init( - &ucc_ec_cuda.executor_interruptible_tasks, 0, - sizeof(ucc_ec_cuda_executor_interruptible_task_t), 0, UCC_CACHE_LINE_SIZE, - 16, UINT_MAX, &ucc_ec_cuda_interruptible_task_mpool_ops, - UCC_THREAD_MULTIPLE, "interruptible executor tasks"); - if (status != UCC_OK) { - ec_error(&ucc_ec_cuda.super, "failed to create interruptible tasks pool"); - return status; - } - - status = ucc_mpool_init( - &ucc_ec_cuda.executor_persistent_tasks, 0, - sizeof(ucc_ec_cuda_executor_persistent_task_t), 0, UCC_CACHE_LINE_SIZE, - 16, UINT_MAX, NULL, UCC_THREAD_MULTIPLE, - "persistent 
executor tasks"); - if (status != UCC_OK) { - ec_error(&ucc_ec_cuda.super, "failed to create persistent tasks pool"); - return status; - } - if (cfg->strm_task_mode == UCC_EC_CUDA_TASK_KERNEL) { ucc_ec_cuda.strm_task_mode = UCC_EC_CUDA_TASK_KERNEL; } else { @@ -335,16 +176,17 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) } if (cfg->use_cooperative_launch == 1) { - cudaDeviceGetAttribute(&supportsCoopLaunch, + cudaDeviceGetAttribute(&supports_coop_launch, cudaDevAttrCooperativeLaunch, device); - if (!supportsCoopLaunch) { + if (!supports_coop_launch) { cfg->use_cooperative_launch = 0; ec_warn(&ucc_ec_cuda.super, - "CUDA cooperative groups are not supported. " - "Fall back to non cooperative launch."); + "CUDA cooperative groups are not supported. " + "Fall back to non cooperative launch."); } } + ucc_ec_cuda.resources_hash = kh_init(ucc_ec_cuda_resources_hash); ucc_spinlock_init(&ucc_ec_cuda.init_spinlock, 0); return UCC_OK; } @@ -359,9 +201,15 @@ static ucc_status_t ucc_ec_cuda_get_attr(ucc_ec_attr_t *ec_attr) ucc_status_t ucc_ec_cuda_event_create(void **event) { - ucc_ec_cuda_event_t *cuda_event; + ucc_ec_cuda_event_t *cuda_event; + ucc_ec_cuda_resources_t *resources; + ucc_status_t status; - cuda_event = ucc_mpool_get(&ucc_ec_cuda.events); + status = ucc_ec_cuda_get_resources(&resources); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + cuda_event = ucc_mpool_get(&resources->events); if (ucc_unlikely(!cuda_event)) { ec_error(&ucc_ec_cuda.super, "failed to get event from mpool"); return UCC_ERR_NO_MEMORY; @@ -390,8 +238,8 @@ ucc_status_t ucc_ec_cuda_event_post(void *ee_context, void *event) ucc_status_t ucc_ec_cuda_event_test(void *event) { - cudaError_t cu_err; ucc_ec_cuda_event_t *cuda_event = event; + cudaError_t cu_err; cu_err = cudaEventQuery(cuda_event->event); @@ -404,26 +252,68 @@ ucc_status_t ucc_ec_cuda_event_test(void *event) static ucc_status_t ucc_ec_cuda_finalize() { - int i; + ucc_ec_cuda_resources_t *resources; - if (ucc_ec_cuda.stream_initialized) { - CUDA_FUNC(cudaStreamDestroy(ucc_ec_cuda.stream)); - ucc_ec_cuda.stream_initialized = 0; + resources = ec_cuda_resources_hash_pop(ucc_ec_cuda.resources_hash); + while (resources) { + ucc_ec_cuda_resources_cleanup(resources); + resources = ec_cuda_resources_hash_pop(ucc_ec_cuda.resources_hash); } - if (ucc_ec_cuda.exec_streams_initialized) { - for (i = 0; i < EC_CUDA_CONFIG->exec_num_streams; i++) { - CUDA_FUNC(cudaStreamDestroy(ucc_ec_cuda.exec_streams[i])); - } - ucc_ec_cuda.exec_streams_initialized = 0; + ucc_spinlock_destroy(&ucc_ec_cuda.init_spinlock); + + return UCC_OK; +} + +ucc_status_t ucc_ec_cuda_get_resources(ucc_ec_cuda_resources_t **resources) +{ + CUcontext cu_ctx; + unsigned long long int cu_ctx_id; + ucc_status_t status; + + status = CUDADRV_FUNC(cuCtxGetCurrent(&cu_ctx)); + if (ucc_unlikely(status != UCC_OK)) { + ec_error(&ucc_ec_cuda.super, "failed to get current CUDA context"); + return status; } - ucc_mpool_cleanup(&ucc_ec_cuda.events, 1); - ucc_mpool_cleanup(&ucc_ec_cuda.executors, 1); - ucc_mpool_cleanup(&ucc_ec_cuda.executor_interruptible_tasks, 1); - ucc_mpool_cleanup(&ucc_ec_cuda.executor_persistent_tasks, 1); - ucc_free(ucc_ec_cuda.exec_streams); +#if CUDA_VERSION < 12000 + cu_ctx_id = 1; +#else + status = CUDADRV_FUNC(cuCtxGetId(cu_ctx, &cu_ctx_id)); + if (ucc_unlikely(status != UCC_OK)) { + ec_error(&ucc_ec_cuda.super, "failed to get currect CUDA context ID"); + } +#endif + *resources = ec_cuda_resources_hash_get(ucc_ec_cuda.resources_hash, + cu_ctx_id); + 
if (ucc_unlikely(*resources == NULL)) { + ucc_spin_lock(&ucc_ec_cuda.init_spinlock); + *resources = ec_cuda_resources_hash_get(ucc_ec_cuda.resources_hash, + cu_ctx_id); + if (*resources == NULL) { + *resources = ucc_malloc(sizeof(ucc_ec_cuda_resources_t), + "ec cuda resources"); + if (*resources == NULL) { + ec_error(&ucc_ec_cuda.super, + "failed to allocate %zd bytes for resources", + sizeof(ucc_ec_cuda_resources_t)); + ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); + return UCC_ERR_NO_MEMORY; + } + status = ucc_ec_cuda_resources_init(&ucc_ec_cuda.super, + *resources); + if (status != UCC_OK) { + ucc_free(*resources); + ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); + return status; + } + ec_cuda_resources_hash_put(ucc_ec_cuda.resources_hash, cu_ctx_id, + *resources); + } + ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); + } return UCC_OK; } @@ -455,5 +345,7 @@ ucc_ec_cuda_t ucc_ec_cuda = { .super.executor_ops.finalize = ucc_cuda_executor_finalize, }; +ucc_ec_cuda_config_t *ucc_ec_cuda_config; + UCC_CONFIG_REGISTER_TABLE_ENTRY(&ucc_ec_cuda.super.config_table, &ucc_config_global_list); diff --git a/src/components/ec/cuda/ec_cuda.h b/src/components/ec/cuda/ec_cuda.h index d732669f12..84b8588605 100644 --- a/src/components/ec/cuda/ec_cuda.h +++ b/src/components/ec/cuda/ec_cuda.h @@ -11,109 +11,30 @@ #include "components/ec/ucc_ec_log.h" #include "utils/arch/cuda_def.h" #include "utils/ucc_mpool.h" +#include "ec_cuda_resources.h" #include <cuda_runtime.h> #define WARP_SIZE 32 -#define MAX_SUBTASKS 12 - -typedef enum ucc_ec_cuda_strm_task_mode { - UCC_EC_CUDA_TASK_KERNEL, - UCC_EC_CUDA_TASK_MEM_OPS, - UCC_EC_CUDA_TASK_AUTO, - UCC_EC_CUDA_TASK_LAST, -} ucc_ec_cuda_strm_task_mode_t; - -typedef enum ucc_ec_cuda_executor_state { - UCC_EC_CUDA_EXECUTOR_INITIALIZED, - UCC_EC_CUDA_EXECUTOR_POSTED, - UCC_EC_CUDA_EXECUTOR_STARTED, - UCC_EC_CUDA_EXECUTOR_SHUTDOWN, - UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK -} ucc_ec_cuda_executor_state_t; - -typedef enum ucc_ec_cuda_executor_mode { - UCC_EC_CUDA_EXECUTOR_MODE_PERSISTENT, - UCC_EC_CUDA_EXECUTOR_MODE_INTERRUPTIBLE -} ucc_ec_cuda_executor_mode_t; typedef ucc_status_t (*ucc_ec_cuda_task_post_fn) (uint32_t *dev_status, int blocking_wait, cudaStream_t stream); -typedef struct ucc_ec_cuda_config { - ucc_ec_config_t super; - ucc_ec_cuda_strm_task_mode_t strm_task_mode; - unsigned long exec_num_workers; - unsigned long exec_num_threads; - unsigned long exec_max_tasks; - unsigned long exec_num_streams; - unsigned long reduce_num_blocks; - int reduce_num_threads; - int use_cooperative_launch; - unsigned long exec_copy_thresh; -} ucc_ec_cuda_config_t; - typedef struct ucc_ec_cuda { ucc_ec_base_t super; - int stream_initialized; - cudaStream_t stream; int exec_streams_initialized; - cudaStream_t *exec_streams; - ucc_mpool_t events; - ucc_mpool_t executors; - ucc_mpool_t executor_interruptible_tasks; - ucc_mpool_t executor_persistent_tasks; + ucc_ec_cuda_resources_hash_t *resources_hash; ucc_thread_mode_t thread_mode; ucc_ec_cuda_strm_task_mode_t strm_task_mode; ucc_spinlock_t init_spinlock; } ucc_ec_cuda_t; -typedef struct ucc_ec_cuda_event { - cudaEvent_t event; -} ucc_ec_cuda_event_t; - typedef struct ucc_ec_cuda_stream_request { uint32_t status; uint32_t *dev_status; cudaStream_t stream; } ucc_ec_cuda_stream_request_t; -typedef struct ucc_ec_cuda_executor_interruptible_task { - ucc_ee_executor_task_t super; - void *event; - cudaGraph_t graph; - cudaGraphExec_t graph_exec; -} ucc_ec_cuda_executor_interruptible_task_t; - -typedef struct ucc_ec_cuda_executor_persistent_task { -
ucc_ee_executor_task_t super; - int num_subtasks; - ucc_ee_executor_task_args_t *subtasks[MAX_SUBTASKS]; -} ucc_ec_cuda_executor_persistent_task_t; - -typedef struct ucc_ec_cuda_executor_task_ops { - ucc_status_t (*task_post)(ucc_ee_executor_t *executor, - const ucc_ee_executor_task_args_t *task_args, - ucc_ee_executor_task_t **task); - ucc_status_t (*task_test)(const ucc_ee_executor_task_t *task); - ucc_status_t (*task_finalize)(ucc_ee_executor_task_t *task); -} ucc_ec_cuda_executor_task_ops_t; - -typedef struct ucc_ec_cuda_executor { - ucc_ee_executor_t super; - ucc_ec_cuda_executor_mode_t mode; - uint64_t requested_ops; - ucc_ec_cuda_executor_task_ops_t ops; - ucc_spinlock_t tasks_lock; - ucc_ec_cuda_executor_state_t state; - int pidx; - ucc_ee_executor_task_args_t *tasks; - ucc_ec_cuda_executor_state_t *dev_state; - ucc_ee_executor_task_args_t *dev_tasks; - int *dev_pidx; - int *dev_cidx; -} ucc_ec_cuda_executor_t; - ucc_status_t ucc_ec_cuda_event_create(void **event); ucc_status_t ucc_ec_cuda_event_destroy(void *event); @@ -122,6 +43,8 @@ ucc_status_t ucc_ec_cuda_event_post(void *ee_context, void *event); ucc_status_t ucc_ec_cuda_event_test(void *event); +ucc_status_t ucc_ec_cuda_get_resources(ucc_ec_cuda_resources_t **resources); + extern ucc_ec_cuda_t ucc_ec_cuda; #define EC_CUDA_CONFIG \ diff --git a/src/components/ec/cuda/ec_cuda_executor.c b/src/components/ec/cuda/ec_cuda_executor.c index 49ae469140..1349187b71 100644 --- a/src/components/ec/cuda/ec_cuda_executor.c +++ b/src/components/ec/cuda/ec_cuda_executor.c @@ -23,8 +23,16 @@ ucc_status_t ucc_cuda_executor_persistent_wait_stop(ucc_ee_executor_t *executor) ucc_status_t ucc_cuda_executor_init(const ucc_ee_executor_params_t *params, ucc_ee_executor_t **executor) { - ucc_ec_cuda_executor_t *eee = ucc_mpool_get(&ucc_ec_cuda.executors); + ucc_ec_cuda_executor_t *eee; + ucc_ec_cuda_resources_t *resources; + ucc_status_t status; + status = ucc_ec_cuda_get_resources(&resources); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + + eee = ucc_mpool_get(&resources->executors); if (ucc_unlikely(!eee)) { ec_error(&ucc_ec_cuda.super, "failed to allocate executor"); return UCC_ERR_NO_MEMORY; diff --git a/src/components/ec/cuda/ec_cuda_executor_interruptible.c b/src/components/ec/cuda/ec_cuda_executor_interruptible.c index 74cc80b96e..7272b2439f 100644 --- a/src/components/ec/cuda/ec_cuda_executor_interruptible.c +++ b/src/components/ec/cuda/ec_cuda_executor_interruptible.c @@ -9,37 +9,43 @@ ucc_status_t ucc_cuda_executor_interruptible_get_stream(cudaStream_t *stream) { - static uint32_t last_used = 0; - int num_streams = EC_CUDA_CONFIG->exec_num_streams; - ucc_status_t st; - int i, j; - uint32_t id; + static uint32_t last_used = 0; + int num_streams = EC_CUDA_CONFIG->exec_num_streams; + ucc_ec_cuda_resources_t *resources; + ucc_status_t st; + int i, j; + uint32_t id; ucc_assert(num_streams > 0); - if (ucc_unlikely(!ucc_ec_cuda.exec_streams_initialized)) { + st = ucc_ec_cuda_get_resources(&resources); + if (ucc_unlikely(st != UCC_OK)) { + return st; + } + + if (ucc_unlikely(!resources->streams_initialized)) { ucc_spin_lock(&ucc_ec_cuda.init_spinlock); - if (ucc_ec_cuda.exec_streams_initialized) { + if (resources->streams_initialized) { goto unlock; } for(i = 0; i < num_streams; i++) { - st = CUDA_FUNC(cudaStreamCreateWithFlags(&ucc_ec_cuda.exec_streams[i], + st = CUDA_FUNC(cudaStreamCreateWithFlags(&resources->exec_streams[i], cudaStreamNonBlocking)); if (st != UCC_OK) { for (j = 0; j < i; j++) { - 
CUDA_FUNC(cudaStreamDestroy(ucc_ec_cuda.exec_streams[j])); + CUDA_FUNC(cudaStreamDestroy(resources->exec_streams[j])); } ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); return st; } } - ucc_ec_cuda.exec_streams_initialized = 1; + resources->streams_initialized = 1; unlock: ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); } id = ucc_atomic_fadd32(&last_used, 1); - *stream = ucc_ec_cuda.exec_streams[id % num_streams]; + *stream = resources->exec_streams[id % num_streams]; return UCC_OK; } @@ -52,20 +58,25 @@ ucc_cuda_executor_interruptible_task_post(ucc_ee_executor_t *executor, const ucc_ee_executor_task_args_t *task_args, ucc_ee_executor_task_t **task) { - cudaStream_t stream = NULL; + cudaStream_t stream = NULL; + size_t num_nodes = UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; ucc_ec_cuda_executor_interruptible_task_t *ee_task; ucc_status_t status; cudaGraphNode_t nodes[UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS]; - size_t num_nodes = UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; + ucc_ec_cuda_resources_t *resources; int i; + status = ucc_ec_cuda_get_resources(&resources); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } status = ucc_cuda_executor_interruptible_get_stream(&stream); if (ucc_unlikely(status != UCC_OK)) { return status; } - ee_task = ucc_mpool_get(&ucc_ec_cuda.executor_interruptible_tasks); + ee_task = ucc_mpool_get(&resources->executor_interruptible_tasks); if (ucc_unlikely(!ee_task)) { return UCC_ERR_NO_MEMORY; } diff --git a/src/components/ec/cuda/ec_cuda_executor_persistent.c b/src/components/ec/cuda/ec_cuda_executor_persistent.c index b937a89680..c43b132e12 100644 --- a/src/components/ec/cuda/ec_cuda_executor_persistent.c +++ b/src/components/ec/cuda/ec_cuda_executor_persistent.c @@ -18,12 +18,19 @@ ucc_cuda_executor_persistent_task_post(ucc_ee_executor_t *executor, ucc_ee_executor_task_args_t *subtask_args; ucc_ec_cuda_executor_persistent_task_t *ee_task; int i; + ucc_ec_cuda_resources_t *resources; + ucc_status_t status; + + status = ucc_ec_cuda_get_resources(&resources); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } if (ucc_ec_cuda.thread_mode == UCC_THREAD_MULTIPLE) { ucc_spin_lock(&eee->tasks_lock); } - ee_task = ucc_mpool_get(&ucc_ec_cuda.executor_persistent_tasks); + ee_task = ucc_mpool_get(&resources->executor_persistent_tasks); if (ucc_unlikely(!ee_task)) { return UCC_ERR_NO_MEMORY; } diff --git a/src/components/ec/cuda/ec_cuda_resources.c b/src/components/ec/cuda/ec_cuda_resources.c new file mode 100644 index 0000000000..5bc0043f1f --- /dev/null +++ b/src/components/ec/cuda/ec_cuda_resources.c @@ -0,0 +1,197 @@ +#include "ec_cuda_resources.h" +#include "components/ec/ucc_ec_log.h" +#include "utils/ucc_malloc.h" + +static void ucc_ec_cuda_event_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused +{ + ucc_ec_cuda_event_t *base = (ucc_ec_cuda_event_t *) obj; + + CUDA_FUNC(cudaEventCreateWithFlags(&base->event, cudaEventDisableTiming)); +} + +static void ucc_ec_cuda_event_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused +{ + ucc_ec_cuda_event_t *base = (ucc_ec_cuda_event_t *) obj; + + CUDA_FUNC(cudaEventDestroy(base->event)); +} + +static ucc_mpool_ops_t ucc_ec_cuda_event_mpool_ops = { + .chunk_alloc = ucc_mpool_hugetlb_malloc, + .chunk_release = ucc_mpool_hugetlb_free, + .obj_init = ucc_ec_cuda_event_init, + .obj_cleanup = ucc_ec_cuda_event_cleanup, +}; + +static ucc_status_t ucc_ec_cuda_ee_executor_mpool_chunk_malloc(ucc_mpool_t *mp, //NOLINT: mp is unused + size_t *size_p, + void ** chunk_p) +{ + return CUDA_FUNC(cudaHostAlloc((void**)chunk_p, 
*size_p, + cudaHostAllocMapped)); +} + +static void ucc_ec_cuda_ee_executor_mpool_chunk_free(ucc_mpool_t *mp, //NOLINT: mp is unused + void *chunk) +{ + CUDA_FUNC(cudaFreeHost(chunk)); +} + +static void ucc_ec_cuda_executor_chunk_init(ucc_mpool_t *mp, void *obj, //NOLINT: mp is unused + void *chunk) //NOLINT: chunk is unused +{ + ucc_ec_cuda_executor_t *eee = (ucc_ec_cuda_executor_t*) obj; + int max_tasks = ucc_ec_cuda_config->exec_max_tasks; + + CUDA_FUNC(cudaHostGetDevicePointer( + (void**)(&eee->dev_state), (void *)&eee->state, 0)); + CUDA_FUNC(cudaHostGetDevicePointer( + (void**)(&eee->dev_pidx), (void *)&eee->pidx, 0)); + CUDA_FUNC(cudaMalloc((void**)&eee->dev_cidx, sizeof(*eee->dev_cidx))); + CUDA_FUNC(cudaHostAlloc((void**)&eee->tasks, + max_tasks * MAX_SUBTASKS * + sizeof(ucc_ee_executor_task_args_t), + cudaHostAllocMapped)); + CUDA_FUNC(cudaHostGetDevicePointer( + (void**)(&eee->dev_tasks), (void *)eee->tasks, 0)); + ucc_spinlock_init(&eee->tasks_lock, 0); +} + +static void ucc_ec_cuda_executor_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused +{ + ucc_ec_cuda_executor_t *eee = (ucc_ec_cuda_executor_t*) obj; + + CUDA_FUNC(cudaFree((void*)eee->dev_cidx)); + CUDA_FUNC(cudaFreeHost((void*)eee->tasks)); + ucc_spinlock_destroy(&eee->tasks_lock); +} + +static ucc_mpool_ops_t ucc_ec_cuda_ee_executor_mpool_ops = { + .chunk_alloc = ucc_ec_cuda_ee_executor_mpool_chunk_malloc, + .chunk_release = ucc_ec_cuda_ee_executor_mpool_chunk_free, + .obj_init = ucc_ec_cuda_executor_chunk_init, + .obj_cleanup = ucc_ec_cuda_executor_chunk_cleanup, +}; + +static void ucc_ec_cuda_graph_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused +{ + ucc_ec_cuda_executor_interruptible_task_t *task = + (ucc_ec_cuda_executor_interruptible_task_t *) obj; + cudaGraphNode_t memcpy_node; + int i; + + CUDA_FUNC(cudaGraphCreate(&task->graph, 0)); + for (i = 0; i < UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; i++) { + CUDA_FUNC( + cudaGraphAddMemcpyNode1D(&memcpy_node, task->graph, NULL, 0, + (void*)1, (void*)1, 1, cudaMemcpyDefault)); + } + + CUDA_FUNC( + cudaGraphInstantiateWithFlags(&task->graph_exec, task->graph, 0)); +} + +static void ucc_ec_cuda_graph_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused +{ + ucc_ec_cuda_executor_interruptible_task_t *task = + (ucc_ec_cuda_executor_interruptible_task_t *) obj; + + CUDA_FUNC(cudaGraphExecDestroy(task->graph_exec)); + CUDA_FUNC(cudaGraphDestroy(task->graph)); +} + +static ucc_mpool_ops_t ucc_ec_cuda_interruptible_task_mpool_ops = { + .chunk_alloc = ucc_mpool_hugetlb_malloc, + .chunk_release = ucc_mpool_hugetlb_free, + .obj_init = ucc_ec_cuda_graph_init, + .obj_cleanup = ucc_ec_cuda_graph_cleanup, +}; + +ucc_status_t ucc_ec_cuda_resources_init(ucc_ec_base_t *ec, + ucc_ec_cuda_resources_t *resources) +{ + ucc_status_t status; + int num_streams; + + CUDADRV_CHECK(cuCtxGetCurrent(&resources->cu_ctx)); + status = ucc_mpool_init(&resources->events, 0, sizeof(ucc_ec_cuda_event_t), + 0, UCC_CACHE_LINE_SIZE, 16, UINT_MAX, + &ucc_ec_cuda_event_mpool_ops, UCC_THREAD_MULTIPLE, + "CUDA Event Objects"); + if (status != UCC_OK) { + ec_error(ec, "failed to create CUDA events pool"); + goto exit_err; + } + + status = ucc_mpool_init(&resources->executors, 0, + sizeof(ucc_ec_cuda_executor_t), 0, + UCC_CACHE_LINE_SIZE, 16, UINT_MAX, + &ucc_ec_cuda_ee_executor_mpool_ops, + UCC_THREAD_MULTIPLE, "CUDA EE executor objects"); + if (status != UCC_OK) { + ec_error(ec, "failed to create executors pool"); + goto free_events_mpool; + } + + status = 
ucc_mpool_init(&resources->executor_interruptible_tasks, 0, + sizeof(ucc_ec_cuda_executor_interruptible_task_t), + 0, UCC_CACHE_LINE_SIZE, 16, UINT_MAX, + &ucc_ec_cuda_interruptible_task_mpool_ops, + UCC_THREAD_MULTIPLE, "interruptible executor tasks"); + if (status != UCC_OK) { + ec_error(ec, "failed to create interruptible tasks pool"); + goto free_executors_mpool; + } + + status = ucc_mpool_init(&resources->executor_persistent_tasks, 0, + sizeof(ucc_ec_cuda_executor_persistent_task_t), 0, + UCC_CACHE_LINE_SIZE, 16, UINT_MAX, NULL, + UCC_THREAD_MULTIPLE, "persistent executor tasks"); + if (status != UCC_OK) { + ec_error(ec, "failed to create persistent tasks pool"); + goto free_interruptible_tasks_mpool; + } + + num_streams = ucc_ec_cuda_config->exec_num_streams; + resources->exec_streams = ucc_calloc(num_streams, sizeof(cudaStream_t), + "ec cuda streams"); + if (!resources->exec_streams) { + ec_error(ec, "failed to allocate %zd bytes for executor streams", + sizeof(cudaStream_t) * num_streams); + status = UCC_ERR_NO_MEMORY; + goto free_persistent_tasks_mpool; + } + + return UCC_OK; + +free_persistent_tasks_mpool: + ucc_mpool_cleanup(&resources->executor_persistent_tasks, 0); +free_interruptible_tasks_mpool: + ucc_mpool_cleanup(&resources->executor_interruptible_tasks, 0); +free_executors_mpool: + ucc_mpool_cleanup(&resources->executors, 0); +free_events_mpool: + ucc_mpool_cleanup(&resources->events, 0); +exit_err: + return status; +} + +void ucc_ec_cuda_resources_cleanup(ucc_ec_cuda_resources_t *resources) +{ + int i; + CUcontext tmp_context; + + cuCtxPushCurrent(resources->cu_ctx); + for (i = 0; i < ucc_ec_cuda_config->exec_num_streams; i++) { + if (resources->exec_streams[i] != NULL) { + CUDA_FUNC(cudaStreamDestroy(resources->exec_streams[i])); + } + } + ucc_mpool_cleanup(&resources->events, 1); + ucc_mpool_cleanup(&resources->executors, 1); + ucc_mpool_cleanup(&resources->executor_interruptible_tasks, 1); + ucc_mpool_cleanup(&resources->executor_persistent_tasks, 1); + + ucc_free(resources->exec_streams); + cuCtxPopCurrent(&tmp_context); +} diff --git a/src/components/ec/cuda/ec_cuda_resources.h b/src/components/ec/cuda/ec_cuda_resources.h new file mode 100644 index 0000000000..1390f76cdd --- /dev/null +++ b/src/components/ec/cuda/ec_cuda_resources.h @@ -0,0 +1,158 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms.
+ */ + +#ifndef UCC_EC_CUDA_RESOURCES_H_ +#define UCC_EC_CUDA_RESOURCES_H_ + +#include "components/ec/base/ucc_ec_base.h" +#include "utils/arch/cuda_def.h" +#include "utils/ucc_mpool.h" +#include "utils/khash.h" + +#define MAX_SUBTASKS 12 + +typedef enum ucc_ec_cuda_executor_state { + UCC_EC_CUDA_EXECUTOR_INITIALIZED, + UCC_EC_CUDA_EXECUTOR_POSTED, + UCC_EC_CUDA_EXECUTOR_STARTED, + UCC_EC_CUDA_EXECUTOR_SHUTDOWN, + UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK +} ucc_ec_cuda_executor_state_t; + +typedef enum ucc_ec_cuda_executor_mode { + UCC_EC_CUDA_EXECUTOR_MODE_PERSISTENT, + UCC_EC_CUDA_EXECUTOR_MODE_INTERRUPTIBLE +} ucc_ec_cuda_executor_mode_t; + +typedef struct ucc_ec_cuda_event { + cudaEvent_t event; +} ucc_ec_cuda_event_t; + +typedef struct ucc_ec_cuda_executor_task_ops { + ucc_status_t (*task_post)(ucc_ee_executor_t *executor, + const ucc_ee_executor_task_args_t *task_args, + ucc_ee_executor_task_t **task); + ucc_status_t (*task_test)(const ucc_ee_executor_task_t *task); + ucc_status_t (*task_finalize)(ucc_ee_executor_task_t *task); +} ucc_ec_cuda_executor_task_ops_t; + +typedef struct ucc_ec_cuda_executor { + ucc_ee_executor_t super; + ucc_ec_cuda_executor_mode_t mode; + uint64_t requested_ops; + ucc_ec_cuda_executor_task_ops_t ops; + ucc_spinlock_t tasks_lock; + ucc_ec_cuda_executor_state_t state; + int pidx; + ucc_ee_executor_task_args_t *tasks; + ucc_ec_cuda_executor_state_t *dev_state; + ucc_ee_executor_task_args_t *dev_tasks; + int *dev_pidx; + int *dev_cidx; +} ucc_ec_cuda_executor_t; + +typedef struct ucc_ec_cuda_executor_interruptible_task { + ucc_ee_executor_task_t super; + void *event; + cudaGraph_t graph; + cudaGraphExec_t graph_exec; +} ucc_ec_cuda_executor_interruptible_task_t; + +typedef struct ucc_ec_cuda_executor_persistent_task { + ucc_ee_executor_task_t super; + int num_subtasks; + ucc_ee_executor_task_args_t *subtasks[MAX_SUBTASKS]; +} ucc_ec_cuda_executor_persistent_task_t; + +typedef struct ucc_ec_cuda_resources { + CUcontext cu_ctx; + ucc_mpool_t events; + ucc_mpool_t executors; + ucc_mpool_t executor_interruptible_tasks; + ucc_mpool_t executor_persistent_tasks; + int streams_initialized; + int num_streams; + cudaStream_t *exec_streams; +} ucc_ec_cuda_resources_t; + +typedef enum ucc_ec_cuda_strm_task_mode { + UCC_EC_CUDA_TASK_KERNEL, + UCC_EC_CUDA_TASK_MEM_OPS, + UCC_EC_CUDA_TASK_AUTO, + UCC_EC_CUDA_TASK_LAST, +} ucc_ec_cuda_strm_task_mode_t; + +typedef struct ucc_ec_cuda_config { + ucc_ec_config_t super; + ucc_ec_cuda_strm_task_mode_t strm_task_mode; + unsigned long exec_num_workers; + unsigned long exec_num_threads; + unsigned long exec_max_tasks; + unsigned long exec_num_streams; + unsigned long reduce_num_blocks; + int reduce_num_threads; + int use_cooperative_launch; + unsigned long exec_copy_thresh; +} ucc_ec_cuda_config_t; + +extern ucc_ec_cuda_config_t *ucc_ec_cuda_config; + +ucc_status_t ucc_ec_cuda_resources_init(ucc_ec_base_t *ec, + ucc_ec_cuda_resources_t *resources); + +void ucc_ec_cuda_resources_cleanup(ucc_ec_cuda_resources_t *resources); + +KHASH_INIT(ucc_ec_cuda_resources_hash, unsigned long long, void*, 1, \ + kh_int64_hash_func, kh_int64_hash_equal); +#define ucc_ec_cuda_resources_hash_t khash_t(ucc_ec_cuda_resources_hash) + +static inline +void* ec_cuda_resources_hash_get(ucc_ec_cuda_resources_hash_t *h, + unsigned long long key) +{ + khiter_t k; + void *value; + + k = kh_get(ucc_ec_cuda_resources_hash, h , key); + if (k == kh_end(h)) { + return NULL; + } + value = kh_value(h, k); + return value; +} + +static inline +void ec_cuda_resources_hash_put(ucc_ec_cuda_resources_hash_t
*h, + unsigned long long key, + void *value) +{ + int ret; + khiter_t k; + k = kh_put(ucc_ec_cuda_resources_hash, h, key, &ret); + kh_value(h, k) = value; +} + +static inline +void* ec_cuda_resources_hash_pop(ucc_ec_cuda_resources_hash_t *h) +{ + void *resources = NULL; + khiter_t k; + + k = kh_begin(h); + while (k != kh_end(h)) { + if (kh_exist(h, k)) { + resources = kh_value(h, k); + break; + } + k++; + } + + if (resources) { + kh_del(ucc_ec_cuda_resources_hash, h, k); + } + return resources; +} + +#endif diff --git a/src/components/ec/ucc_ec.c b/src/components/ec/ucc_ec.c index af83e301b4..42cc096a0c 100644 --- a/src/components/ec/ucc_ec.c +++ b/src/components/ec/ucc_ec.c @@ -4,6 +4,7 @@ * See file LICENSE for terms. */ +#include <pthread.h> #include "config.h" #include "base/ucc_ec_base.h" #include "ucc_ec.h" @@ -13,6 +14,7 @@ static const ucc_ec_ops_t *ec_ops[UCC_EE_LAST]; static const ucc_ee_executor_ops_t *executor_ops[UCC_EE_LAST]; +static pthread_mutex_t ucc_ec_mutex = PTHREAD_MUTEX_INITIALIZER; #define UCC_CHECK_EC_AVAILABLE(ee) \ do { \ @@ -28,6 +30,7 @@ ucc_status_t ucc_ec_init(const ucc_ec_params_t *ec_params) ucc_status_t status; ucc_ec_attr_t attr; + pthread_mutex_lock(&ucc_ec_mutex); memset(ec_ops, 0, UCC_EE_LAST * sizeof(ucc_ec_ops_t *)); n_ecs = ucc_global_config.ec_framework.n_components; for (i = 0; i < n_ecs; i++) { @@ -62,6 +65,7 @@ ucc_status_t ucc_ec_init(const ucc_ec_params_t *ec_params) attr.field_mask = UCC_EC_ATTR_FIELD_THREAD_MODE; status = ec->get_attr(&attr); if (status != UCC_OK) { + pthread_mutex_unlock(&ucc_ec_mutex); return status; } if (attr.thread_mode < ec_params->thread_mode) { @@ -75,6 +79,7 @@ ucc_status_t ucc_ec_init(const ucc_ec_params_t *ec_params) ec_ops[ec->type] = &ec->ops; executor_ops[ec->type] = &ec->executor_ops; } + pthread_mutex_unlock(&ucc_ec_mutex); return UCC_OK; } @@ -102,6 +107,7 @@ ucc_status_t ucc_ec_finalize() ucc_ee_type_t et; ucc_ec_base_t *ec; + pthread_mutex_lock(&ucc_ec_mutex); for (et = UCC_EE_FIRST; et < UCC_EE_LAST; et++) { if (NULL != ec_ops[et]) { ec = ucc_container_of(ec_ops[et], ucc_ec_base_t, ops); @@ -115,6 +121,7 @@ ucc_status_t ucc_ec_finalize() } } } + pthread_mutex_unlock(&ucc_ec_mutex); return UCC_OK; } diff --git a/src/components/mc/base/ucc_mc_base.h b/src/components/mc/base/ucc_mc_base.h index d6c67a734a..442088a09d 100644 --- a/src/components/mc/base/ucc_mc_base.h +++ b/src/components/mc/base/ucc_mc_base.h @@ -71,7 +71,9 @@ typedef struct ucc_mem_attr { * UCC memory component attributes field mask */ typedef enum ucc_mc_attr_field { - UCC_MC_ATTR_FIELD_THREAD_MODE = UCC_BIT(0) + UCC_MC_ATTR_FIELD_THREAD_MODE = UCC_BIT(0), + /* size of memory pool chunk element */ + UCC_MC_ATTR_FIELD_FAST_ALLOC_SIZE = UCC_BIT(1), } ucc_mc_attr_field_t; typedef struct ucc_mc_attr { @@ -81,6 +83,7 @@ */ uint64_t field_mask; ucc_thread_mode_t thread_mode; + size_t fast_alloc_size; } ucc_mc_attr_t; /** diff --git a/src/components/mc/cuda/Makefile.am b/src/components/mc/cuda/Makefile.am index 1e25e2109a..d8e1dbe55e 100644 --- a/src/components/mc/cuda/Makefile.am +++ b/src/components/mc/cuda/Makefile.am @@ -5,8 +5,10 @@ if HAVE_CUDA sources = \ - mc_cuda.h \ - mc_cuda.c + mc_cuda.h \ + mc_cuda.c \ + mc_cuda_resources.c \ + mc_cuda_resources.h module_LTLIBRARIES = libucc_mc_cuda.la libucc_mc_cuda_la_SOURCES = $(sources) diff --git a/src/components/mc/cuda/mc_cuda.c b/src/components/mc/cuda/mc_cuda.c index 5c820bd768..aa2638b9da 100644 --- a/src/components/mc/cuda/mc_cuda.c +++ b/src/components/mc/cuda/mc_cuda.c @@
-50,8 +50,8 @@ static ucc_status_t ucc_mc_cuda_init(const ucc_mc_params_t *mc_params) int num_devices, driver_ver; cudaError_t cuda_st; - ucc_mc_cuda.stream = NULL; - ucc_mc_cuda.stream_initialized = 0; + ucc_mc_cuda_config = ucc_derived_of(ucc_mc_cuda.super.config, + ucc_mc_cuda_config_t); ucc_strncpy_safe(ucc_mc_cuda.super.config->log_component.name, ucc_mc_cuda.super.super.name, sizeof(ucc_mc_cuda.super.config->log_component.name)); @@ -100,6 +100,7 @@ static ucc_status_t ucc_mc_cuda_init(const ucc_mc_params_t *mc_params) "with driver version %d", driver_ver); } #endif + ucc_mc_cuda.resources_hash = kh_init(ucc_mc_cuda_resources_hash); // lock assures single mpool initiation when multiple threads concurrently execute // different collective operations thus concurrently entering init function. ucc_spinlock_init(&ucc_mc_cuda.init_spinlock, 0); @@ -112,6 +113,13 @@ static ucc_status_t ucc_mc_cuda_get_attr(ucc_mc_attr_t *mc_attr) if (mc_attr->field_mask & UCC_MC_ATTR_FIELD_THREAD_MODE) { mc_attr->thread_mode = ucc_mc_cuda.thread_mode; } + if (mc_attr->field_mask & UCC_MC_ATTR_FIELD_FAST_ALLOC_SIZE) { + if (MC_CUDA_CONFIG->mpool_max_elems > 0) { + mc_attr->fast_alloc_size = MC_CUDA_CONFIG->mpool_elem_size; + } else { + mc_attr->fast_alloc_size = 0; + } + } return UCC_OK; } @@ -120,8 +128,9 @@ static ucc_status_t ucc_mc_cuda_mem_alloc(ucc_mc_buffer_header_t **h_ptr, ucc_memory_type_t mt) { cudaError_t st; - ucc_mc_buffer_header_t *h = - ucc_malloc(sizeof(ucc_mc_buffer_header_t), "mc cuda"); + ucc_mc_buffer_header_t *h; + + h = ucc_malloc(sizeof(ucc_mc_buffer_header_t), "mc cuda"); if (ucc_unlikely(!h)) { mc_error(&ucc_mc_cuda.super, "failed to allocate %zd bytes", sizeof(ucc_mc_buffer_header_t)); @@ -132,13 +141,13 @@ static ucc_status_t ucc_mc_cuda_mem_alloc(ucc_mc_buffer_header_t **h_ptr, cudaMemAttachGlobal); if (ucc_unlikely(st != cudaSuccess)) { cudaGetLastError(); - mc_error(&ucc_mc_cuda.super, - "failed to allocate %zd bytes, " + mc_error(&ucc_mc_cuda.super, "failed to allocate %zd bytes, " "cuda error %d(%s)", size, st, cudaGetErrorString(st)); ucc_free(h); return UCC_ERR_NO_MEMORY; } + h->from_pool = 0; h->mt = UCC_MEMORY_TYPE_CUDA; *h_ptr = h; @@ -151,15 +160,25 @@ static ucc_status_t ucc_mc_cuda_mem_pool_alloc(ucc_mc_buffer_header_t **h_ptr, size_t size, ucc_memory_type_t mt) { - ucc_mc_buffer_header_t *h = NULL; - if (size <= MC_CUDA_CONFIG->mpool_elem_size && - mt != UCC_MEMORY_TYPE_CUDA_MANAGED) { - h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool); + ucc_mc_buffer_header_t *h = NULL; + ucc_mc_cuda_resources_t *resources; + ucc_status_t status; + + if ((size <= MC_CUDA_CONFIG->mpool_elem_size) && + (mt != UCC_MEMORY_TYPE_CUDA_MANAGED)) { + status = ucc_mc_cuda_get_resources(&resources); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + + h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&resources->scratch_mpool); } + if (!h) { // Slow path return ucc_mc_cuda_mem_alloc(h_ptr, size, mt); } + if (ucc_unlikely(!h->addr)){ return UCC_ERR_NO_MEMORY; } @@ -168,61 +187,6 @@ static ucc_status_t ucc_mc_cuda_mem_pool_alloc(ucc_mc_buffer_header_t **h_ptr, return UCC_OK; } -static ucc_status_t ucc_mc_cuda_chunk_alloc(ucc_mpool_t *mp, //NOLINT - size_t *size_p, - void **chunk_p) -{ - *chunk_p = ucc_malloc(*size_p, "mc cuda"); - if (!*chunk_p) { - mc_error(&ucc_mc_cuda.super, "failed to allocate %zd bytes", *size_p); - return UCC_ERR_NO_MEMORY; - } - - return UCC_OK; -} - -static void ucc_mc_cuda_chunk_init(ucc_mpool_t *mp, //NOLINT - void *obj, void *chunk) //NOLINT -{ - 
ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj; - cudaError_t st = cudaMalloc(&h->addr, MC_CUDA_CONFIG->mpool_elem_size); - if (st != cudaSuccess) { - // h->addr will be 0 so ucc_mc_cuda_mem_alloc_pool function will - // return UCC_ERR_NO_MEMORY. As such mc_error message is suffice. - cudaGetLastError(); - mc_error(&ucc_mc_cuda.super, - "failed to allocate %zd bytes, " - "cuda error %d(%s)", - MC_CUDA_CONFIG->mpool_elem_size, st, cudaGetErrorString(st)); - } - h->from_pool = 1; - h->mt = UCC_MEMORY_TYPE_CUDA; -} - -static void ucc_mc_cuda_chunk_release(ucc_mpool_t *mp, void *chunk) //NOLINT: mp is unused -{ - ucc_free(chunk); -} - -static void ucc_mc_cuda_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused -{ - ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj; - cudaError_t st; - st = cudaFree(h->addr); - if (st != cudaSuccess) { - cudaGetLastError(); - mc_error(&ucc_mc_cuda.super, - "failed to free mem at %p, " - "cuda error %d(%s)", - obj, st, cudaGetErrorString(st)); - } -} - -static ucc_mpool_ops_t ucc_mc_ops = {.chunk_alloc = ucc_mc_cuda_chunk_alloc, - .chunk_release = ucc_mc_cuda_chunk_release, - .obj_init = ucc_mc_cuda_chunk_init, - .obj_cleanup = ucc_mc_cuda_chunk_cleanup}; - static ucc_status_t ucc_mc_cuda_mem_free(ucc_mc_buffer_header_t *h_ptr) { cudaError_t st; @@ -250,92 +214,72 @@ static ucc_status_t ucc_mc_cuda_mem_pool_free(ucc_mc_buffer_header_t *h_ptr) static ucc_status_t ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr, - size_t size, - ucc_memory_type_t mt) + size_t size, + ucc_memory_type_t mt) { - // lock assures single mpool initiation when multiple threads concurrently execute - // different collective operations thus concurrently entering init function. - ucc_spin_lock(&ucc_mc_cuda.init_spinlock); - if (MC_CUDA_CONFIG->mpool_max_elems == 0) { ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_alloc; ucc_mc_cuda.super.ops.mem_free = ucc_mc_cuda_mem_free; - ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); return ucc_mc_cuda_mem_alloc(h_ptr, size, mt); - } - - if (!ucc_mc_cuda.mpool_init_flag) { - ucc_status_t status = ucc_mpool_init( - &ucc_mc_cuda.mpool, 0, sizeof(ucc_mc_buffer_header_t), 0, - UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems, - &ucc_mc_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers"); - if (status != UCC_OK) { - ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); - return status; - } + } else { ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc; - ucc_mc_cuda.mpool_init_flag = 1; + ucc_mc_cuda.super.ops.mem_free = ucc_mc_cuda_mem_pool_free; + return ucc_mc_cuda_mem_pool_alloc(h_ptr, size, mt); } - ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); - return ucc_mc_cuda_mem_pool_alloc(h_ptr, size, mt); } static ucc_status_t ucc_mc_cuda_memcpy(void *dst, const void *src, size_t len, ucc_memory_type_t dst_mem, ucc_memory_type_t src_mem) { - cudaError_t st; + ucc_status_t status; + ucc_mc_cuda_resources_t *resources; + ucc_assert(dst_mem == UCC_MEMORY_TYPE_CUDA || src_mem == UCC_MEMORY_TYPE_CUDA || dst_mem == UCC_MEMORY_TYPE_CUDA_MANAGED || src_mem == UCC_MEMORY_TYPE_CUDA_MANAGED); - UCC_MC_CUDA_INIT_STREAM(); - st = cudaMemcpyAsync(dst, src, len, cudaMemcpyDefault, ucc_mc_cuda.stream); - if (ucc_unlikely(st != cudaSuccess)) { - cudaGetLastError(); - mc_error(&ucc_mc_cuda.super, - "failed to launch cudaMemcpyAsync, dst %p, src %p, len %zd " - "cuda error %d(%s)", - dst, src, len, st, cudaGetErrorString(st)); - return UCC_ERR_NO_MESSAGE; + status = ucc_mc_cuda_get_resources(&resources); + if 
(ucc_unlikely(status != UCC_OK)) {
+        return status;
     }
-    st = cudaStreamSynchronize(ucc_mc_cuda.stream);
-    if (ucc_unlikely(st != cudaSuccess)) {
-        cudaGetLastError();
+
+    status = CUDA_FUNC(cudaMemcpyAsync(dst, src, len, cudaMemcpyDefault,
+                                       resources->stream));
+    if (ucc_unlikely(status != UCC_OK)) {
         mc_error(&ucc_mc_cuda.super,
-                 "failed to synchronize mc_cuda.stream "
-                 "cuda error %d(%s)",
-                 st, cudaGetErrorString(st));
-        return UCC_ERR_NO_MESSAGE;
+                 "failed to launch cudaMemcpyAsync, dst %p, src %p, len %zd",
+                 dst, src, len);
+        return status;
     }
-    return UCC_OK;
+
+    status = CUDA_FUNC(cudaStreamSynchronize(resources->stream));
+
+    return status;
 }
 
 ucc_status_t ucc_mc_cuda_memset(void *ptr, int val, size_t len)
 {
-    cudaError_t st;
+    ucc_status_t             status;
+    ucc_mc_cuda_resources_t *resources;
 
-    UCC_MC_CUDA_INIT_STREAM();
-    st = cudaMemsetAsync(ptr, val, len, ucc_mc_cuda.stream);
-    if (ucc_unlikely(st != cudaSuccess)) {
-        cudaGetLastError();
-        mc_error(&ucc_mc_cuda.super,
-                 "failed to launch cudaMemsetAsync, dst %p, len %zd "
-                 "cuda error %d(%s)",
-                 ptr, len, st, cudaGetErrorString(st));
-        return UCC_ERR_NO_MESSAGE;
+    status = ucc_mc_cuda_get_resources(&resources);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
     }
-    st = cudaStreamSynchronize(ucc_mc_cuda.stream);
-    if (ucc_unlikely(st != cudaSuccess)) {
-        cudaGetLastError();
+
+    status = CUDA_FUNC(cudaMemsetAsync(ptr, val, len, resources->stream));
+    if (ucc_unlikely(status != UCC_OK)) {
         mc_error(&ucc_mc_cuda.super,
-                 "failed to synchronize mc_cuda.stream "
-                 "cuda error %d(%s)",
-                 st, cudaGetErrorString(st));
-        return UCC_ERR_NO_MESSAGE;
+                 "failed to launch cudaMemsetAsync, dst %p, len %zd",
+                 ptr, len);
+        return status;
     }
-    return UCC_OK;
+
+    status = CUDA_FUNC(cudaStreamSynchronize(resources->stream));
+
+    return status;
 }
 
 static ucc_status_t ucc_mc_cuda_mem_query(const void *ptr,
@@ -407,17 +351,69 @@ static ucc_status_t ucc_mc_cuda_mem_query(const void *ptr,
     return UCC_OK;
 }
 
-static ucc_status_t ucc_mc_cuda_finalize()
+ucc_status_t ucc_mc_cuda_get_resources(ucc_mc_cuda_resources_t **resources)
 {
-    if (ucc_mc_cuda.stream != NULL) {
-        CUDA_CHECK(cudaStreamDestroy(ucc_mc_cuda.stream));
-        ucc_mc_cuda.stream = NULL;
+    CUcontext              cu_ctx;
+    unsigned long long int cu_ctx_id;
+    ucc_status_t           status;
+
+    status = CUDADRV_FUNC(cuCtxGetCurrent(&cu_ctx));
+    if (ucc_unlikely(status != UCC_OK)) {
+        mc_error(&ucc_mc_cuda.super, "failed to get current CUDA context");
+        return status;
+    }
+
+#if CUDA_VERSION < 12000
+    cu_ctx_id = 1;
+#else
+    status = CUDADRV_FUNC(cuCtxGetId(cu_ctx, &cu_ctx_id));
+    if (ucc_unlikely(status != UCC_OK)) {
+        mc_error(&ucc_mc_cuda.super, "failed to get current CUDA context ID");
+        return status;
+    }
+#endif
+
+    *resources = mc_cuda_resources_hash_get(ucc_mc_cuda.resources_hash,
+                                            cu_ctx_id);
+    if (ucc_unlikely(*resources == NULL)) {
+        ucc_spin_lock(&ucc_mc_cuda.init_spinlock);
+        *resources = mc_cuda_resources_hash_get(ucc_mc_cuda.resources_hash,
+                                                cu_ctx_id);
+        if (*resources == NULL) {
+            *resources = ucc_malloc(sizeof(ucc_mc_cuda_resources_t),
+                                    "mc cuda resources");
+            if (*resources == NULL) {
+                mc_error(&ucc_mc_cuda.super,
+                         "failed to allocate %zd bytes for resources",
+                         sizeof(ucc_mc_cuda_resources_t));
+                ucc_spin_unlock(&ucc_mc_cuda.init_spinlock);
+                return UCC_ERR_NO_MEMORY;
+            }
+            status = ucc_mc_cuda_resources_init(&ucc_mc_cuda.super,
+                                                *resources);
+            if (status != UCC_OK) {
+                ucc_free(*resources);
+                ucc_spin_unlock(&ucc_mc_cuda.init_spinlock);
+                return status;
+            }
+            mc_cuda_resources_hash_put(ucc_mc_cuda.resources_hash, 
cu_ctx_id, + *resources); + } + ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); } - if (ucc_mc_cuda.mpool_init_flag) { - ucc_mpool_cleanup(&ucc_mc_cuda.mpool, 1); - ucc_mc_cuda.mpool_init_flag = 0; - ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc_with_init; + return UCC_OK; +} + +static ucc_status_t ucc_mc_cuda_finalize() +{ + ucc_mc_cuda_resources_t *resources; + + resources = mc_cuda_resources_hash_pop(ucc_mc_cuda.resources_hash); + while (resources) { + ucc_mc_cuda_resources_cleanup(resources); + resources = mc_cuda_resources_hash_pop(ucc_mc_cuda.resources_hash); } + + ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc_with_init; ucc_spinlock_destroy(&ucc_mc_cuda.init_spinlock); return UCC_OK; } @@ -443,8 +439,9 @@ ucc_mc_cuda_t ucc_mc_cuda = { .table = ucc_mc_cuda_config_table, .size = sizeof(ucc_mc_cuda_config_t), }, - .mpool_init_flag = 0, }; +ucc_mc_cuda_config_t *ucc_mc_cuda_config; + UCC_CONFIG_REGISTER_TABLE_ENTRY(&ucc_mc_cuda.super.config_table, &ucc_config_global_list); diff --git a/src/components/mc/cuda/mc_cuda.h b/src/components/mc/cuda/mc_cuda.h index abc82312c2..10779c27cb 100644 --- a/src/components/mc/cuda/mc_cuda.h +++ b/src/components/mc/cuda/mc_cuda.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -7,29 +7,18 @@ #ifndef UCC_MC_CUDA_H_ #define UCC_MC_CUDA_H_ -#include +#include #include "components/mc/base/ucc_mc_base.h" #include "components/mc/ucc_mc_log.h" #include "utils/ucc_mpool.h" #include "utils/arch/cuda_def.h" -#include - -typedef struct ucc_mc_cuda_config { - ucc_mc_config_t super; - size_t mpool_elem_size; - int mpool_max_elems; -} ucc_mc_cuda_config_t; +#include "mc_cuda_resources.h" typedef struct ucc_mc_cuda { ucc_mc_base_t super; - int stream_initialized; - cudaStream_t stream; - ucc_mpool_t events; - ucc_mpool_t strm_reqs; - ucc_mpool_t mpool; - int mpool_init_flag; ucc_spinlock_t init_spinlock; ucc_thread_mode_t thread_mode; + ucc_mc_cuda_resources_hash_t *resources_hash; } ucc_mc_cuda_t; extern ucc_mc_cuda_t ucc_mc_cuda; @@ -37,21 +26,7 @@ extern ucc_mc_cuda_t ucc_mc_cuda; #define MC_CUDA_CONFIG \ (ucc_derived_of(ucc_mc_cuda.super.config, ucc_mc_cuda_config_t)) -#define UCC_MC_CUDA_INIT_STREAM() do { \ - if (!ucc_mc_cuda.stream_initialized) { \ - cudaError_t cuda_st = cudaSuccess; \ - ucc_spin_lock(&ucc_mc_cuda.init_spinlock); \ - if (!ucc_mc_cuda.stream_initialized) { \ - cuda_st = cudaStreamCreateWithFlags(&ucc_mc_cuda.stream, \ - cudaStreamNonBlocking); \ - ucc_mc_cuda.stream_initialized = 1; \ - } \ - ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); \ - if (ucc_unlikely(cudaSuccess != cuda_st)) { \ - return cuda_error_to_ucc_status(cuda_st); \ - } \ - } \ -} while(0) +ucc_status_t ucc_mc_cuda_get_resources(ucc_mc_cuda_resources_t **resources); ucc_status_t ucc_mc_cuda_memset(void *ptr, int val, size_t len); diff --git a/src/components/mc/cuda/mc_cuda_resources.c b/src/components/mc/cuda/mc_cuda_resources.c new file mode 100644 index 0000000000..398b83784e --- /dev/null +++ b/src/components/mc/cuda/mc_cuda_resources.c @@ -0,0 +1,92 @@ +#include "mc_cuda_resources.h" +#include "components/mc/ucc_mc_log.h" +#include "utils/ucc_malloc.h" + +static ucc_status_t ucc_mc_cuda_chunk_alloc(ucc_mpool_t *mp, //NOLINT + size_t *size_p, + void **chunk_p) +{ + *chunk_p = ucc_malloc(*size_p, "mc cuda"); + if (!*chunk_p) { + return UCC_ERR_NO_MEMORY; + } + + return UCC_OK; +} + 
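
For readers unfamiliar with the pattern used by ucc_mc_cuda_get_resources in mc_cuda.c above: resources are created once per CUDA context ID and cached in a hash, with an unlocked fast-path lookup and a locked re-check before creation (double-checked locking). Below is a minimal standalone sketch of that lookup-or-create idiom; the names (rsrc_t, rsrc_get) are hypothetical, and a plain pthread mutex plus a linked list stand in for ucc_spinlock and khash:

    #include <pthread.h>
    #include <stdlib.h>

    /* Hypothetical stand-in for ucc_mc_cuda_resources_t. */
    typedef struct rsrc {
        unsigned long long ctx_id; /* key: CUDA context ID */
        struct rsrc       *next;
    } rsrc_t;

    static rsrc_t         *rsrc_list;                             /* the "hash" */
    static pthread_mutex_t rsrc_lock = PTHREAD_MUTEX_INITIALIZER; /* the "init_spinlock" */

    static rsrc_t *rsrc_find(unsigned long long id)
    {
        rsrc_t *r;

        for (r = rsrc_list; r != NULL; r = r->next) {
            if (r->ctx_id == id) {
                return r;
            }
        }
        return NULL;
    }

    /* Lookup-or-create with double-checked locking. */
    rsrc_t *rsrc_get(unsigned long long id)
    {
        rsrc_t *r = rsrc_find(id);   /* fast path: no lock taken */

        if (r == NULL) {
            pthread_mutex_lock(&rsrc_lock);
            /* Re-check: another thread may have created the entry
             * between our miss and acquiring the lock. */
            r = rsrc_find(id);
            if (r == NULL) {
                r = calloc(1, sizeof(*r));
                if (r != NULL) {
                    r->ctx_id = id;        /* initialize fully first ...      */
                    r->next   = rsrc_list;
                    rsrc_list = r;         /* ... then publish with one store */
                }
            }
            pthread_mutex_unlock(&rsrc_lock);
        }
        return r;
    }

The re-check under the lock is what prevents two threads that both missed on the fast path from each creating and inserting an entry. Note that the sketch keeps the unlocked fast-path read tolerable by publishing a fully initialized node with a single pointer store; an unlocked kh_get racing with a rehashing kh_put, as in the patch, is only safe under additional assumptions about how the component is used.
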
+static void ucc_mc_cuda_chunk_init(ucc_mpool_t *mp, //NOLINT
+                                   void *obj, void *chunk) //NOLINT
+{
+    ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj;
+    cudaError_t st;
+
+    st = cudaMalloc(&h->addr, ucc_mc_cuda_config->mpool_elem_size);
+    if (st != cudaSuccess) {
+        // h->addr will be 0, so ucc_mc_cuda_mem_pool_alloc will return
+        // UCC_ERR_NO_MEMORY; just reset the CUDA error state here.
+        cudaGetLastError();
+    }
+    h->from_pool = 1;
+    h->mt        = UCC_MEMORY_TYPE_CUDA;
+}
+
+static void ucc_mc_cuda_chunk_release(ucc_mpool_t *mp, void *chunk) //NOLINT: mp is unused
+{
+    ucc_free(chunk);
+}
+
+static void ucc_mc_cuda_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused
+{
+    ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj;
+    cudaError_t st;
+
+    st = cudaFree(h->addr);
+    if (st != cudaSuccess) {
+        cudaGetLastError();
+    }
+}
+
+static ucc_mpool_ops_t ucc_mc_ops = {.chunk_alloc   = ucc_mc_cuda_chunk_alloc,
+                                     .chunk_release = ucc_mc_cuda_chunk_release,
+                                     .obj_init      = ucc_mc_cuda_chunk_init,
+                                     .obj_cleanup   = ucc_mc_cuda_chunk_cleanup};
+
+ucc_status_t ucc_mc_cuda_resources_init(ucc_mc_base_t *mc,
+                                        ucc_mc_cuda_resources_t *resources)
+{
+    ucc_status_t status;
+
+    CUDADRV_CHECK(cuCtxGetCurrent(&resources->cu_ctx));
+    status = ucc_mpool_init(&resources->scratch_mpool, 0,
+                            sizeof(ucc_mc_buffer_header_t), 0,
+                            UCC_CACHE_LINE_SIZE, 1,
+                            ucc_mc_cuda_config->mpool_max_elems, &ucc_mc_ops,
+                            UCC_THREAD_MULTIPLE, "mc cuda mpool buffers");
+    if (status != UCC_OK) {
+        mc_error(mc, "failed to create scratch buffers mpool");
+        return status;
+    }
+
+    status = CUDA_FUNC(cudaStreamCreateWithFlags(&resources->stream,
+                                                 cudaStreamNonBlocking));
+    if (status != UCC_OK) {
+        mc_error(mc, "failed to create CUDA stream");
+        goto free_scratch_mpool;
+    }
+
+    return UCC_OK;
+
+free_scratch_mpool:
+    ucc_mpool_cleanup(&resources->scratch_mpool, 0);
+    return status;
+}
+
+void ucc_mc_cuda_resources_cleanup(ucc_mc_cuda_resources_t *resources)
+{
+    CUcontext tmp_context;
+
+    cuCtxPushCurrent(resources->cu_ctx);
+    ucc_mpool_cleanup(&resources->scratch_mpool, 1);
+    CUDA_FUNC(cudaStreamDestroy(resources->stream));
+    cuCtxPopCurrent(&tmp_context);
+}
diff --git a/src/components/mc/cuda/mc_cuda_resources.h b/src/components/mc/cuda/mc_cuda_resources.h
new file mode 100644
index 0000000000..557effe3c0
--- /dev/null
+++ b/src/components/mc/cuda/mc_cuda_resources.h
@@ -0,0 +1,84 @@
+/**
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */ + +#ifndef UCC_MC_CUDA_RESOURCES_H_ +#define UCC_MC_CUDA_RESOURCES_H_ + +#include "components/mc/base/ucc_mc_base.h" +#include "utils/arch/cuda_def.h" +#include "utils/ucc_mpool.h" + +typedef struct ucc_mc_cuda_config { + ucc_mc_config_t super; + size_t mpool_elem_size; + int mpool_max_elems; +} ucc_mc_cuda_config_t; + +typedef struct ucc_mc_cuda_resources { + CUcontext cu_ctx; + cudaStream_t stream; + ucc_mpool_t scratch_mpool; +} ucc_mc_cuda_resources_t; + +extern ucc_mc_cuda_config_t *ucc_mc_cuda_config; + +ucc_status_t ucc_mc_cuda_resources_init(ucc_mc_base_t *mc, + ucc_mc_cuda_resources_t *resources); + +void ucc_mc_cuda_resources_cleanup(ucc_mc_cuda_resources_t *resources); + +KHASH_INIT(ucc_mc_cuda_resources_hash, unsigned long long, void*, 1, \ + kh_int64_hash_func, kh_int64_hash_equal); +#define ucc_mc_cuda_resources_hash_t khash_t(ucc_mc_cuda_resources_hash) + +static inline +void* mc_cuda_resources_hash_get(ucc_mc_cuda_resources_hash_t *h, + unsigned long long key) +{ + khiter_t k; + void *value; + + k = kh_get(ucc_mc_cuda_resources_hash, h , key); + if (k == kh_end(h)) { + return NULL; + } + value = kh_value(h, k); + return value; +} + +static inline +void mc_cuda_resources_hash_put(ucc_mc_cuda_resources_hash_t *h, + unsigned long long key, + void *value) +{ + int ret; + khiter_t k; + k = kh_put(ucc_mc_cuda_resources_hash, h, key, &ret); + kh_value(h, k) = value; +} + +static inline +void* mc_cuda_resources_hash_pop(ucc_mc_cuda_resources_hash_t *h) +{ + void *resources = NULL; + khiter_t k; + + k = kh_begin(h); + while (k != kh_end(h)) { + if (kh_exist(h, k)) { + resources = kh_value(h, k); + break; + } + k++; + } + + if (resources) { + kh_del(ucc_mc_cuda_resources_hash, h, k); + } + return resources; +} + +#endif diff --git a/src/components/mc/ucc_mc.c b/src/components/mc/ucc_mc.c index 2e8208d0f0..ad3de3a94b 100644 --- a/src/components/mc/ucc_mc.c +++ b/src/components/mc/ucc_mc.c @@ -121,6 +121,17 @@ ucc_status_t ucc_mc_get_mem_attr(const void *ptr, ucc_mem_attr_t *mem_attr) return UCC_OK; } +ucc_status_t ucc_mc_get_attr(ucc_mc_attr_t *attr, ucc_memory_type_t mem_type) +{ + ucc_memory_type_t mt = (mem_type == UCC_MEMORY_TYPE_CUDA_MANAGED) ? + UCC_MEMORY_TYPE_CUDA : mem_type; + ucc_mc_base_t *mc; + + UCC_CHECK_MC_AVAILABLE(mt); + mc = ucc_container_of(mc_ops[mt], ucc_mc_base_t, ops); + return mc->get_attr(attr); +} + UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_alloc, (h_ptr, size, mem_type), ucc_mc_buffer_header_t **h_ptr, size_t size, ucc_memory_type_t mem_type) @@ -134,8 +145,11 @@ UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_alloc, (h_ptr, size, mem_type), ucc_status_t ucc_mc_free(ucc_mc_buffer_header_t *h_ptr) { - UCC_CHECK_MC_AVAILABLE(h_ptr->mt); - return mc_ops[h_ptr->mt]->mem_free(h_ptr); + ucc_memory_type_t mt = (h_ptr->mt == UCC_MEMORY_TYPE_CUDA_MANAGED) ? 
+ UCC_MEMORY_TYPE_CUDA : h_ptr->mt; + + UCC_CHECK_MC_AVAILABLE(mt); + return mc_ops[mt]->mem_free(h_ptr); } UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_memcpy, diff --git a/src/components/mc/ucc_mc.h b/src/components/mc/ucc_mc.h index e0ce1030c8..e98396b2f7 100644 --- a/src/components/mc/ucc_mc.h +++ b/src/components/mc/ucc_mc.h @@ -24,6 +24,8 @@ ucc_status_t ucc_mc_available(ucc_memory_type_t mem_type); */ ucc_status_t ucc_mc_get_mem_attr(const void *ptr, ucc_mem_attr_t *mem_attr); +ucc_status_t ucc_mc_get_attr(ucc_mc_attr_t *attr, ucc_memory_type_t mem_type); + ucc_status_t ucc_mc_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_memory_type_t mem_type); diff --git a/src/components/tl/cuda/tl_cuda_topo.c b/src/components/tl/cuda/tl_cuda_topo.c index 96862e921e..a0f54d57e6 100644 --- a/src/components/tl/cuda/tl_cuda_topo.c +++ b/src/components/tl/cuda/tl_cuda_topo.c @@ -220,7 +220,8 @@ static ucc_status_t ucc_tl_cuda_topo_graph_create(ucc_tl_cuda_topo_t *topo) ucc_tl_cuda_topo_dev_type_t dev_type; ucc_tl_cuda_device_pci_id_t pci_id; ucc_tl_cuda_topo_node_t *node, *peer_node; - int num_gpus, num_nvlinks, link, i; + int num_nvlinks, link, i; + unsigned int num_gpus; nvmlReturn_t nvml_st; nvml_st = nvmlInit_v2(); diff --git a/src/components/tl/mlx5/Makefile.am b/src/components/tl/mlx5/Makefile.am index 11aec4e5b6..2ac9dc91c7 100644 --- a/src/components/tl/mlx5/Makefile.am +++ b/src/components/tl/mlx5/Makefile.am @@ -23,6 +23,7 @@ mcast = \ mcast/p2p/ucc_tl_mlx5_mcast_p2p.c \ mcast/tl_mlx5_mcast_progress.h \ mcast/tl_mlx5_mcast_helper.h \ + mcast/tl_mlx5_mcast_helper.c \ mcast/tl_mlx5_mcast_team.c sources = \ diff --git a/src/components/tl/mlx5/alltoall/alltoall_mkeys.c b/src/components/tl/mlx5/alltoall/alltoall_mkeys.c index 7dd90d49b8..0fa197e6c7 100644 --- a/src/components/tl/mlx5/alltoall/alltoall_mkeys.c +++ b/src/components/tl/mlx5/alltoall/alltoall_mkeys.c @@ -217,7 +217,6 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team, if (!node->ops[i].send_mkeys) { tl_error(lib, "failed to malloc"); goto err_malloc; - return UCC_ERR_NO_MEMORY; } node->ops[i].recv_mkeys = (struct mlx5dv_mkey **)ucc_malloc( sizeof(struct mlx5dv_mkey *) * a2a->max_num_of_columns); @@ -230,7 +229,7 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team, status = create_master_key(node->sbgp->group_size + 1, a2a->pd, &node->ops[i].send_mkeys[j], lib); if (status != UCC_OK) { - tl_error(lib, " failed to create send masterkey [%d,%d]", i, j); + tl_error(lib, "failed to create send masterkey [%d,%d]", i, j); goto err_create_mkey; } status = create_master_key(node->sbgp->group_size + 1, a2a->pd, diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_context.c b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_context.c index 90014d1400..ad32c459b0 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_context.c +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_context.c @@ -10,9 +10,242 @@ #include #include "core/ucc_service_coll.h" #include "tl_mlx5.h" +#include "tl_mlx5_mcast_helper.h" +#include "tl_mlx5_mcast_rcache.h" -ucc_status_t ucc_tl_mlx5_mcast_context_init(ucc_tl_mlx5_mcast_context_t *context, /* NOLINT */ - ucc_tl_mlx5_mcast_ctx_params_t *mcast_ctx_conf /* NOLINT */) +#define UCC_TL_MLX5_MCAST_MAX_MTU_COUNT 5 +int mtu_lookup[UCC_TL_MLX5_MCAST_MAX_MTU_COUNT][2] = { + {256, IBV_MTU_256}, + {512, IBV_MTU_512}, + {1024, IBV_MTU_1024}, + {2048, IBV_MTU_2048}, + {4096, IBV_MTU_4096} +}; + +ucc_status_t ucc_tl_mlx5_mcast_context_init(ucc_tl_mlx5_mcast_context_t *context, + 
ucc_tl_mlx5_mcast_ctx_params_t *mcast_ctx_conf) { + ucc_status_t status = UCC_OK; + struct ibv_device **device_list = NULL; + struct ibv_device *dev = NULL; + char *devname = NULL; + int is_ipv4 = 0; + struct sockaddr_in *in_src_addr = NULL; + struct rdma_cm_event *revent = NULL; + char *ib = NULL; + char *ib_name = NULL; + char *port = NULL; + int active_mtu = 4096; + int max_mtu = 4096; + ucc_tl_mlx5_mcast_coll_context_t *ctx = NULL; + struct ibv_port_attr port_attr; + struct ibv_device_attr device_attr; + struct sockaddr_storage ip_oib_addr; + struct sockaddr_storage dst_addr; + int num_devices; + char addrstr[128]; + ucc_tl_mlx5_context_t *mlx5_ctx; + ucc_base_lib_t *lib; + int i; + int user_provided_ib; + int ib_valid; + const char *dst; + + ctx = &(context->mcast_context); + memset(ctx, 0, sizeof(ucc_tl_mlx5_mcast_coll_context_t)); + memcpy(&ctx->params, mcast_ctx_conf, sizeof(ucc_tl_mlx5_mcast_ctx_params_t)); + + mlx5_ctx = ucc_container_of(context, ucc_tl_mlx5_context_t, mcast); + lib = mlx5_ctx->super.super.lib; + ctx->lib = lib; + + /* TODO unify all the contexts under TL mlx5 */ + device_list = ibv_get_device_list(&num_devices); + if (!device_list || !num_devices) { + tl_debug(lib, "no ib devices available"); + status = UCC_ERR_NOT_SUPPORTED; + goto error; + } + + if (!strcmp(mcast_ctx_conf->ib_dev_name, "")) { + dev = device_list[0]; + devname = (char *)ibv_get_device_name(dev); + ctx->devname = ucc_malloc(strlen(devname)+3, "devname"); + if (!ctx->devname) { + status = UCC_ERR_NO_MEMORY; + goto error; + } + memset(ctx->devname, 0, strlen(devname)+3); + memcpy(ctx->devname, devname, strlen(devname)); + strncat(ctx->devname, ":1", 3); + user_provided_ib = 0; + } else { + ib_valid = 0; + /* user has provided the devname now make sure it is valid */ + for (i = 0; device_list[i]; ++i) { + if (!strcmp(ibv_get_device_name(device_list[i]), mcast_ctx_conf->ib_dev_name)) { + ib_valid = 1; + break; + } + } + if (!ib_valid) { + tl_warn(lib, "ib device %s not found", mcast_ctx_conf->ib_dev_name); + status = UCC_ERR_NOT_FOUND; + ibv_free_device_list(device_list); + goto error; + } + ctx->devname = mcast_ctx_conf->ib_dev_name; + user_provided_ib = 1; + } + + ibv_free_device_list(device_list); + + status = ucc_tl_mlx5_probe_ip_over_ib(ctx->devname, &ip_oib_addr); + if (UCC_OK != status) { + tl_debug(lib, "failed to get ipoib interface for devname %s", ctx->devname); + if (!user_provided_ib) { + ucc_free(ctx->devname); + } + goto error; + } + + is_ipv4 = (ip_oib_addr.ss_family == AF_INET) ? 1 : 0; + in_src_addr = (struct sockaddr_in*)&ip_oib_addr; + + dst = inet_ntop((is_ipv4) ? AF_INET : AF_INET6, + &in_src_addr->sin_addr, addrstr, sizeof(addrstr) - 1); + if (NULL == dst) { + tl_error(lib, "inet_ntop failed"); + status = UCC_ERR_NO_RESOURCE; + goto error; + } + + tl_debug(ctx->lib, "devname %s, ipoib %s", ctx->devname, addrstr); + + ctx->channel = rdma_create_event_channel(); + if (!ctx->channel) { + tl_debug(lib, "rdma_create_event_channel failed, errno %d", errno); + status = UCC_ERR_NO_RESOURCE; + goto error; + } + + memset(&dst_addr, 0, sizeof(struct sockaddr_storage)); + dst_addr.ss_family = is_ipv4 ? 
AF_INET : AF_INET6;
+    if (rdma_create_id(ctx->channel, &ctx->id, NULL, RDMA_PS_UDP)) {
+        tl_debug(lib, "failed to create rdma id, errno %d", errno);
+        status = UCC_ERR_NOT_SUPPORTED;
+        goto error;
+    }
+
+    if (0 != rdma_resolve_addr(ctx->id, (struct sockaddr *)&ip_oib_addr,
+                               (struct sockaddr *)&dst_addr, 1000)) {
+        tl_debug(lib, "failed to resolve rdma addr, errno %d", errno);
+        status = UCC_ERR_NOT_SUPPORTED;
+        goto error;
+    }
+
+    if (rdma_get_cm_event(ctx->channel, &revent) < 0) {
+        tl_error(lib, "failed to get cm event, errno %d", errno);
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    } else if (revent->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
+        tl_error(lib, "cm event is not resolved");
+        if (rdma_ack_cm_event(revent) < 0) {
+            tl_error(lib, "rdma_ack_cm_event failed");
+        }
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    }
+
+    if (rdma_ack_cm_event(revent) < 0) {
+        tl_error(lib, "rdma_ack_cm_event failed");
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    }
+
+    ctx->ctx = ctx->id->verbs;
+    ctx->pd  = ibv_alloc_pd(ctx->ctx);
+    if (!ctx->pd) {
+        tl_error(lib, "failed to allocate pd");
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    }
+
+    ib = strdup(ctx->devname);
+    ucc_string_split(ib, ":", 2, &ib_name, &port);
+    ctx->ib_port = atoi(port);
+    ucc_free(ib);
+
+    /* Determine MTU */
+    if (ibv_query_port(ctx->ctx, ctx->ib_port, &port_attr)) {
+        tl_error(lib, "couldn't query port in ctx create, errno %d", errno);
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    }
+
+    for (i = 0; i < UCC_TL_MLX5_MCAST_MAX_MTU_COUNT; i++) {
+        if (mtu_lookup[i][1] == port_attr.max_mtu) {
+            max_mtu = mtu_lookup[i][0];
+        }
+        if (mtu_lookup[i][1] == port_attr.active_mtu) {
+            active_mtu = mtu_lookup[i][0];
+        }
+    }
+
+    ctx->mtu = active_mtu;
+
+    tl_debug(ctx->lib, "port active MTU is %d and port max MTU is %d",
+             active_mtu, max_mtu);
+
+    if (port_attr.max_mtu < port_attr.active_mtu) {
+        tl_debug(ctx->lib, "port max MTU (%d) is smaller than port active MTU (%d)",
+                 max_mtu, active_mtu);
+    }
+
+    if (ibv_query_device(ctx->ctx, &device_attr)) {
+        tl_error(lib, "failed to query device in ctx create, errno %d", errno);
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    }
+
+    tl_debug(ctx->lib, "MTU %d, MAX QP WR: %d, max srq_wr: %d, max cq: %d, max cqe: %d",
+             ctx->mtu, device_attr.max_qp_wr, device_attr.max_srq_wr,
+             device_attr.max_cq, device_attr.max_cqe);
+
+    ctx->max_qp_wr = device_attr.max_qp_wr;
+    status = ucc_mpool_init(&ctx->compl_objects_mp, 0, sizeof(ucc_tl_mlx5_mcast_p2p_completion_obj_t), 0,
+                            UCC_CACHE_LINE_SIZE, 8, UINT_MAX,
+                            &ucc_coll_task_mpool_ops,
+                            UCC_THREAD_SINGLE,
+                            "ucc_tl_mlx5_mcast_p2p_completion_obj_t");
+    if (ucc_unlikely(UCC_OK != status)) {
+        tl_error(lib, "failed to initialize compl_objects_mp mpool");
+        status = UCC_ERR_NO_MEMORY;
+        goto error;
+    }
+
+    ctx->rcache = NULL;
+    status      = ucc_tl_mlx5_mcast_setup_rcache(ctx);
+    if (UCC_OK != status) {
+        tl_error(lib, "failed to setup rcache");
+        goto error;
+    }
+
+    tl_debug(ctx->lib, "multicast context setup complete: ctx %p", ctx);
+    return UCC_OK;
+
+error:
+    if (ctx->pd) {
+        ibv_dealloc_pd(ctx->pd);
+    }
+    if (ctx->id) {
+        rdma_destroy_id(ctx->id);
+    }
+    if (ctx->channel) {
+        rdma_destroy_event_channel(ctx->channel);
+    }
+
+    return status;
+}
diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c
new file mode 100644
index 0000000000..8c52a63c73
--- /dev/null
+++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c
@@ -0,0 +1,561 @@
+/**
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION 
& AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "tl_mlx5_mcast_helper.h"
+#include <glob.h>
+#include <net/if.h>
+#include <ifaddrs.h>
+
+#define PREF        "/sys/class/net/"
+#define SUFF        "/device/resource"
+#define MAX_STR_LEN 128
+
+static ucc_status_t ucc_tl_mlx5_get_ipoib_ip(char *ifname, struct sockaddr_storage *addr)
+{
+    ucc_status_t    status  = UCC_ERR_NO_RESOURCE;
+    struct ifaddrs *ifaddr  = NULL;
+    struct ifaddrs *ifa     = NULL;
+    int             is_ipv4 = 0;
+    int             family;
+    int             n;
+    int             is_up;
+
+    if (getifaddrs(&ifaddr) == -1) {
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    for (ifa = ifaddr, n = 0; ifa != NULL; ifa = ifa->ifa_next, n++) {
+        if (ifa->ifa_addr == NULL) {
+            continue;
+        }
+
+        family = ifa->ifa_addr->sa_family;
+        if (family != AF_INET && family != AF_INET6) {
+            continue;
+        }
+
+        is_up   = (ifa->ifa_flags & IFF_UP) == IFF_UP;
+        is_ipv4 = (family == AF_INET) ? 1 : 0;
+
+        if (is_up && !strncmp(ifa->ifa_name, ifname, strlen(ifname))) {
+            if (is_ipv4) {
+                memcpy((struct sockaddr_in *)addr,
+                       (struct sockaddr_in *)ifa->ifa_addr,
+                       sizeof(struct sockaddr_in));
+            } else {
+                memcpy((struct sockaddr_in6 *)addr,
+                       (struct sockaddr_in6 *)ifa->ifa_addr,
+                       sizeof(struct sockaddr_in6));
+            }
+
+            status = UCC_OK;
+            break;
+        }
+    }
+
+    freeifaddrs(ifaddr);
+    return status;
+}
+
+static int cmp_files(char *f1, char *f2)
+{
+    int   answer = 0;
+    FILE *fp1;
+    FILE *fp2;
+    int   ch1;
+    int   ch2;
+
+    if ((fp1 = fopen(f1, "r")) == NULL) {
+        goto out;
+    } else if ((fp2 = fopen(f2, "r")) == NULL) {
+        goto close;
+    }
+
+    do {
+        ch1 = getc(fp1);
+        ch2 = getc(fp2);
+    } while ((ch1 != EOF) && (ch2 != EOF) && (ch1 == ch2));
+
+    if (ch1 == ch2) {
+        answer = 1;
+    }
+
+    if (fclose(fp2) != 0) {
+        return 0;
+    }
+close:
+    if (fclose(fp1) != 0) {
+        return 0;
+    }
+out:
+    return answer;
+}
+
+static int port_from_file(char *port_file)
+{
+    int   res = -1;
+    char  buf1[MAX_STR_LEN];
+    char  buf2[MAX_STR_LEN];
+    FILE *fp;
+    int   len;
+
+    if ((fp = fopen(port_file, "r")) == NULL) {
+        return -1;
+    }
+
+    if (fgets(buf1, MAX_STR_LEN - 1, fp) == NULL) {
+        goto out;
+    }
+
+    len = strlen(buf1) - 2;
+    strncpy(buf2, buf1 + 2, len);
+    buf2[len] = 0;
+    res       = atoi(buf2);
+
+out:
+    if (fclose(fp) != 0) {
+        return -1;
+    }
+    return res;
+}
+
+static ucc_status_t dev2if(char *dev_name, char *port, struct sockaddr_storage
+                           *rdma_src_addr)
+{
+    ucc_status_t status  = UCC_OK;
+    glob_t       glob_el = {0,};
+    char         dev_file [MAX_STR_LEN];
+    char         port_file[MAX_STR_LEN];
+    char         net_file [MAX_STR_LEN];
+    char         if_name  [MAX_STR_LEN];
+    char         glob_path[MAX_STR_LEN];
+    int          i;
+    char       **p;
+    int          len;
+
+    sprintf(glob_path, PREF"*");
+
+    sprintf(dev_file, "/sys/class/infiniband/%s"SUFF, dev_name);
+    if (glob(glob_path, 0, 0, &glob_el)) {
+        return UCC_ERR_NO_RESOURCE;
+    }
+    p = glob_el.gl_pathv;
+
+    if (glob_el.gl_pathc >= 1) {
+        for (i = 0; i < glob_el.gl_pathc; i++, p++) {
+            sprintf(port_file, "%s/dev_id", *p);
+            sprintf(net_file, "%s"SUFF, *p);
+            if (cmp_files(net_file, dev_file) && port != NULL &&
+                port_from_file(port_file) == atoi(port) - 1) {
+                len = strlen(net_file) - strlen(PREF) - strlen(SUFF);
+                strncpy(if_name, net_file + strlen(PREF), len);
+                if_name[len] = 0;
+
+                status = ucc_tl_mlx5_get_ipoib_ip(if_name, rdma_src_addr);
+                if (UCC_OK == status) {
+                    break;
+                }
+            }
+        }
+    }
+
+    globfree(&glob_el);
+    return status;
+}
+
+ucc_status_t ucc_tl_mlx5_probe_ip_over_ib(char* ib_dev, struct
+                                          sockaddr_storage *addr)
+{
+    char                   *ib_name = NULL;
+    char                   *port    = NULL;
+    char                   *ib      = NULL;
+    ucc_status_t            status;
+    struct sockaddr_storage rdma_src_addr;
+
+    if (ib_dev == NULL) {
+        
return UCC_ERR_NO_RESOURCE; + } + + ib = strdup(ib_dev); + if (!ib) { + return UCC_ERR_NO_MEMORY; + } + + ucc_string_split(ib, ":", 2, &ib_name, &port); + status = dev2if(ib_name, port, &rdma_src_addr); + + if (UCC_OK == status) { + *addr = rdma_src_addr; + } + ucc_free(ib); + + return status; +} + +ucc_status_t ucc_tl_mlx5_mcast_join_mcast_post(ucc_tl_mlx5_mcast_coll_context_t *ctx, + struct sockaddr_in6 *net_addr, + int is_root) +{ + char buf[40]; + const char *dst; + + dst = inet_ntop(AF_INET6, net_addr, buf, 40); + if (NULL == dst) { + tl_error(ctx->lib, "inet_ntop failed"); + return UCC_ERR_NO_RESOURCE; + } + + tl_debug(ctx->lib, "joining addr: %s is_root %d", buf, is_root); + + if (rdma_join_multicast(ctx->id, (struct sockaddr*)net_addr, NULL)) { + tl_error(ctx->lib, "rdma_join_multicast failed errno %d", errno); + return UCC_ERR_NO_RESOURCE; + } + + return UCC_OK; +} + +ucc_status_t ucc_tl_mlx5_mcast_join_mcast_test(ucc_tl_mlx5_mcast_coll_context_t *ctx, + struct rdma_cm_event **event, + int is_root) +{ + char buf[40]; + const char *dst; + + if (rdma_get_cm_event(ctx->channel, event) < 0) { + if (EINTR != errno) { + tl_error(ctx->lib, "rdma_get_cm_event failed, errno %d %s", + errno, strerror(errno)); + return UCC_ERR_NO_RESOURCE; + } else { + return UCC_INPROGRESS; + } + } + + if (RDMA_CM_EVENT_MULTICAST_JOIN != (*event)->event) { + tl_error(ctx->lib, "failed to join multicast, is_root %d. unexpected event was" + " received: event=%d, str=%s, status=%d", + is_root, (*event)->event, rdma_event_str((*event)->event), + (*event)->status); + if (rdma_ack_cm_event(*event) < 0) { + tl_error(ctx->lib, "rdma_ack_cm_event failed"); + } + return UCC_ERR_NO_RESOURCE; + } + + dst = inet_ntop(AF_INET6, (*event)->param.ud.ah_attr.grh.dgid.raw, buf, 40); + if (NULL == dst) { + tl_error(ctx->lib, "inet_ntop failed"); + return UCC_ERR_NO_RESOURCE; + } + + tl_debug(ctx->lib, "is_root %d: joined dgid: %s, mlid 0x%x, sl %d", is_root, buf, + (*event)->param.ud.ah_attr.dlid, (*event)->param.ud.ah_attr.sl); + + return UCC_OK; + +} + +ucc_status_t ucc_tl_mlx5_setup_mcast_group_join_post(ucc_tl_mlx5_mcast_coll_comm_t *comm) +{ + ucc_status_t status; + struct sockaddr_in6 net_addr = {0,}; + + if (comm->rank == 0) { + net_addr.sin6_family = AF_INET6; + net_addr.sin6_flowinfo = comm->comm_id; + + status = ucc_tl_mlx5_mcast_join_mcast_post(comm->ctx, &net_addr, 1); + if (status < 0) { + tl_error(comm->lib, "rank 0 is unable to join mcast group"); + return status; + } + } + + return UCC_OK; +} + +ucc_status_t ucc_tl_mlx5_mcast_init_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, + ucc_tl_mlx5_mcast_coll_comm_t *comm) +{ + struct ibv_qp_init_attr qp_init_attr = {0}; + + qp_init_attr.qp_type = IBV_QPT_UD; + qp_init_attr.send_cq = comm->scq; + qp_init_attr.recv_cq = comm->rcq; + qp_init_attr.sq_sig_all = 0; + qp_init_attr.cap.max_send_wr = comm->params.sx_depth; + qp_init_attr.cap.max_recv_wr = comm->params.rx_depth; + qp_init_attr.cap.max_inline_data = comm->params.sx_inline; + qp_init_attr.cap.max_send_sge = comm->params.sx_sge; + qp_init_attr.cap.max_recv_sge = comm->params.rx_sge; + + comm->mcast.qp = ibv_create_qp(ctx->pd, &qp_init_attr); + if (!comm->mcast.qp) { + tl_error(ctx->lib, "failed to create mcast qp, errno %d", errno); + return UCC_ERR_NO_RESOURCE; + } + + comm->max_inline = qp_init_attr.cap.max_inline_data; + + return UCC_OK; +} + +static ucc_status_t ucc_tl_mlx5_mcast_create_ah(ucc_tl_mlx5_mcast_coll_comm_t *comm) +{ + struct ibv_ah_attr ah_attr = { + .is_global = 1, + .grh = {.sgid_index = 0}, + .dlid 
= comm->mcast_lid,
+        .sl            = DEF_SL,
+        .src_path_bits = DEF_SRC_PATH_BITS,
+        .port_num      = comm->ctx->ib_port
+    };
+
+    memcpy(ah_attr.grh.dgid.raw, &comm->mgid, sizeof(ah_attr.grh.dgid.raw));
+
+    comm->mcast.ah = ibv_create_ah(comm->ctx->pd, &ah_attr);
+    if (!comm->mcast.ah) {
+        tl_error(comm->lib, "failed to create AH");
+        return UCC_ERR_NO_RESOURCE;
+    }
+    return UCC_OK;
+}
+
+ucc_status_t ucc_tl_mlx5_mcast_setup_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx,
+                                         ucc_tl_mlx5_mcast_coll_comm_t *comm)
+{
+    struct ibv_port_attr port_attr;
+    struct ibv_qp_attr   attr;
+    uint16_t             pkey;
+
+    ibv_query_port(ctx->ctx, ctx->ib_port, &port_attr);
+
+    for (ctx->pkey_index = 0; ctx->pkey_index < port_attr.pkey_tbl_len;
+         ++ctx->pkey_index) {
+        ibv_query_pkey(ctx->ctx, ctx->ib_port, ctx->pkey_index, &pkey);
+        if (pkey == DEF_PKEY)
+            break;
+    }
+
+    if (ctx->pkey_index >= port_attr.pkey_tbl_len) {
+        ctx->pkey_index = 0;
+        ibv_query_pkey(ctx->ctx, ctx->ib_port, ctx->pkey_index, &pkey);
+        if (!pkey) {
+            tl_error(ctx->lib, "cannot find valid PKEY");
+            return UCC_ERR_NO_RESOURCE;
+        }
+
+        tl_debug(ctx->lib, "cannot find default pkey 0x%04x on port %d, using "
+                 "index 0 pkey:0x%04x", DEF_PKEY, ctx->ib_port, pkey);
+    }
+
+    attr.qp_state   = IBV_QPS_INIT;
+    attr.pkey_index = ctx->pkey_index;
+    attr.port_num   = ctx->ib_port;
+    attr.qkey       = DEF_QKEY;
+
+    if (ibv_modify_qp(comm->mcast.qp, &attr,
+                      IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY)) {
+        tl_error(ctx->lib, "failed to move mcast qp to INIT, errno %d", errno);
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    if (ibv_attach_mcast(comm->mcast.qp, &comm->mgid, comm->mcast_lid)) {
+        tl_error(ctx->lib, "failed to attach QP to the mcast group, errno %d", errno);
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    /* Ok, now cycle to RTR on everyone */
+    attr.qp_state = IBV_QPS_RTR;
+    if (ibv_modify_qp(comm->mcast.qp, &attr, IBV_QP_STATE)) {
+        tl_error(ctx->lib, "failed to modify QP to RTR, errno %d", errno);
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    attr.qp_state = IBV_QPS_RTS;
+    attr.sq_psn   = DEF_PSN;
+    if (ibv_modify_qp(comm->mcast.qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) {
+        tl_error(ctx->lib, "failed to modify QP to RTS, errno %d", errno);
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    /* Create the address handle */
+    if (UCC_OK != ucc_tl_mlx5_mcast_create_ah(comm)) {
+        tl_error(ctx->lib, "failed to create address handle");
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    return UCC_OK;
+}
+
+ucc_status_t ucc_tl_mlx5_fini_mcast_group(ucc_tl_mlx5_mcast_coll_context_t *ctx,
+                                          ucc_tl_mlx5_mcast_coll_comm_t *comm)
+{
+    char        buf[40];
+    const char *dst;
+
+    dst = inet_ntop(AF_INET6, &comm->mcast_addr, buf, 40);
+    if (NULL == dst) {
+        tl_error(comm->lib, "inet_ntop failed");
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    tl_debug(ctx->lib, "mcast leave: ctx %p, comm %p, dgid: %s", ctx, comm, buf);
+
+    if (rdma_leave_multicast(ctx->id, (struct sockaddr*)&comm->mcast_addr)) {
+        tl_error(comm->lib, "mcast rdma_leave_multicast failed");
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    return UCC_OK;
+}
+
+ucc_status_t ucc_tl_mlx5_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm)
+{
+    int          ret;
+    ucc_status_t status;
+
+    tl_debug(comm->lib, "cleaning mcast comm: %p, id %d, mlid %x",
+             comm, comm->comm_id, comm->mcast_lid);
+
+    if (UCC_OK != (status = ucc_tl_mlx5_mcast_reliable(comm))) {
+        // TODO handle (UCC_INPROGRESS == ret)
+        tl_error(comm->lib, "couldn't clean mcast team: reliability progress status %d",
+                 status);
+        return status;
+    }
+
+    if (comm->mcast.qp) {
+        ret = ibv_detach_mcast(comm->mcast.qp, &comm->mgid, 
comm->mcast_lid); + if (ret) { + tl_error(comm->lib, "couldn't detach QP, ret %d, errno %d", ret, errno); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->mcast.qp) { + ret = ibv_destroy_qp(comm->mcast.qp); + if (ret) { + tl_error(comm->lib, "failed to destroy QP %d", ret); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->rcq) { + ret = ibv_destroy_cq(comm->rcq); + if (ret) { + tl_error(comm->lib, "couldn't destroy rcq"); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->scq) { + ret = ibv_destroy_cq(comm->scq); + if (ret) { + tl_error(comm->lib, "couldn't destroy scq"); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->grh_mr) { + ret = ibv_dereg_mr(comm->grh_mr); + if (ret) { + tl_error(comm->lib, "couldn't destroy grh mr"); + return UCC_ERR_NO_RESOURCE; + } + } + if (comm->grh_buf) { + ucc_free(comm->grh_buf); + } + + if (comm->pp) { + ucc_free(comm->pp); + } + + if (comm->pp_mr) { + ret = ibv_dereg_mr(comm->pp_mr); + if (ret) { + tl_error(comm->lib, "couldn't destroy pp mr"); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->pp_buf) { + ucc_free(comm->pp_buf); + } + + if (comm->call_rwr) { + ucc_free(comm->call_rwr); + } + + if (comm->call_rsgs) { + ucc_free(comm->call_rsgs); + } + + if (comm->mcast.ah) { + ret = ibv_destroy_ah(comm->mcast.ah); + if (ret) { + tl_error(comm->lib, "couldn't destroy ah"); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->mcast_lid) { + status = ucc_tl_mlx5_fini_mcast_group(comm->ctx, comm); + if (status) { + tl_error(comm->lib, "couldn't leave mcast group"); + return status; + } + } + + if (comm->ctx->params.print_nack_stats) { + tl_debug(comm->lib, "comm_id %d, comm_size %d, comm->psn %d, rank %d, " + "nacks counter %d, n_mcast_rel %d", + comm->comm_id, comm->commsize, comm->psn, comm->rank, + comm->nacks_counter, comm->n_mcast_reliable); + } + + if (comm->p2p_ctx != NULL) { + ucc_free(comm->p2p_ctx); + } + + ucc_free(comm); + + return UCC_OK; +} + +ucc_status_t ucc_tl_mlx5_clean_mcast_ctx(ucc_tl_mlx5_mcast_coll_context_t *ctx) +{ + tl_debug(ctx->lib, "cleaning mcast ctx: %p", ctx); + + if (ctx->rcache) { + ucc_rcache_destroy(ctx->rcache); + } + + if (ctx->pd) { + if (ibv_dealloc_pd(ctx->pd)) { + tl_error(ctx->lib, "ibv_dealloc_pd failed errno %d", errno); + return UCC_ERR_NO_RESOURCE; + } + } + + if (rdma_destroy_id(ctx->id)) { + tl_error(ctx->lib, "rdma_destroy_id failed errno %d", errno); + return UCC_ERR_NO_RESOURCE; + } + + rdma_destroy_event_channel(ctx->channel); + + if (!strcmp(ctx->params.ib_dev_name, "")) { + ucc_free(ctx->devname); + } + + ucc_free(ctx); + + return UCC_OK; +} diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h index 9ca529f7b9..05037e495f 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h @@ -352,7 +352,10 @@ static inline ucc_status_t ucc_tl_mlx5_mcast_reliable(ucc_tl_mlx5_mcast_coll_com return UCC_INPROGRESS; } -ucc_status_t ucc_tl_setup_mcast(ucc_tl_mlx5_mcast_coll_comm_t *comm); +ucc_status_t ucc_tl_mlx5_probe_ip_over_ib(char* ib_dev_list, + struct sockaddr_storage *addr); + +ucc_status_t ucc_tl_mlx5_setup_mcast(ucc_tl_mlx5_mcast_coll_comm_t *comm); ucc_status_t ucc_tl_mlx5_mcast_init_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, ucc_tl_mlx5_mcast_coll_comm_t *comm); @@ -360,6 +363,6 @@ ucc_status_t ucc_tl_mlx5_mcast_init_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, ucc_status_t ucc_tl_mlx5_mcast_setup_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, ucc_tl_mlx5_mcast_coll_comm_t 
*comm); -ucc_status_t ucc_tl_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm); +ucc_status_t ucc_tl_mlx5_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm); #endif /* TL_MLX5_MCAST_HELPER_H_ */ diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_rcache.c b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_rcache.c index c67a2d3179..75c62ac81f 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_rcache.c +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_rcache.c @@ -19,7 +19,7 @@ static ucs_status_t ucc_tl_mlx5_mcast_coll_reg_mr(ucc_tl_mlx5_mcast_coll_context tl_error(ctx->lib, "failed to register MR"); return UCS_ERR_NO_MEMORY; } - + return UCS_OK; } @@ -33,7 +33,7 @@ static ucc_status_t ucc_tl_mlx5_mcast_coll_dereg_mr(ucc_tl_mlx5_mcast_coll_conte } tl_debug(ctx->lib, "external memory deregister: mr %p", mr); - + if (ibv_dereg_mr(mr)) { tl_error(ctx->lib, "couldn't destroy mr %p", mr); return UCC_ERR_NO_RESOURCE; @@ -140,12 +140,10 @@ ucc_status_t ucc_tl_mlx5_mcast_setup_rcache(ucc_tl_mlx5_mcast_coll_context_t *ct { ucc_rcache_params_t rcache_params; - rcache_params.alignment = 64; rcache_params.ucm_event_priority = 1000; rcache_params.max_regions = ULONG_MAX; rcache_params.max_size = SIZE_MAX; rcache_params.region_struct_size = sizeof(ucc_tl_mlx5_mcast_rcache_region_t); - rcache_params.max_alignment = ucc_get_page_size(); rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED | UCM_EVENT_MEM_TYPE_FREE; rcache_params.context = ctx; diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c index 31044fe8b3..f56bc3c1a1 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c @@ -8,6 +8,7 @@ #include "tl_mlx5.h" #include "tl_mlx5_mcast_coll.h" #include "coll_score/ucc_coll_score.h" +#include "tl_mlx5_mcast_helper.h" ucc_status_t ucc_tl_mlx5_mcast_team_init(ucc_base_context_t *base_context, /* NOLINT */ ucc_tl_mlx5_mcast_team_t **mcast_team, /* NOLINT */ @@ -18,3 +19,112 @@ ucc_status_t ucc_tl_mlx5_mcast_team_init(ucc_base_context_t *base_cont return UCC_OK; } +ucc_status_t ucc_tl_mlx5_mcast_coll_setup_comm_resources(ucc_tl_mlx5_mcast_coll_comm_t *comm) +{ + ucc_status_t status; + size_t page_size; + int buf_size, i, ret; + + status = ucc_tl_mlx5_mcast_init_qps(comm->ctx, comm); + if (UCC_OK != status) { + goto error; + } + + status = ucc_tl_mlx5_mcast_setup_qps(comm->ctx, comm); + if (UCC_OK != status) { + goto error; + } + + page_size = ucc_get_page_size(); + buf_size = comm->ctx->mtu; + + // Comm receiving buffers. 
+ ret = posix_memalign((void**)&comm->call_rwr, page_size, sizeof(struct ibv_recv_wr) * + comm->params.rx_depth); + if (ret) { + tl_error(comm->ctx->lib, "posix_memalign failed"); + return UCC_ERR_NO_MEMORY; + } + + ret = posix_memalign((void**)&comm->call_rsgs, page_size, sizeof(struct ibv_sge) * + comm->params.rx_depth * 2); + if (ret) { + tl_error(comm->ctx->lib, "posix_memalign failed"); + return UCC_ERR_NO_MEMORY; + } + + comm->pending_recv = 0; + comm->buf_n = comm->params.rx_depth * 2; + + ret = posix_memalign((void**) &comm->pp_buf, page_size, buf_size * comm->buf_n); + if (ret) { + tl_error(comm->ctx->lib, "posix_memalign failed"); + return UCC_ERR_NO_MEMORY; + } + + memset(comm->pp_buf, 0, buf_size * comm->buf_n); + + comm->pp_mr = ibv_reg_mr(comm->ctx->pd, comm->pp_buf, buf_size * comm->buf_n, + IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); + if (!comm->pp_mr) { + tl_error(comm->ctx->lib, "could not register pp_buf mr, errno %d", errno); + status = UCC_ERR_NO_MEMORY; + goto error; + } + + ret = posix_memalign((void**) &comm->pp, page_size, sizeof(struct + pp_packet) * comm->buf_n); + if (ret) { + tl_error(comm->ctx->lib, "posix_memalign failed"); + return UCC_ERR_NO_MEMORY; + } + + for (i = 0; i < comm->buf_n; i++) { + ucc_list_head_init(&comm->pp[i].super); + + comm->pp[i].buf = (uintptr_t) comm->pp_buf + i * buf_size; + comm->pp[i].context = 0; + + ucc_list_add_tail(&comm->bpool, &comm->pp[i].super); + } + + comm->mcast.swr.wr.ud.ah = comm->mcast.ah; + comm->mcast.swr.num_sge = 1; + comm->mcast.swr.sg_list = &comm->mcast.ssg; + comm->mcast.swr.opcode = IBV_WR_SEND_WITH_IMM; + comm->mcast.swr.wr.ud.remote_qpn = MULTICAST_QPN; + comm->mcast.swr.wr.ud.remote_qkey = DEF_QKEY; + comm->mcast.swr.next = NULL; + + for (i = 0; i < comm->params.rx_depth; i++) { + comm->call_rwr[i].sg_list = &comm->call_rsgs[2 * i]; + comm->call_rwr[i].num_sge = 2; + comm->call_rwr[i].wr_id = MCAST_BCASTRECV_WR; + comm->call_rsgs[2 * i].length = GRH_LENGTH; + comm->call_rsgs[2 * i].addr = (uintptr_t)comm->grh_buf; + comm->call_rsgs[2 * i].lkey = comm->grh_mr->lkey; + comm->call_rsgs[2 * i + 1].lkey = comm->pp_mr->lkey; + comm->call_rsgs[2 * i + 1].length = comm->max_per_packet; + } + + status = ucc_tl_mlx5_mcast_post_recv_buffers(comm); + if (UCC_OK != status) { + goto error; + } + + memset(comm->parents, 0, sizeof(comm->parents)); + memset(comm->children, 0, sizeof(comm->children)); + + comm->nacks_counter = 0; + comm->tx = 0; + comm->n_prep_reliable = 0; + comm->n_mcast_reliable = 0; + comm->reliable_in_progress = 0; + comm->recv_drop_packet_in_progress = 0; + + return status; + +error: + ucc_tl_mlx5_clean_mcast_comm(comm); + return status; +} diff --git a/src/components/tl/mlx5/tl_mlx5.c b/src/components/tl/mlx5/tl_mlx5.c index bab4808ece..0210f2302c 100644 --- a/src/components/tl/mlx5/tl_mlx5.c +++ b/src/components/tl/mlx5/tl_mlx5.c @@ -67,6 +67,27 @@ static ucc_config_field_t ucc_tl_mlx5_lib_config_table[] = { ucc_offsetof(ucc_tl_mlx5_lib_config_t, qp_conf.qp_max_atomic), UCC_CONFIG_TYPE_UINT}, + {"MCAST_SX_DEPTH", "512", "Send context depth of the Mcast comm", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.sx_depth), + UCC_CONFIG_TYPE_INT}, + + {"MCAST_SX_INLINE", "128", "Minimal size for inline data send in Mcast", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.sx_inline), + UCC_CONFIG_TYPE_MEMUNITS}, + + {"MCAST_RX_DEPTH", "4096", "Recv context depth of the Mcast comm", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.rx_depth), + UCC_CONFIG_TYPE_INT}, + + 
{"MCAST_POST_RECV_THRESH", "64", + "Threshold for posting recv into rx ctx of the Mcast comm", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.post_recv_thresh), + UCC_CONFIG_TYPE_INT}, + + {"MCAST_WINDOW_SIZE", "64", "Reliability Mcast window size", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.wsize), + UCC_CONFIG_TYPE_INT}, + {NULL}}; static ucc_config_field_t ucc_tl_mlx5_context_config_table[] = { @@ -77,6 +98,14 @@ static ucc_config_field_t ucc_tl_mlx5_context_config_table[] = { ucc_offsetof(ucc_tl_mlx5_context_config_t, devices), UCC_CONFIG_TYPE_STRING_ARRAY}, + {"MCAST_TIMEOUT", "10000", "Timeout [usec] for the reliability NACK in Mcast", + ucc_offsetof(ucc_tl_mlx5_context_config_t, mcast_ctx_conf.timeout), + UCC_CONFIG_TYPE_INT}, + + {"MCAST_NET_DEVICE", "", "Specifies which network device to use for Mcast", + ucc_offsetof(ucc_tl_mlx5_context_config_t, mcast_ctx_conf.ib_dev_name), + UCC_CONFIG_TYPE_STRING}, + {NULL}}; UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_mlx5_lib_t, ucc_base_lib_t, diff --git a/src/components/tl/mlx5/tl_mlx5_context.c b/src/components/tl/mlx5/tl_mlx5_context.c index 0c56ff9390..5ac7b59f7d 100644 --- a/src/components/tl/mlx5/tl_mlx5_context.c +++ b/src/components/tl/mlx5/tl_mlx5_context.c @@ -14,6 +14,7 @@ #include "tl_mlx5_ib.h" #define PD_OWNER_RANK 0 +#define TL_MLX5_IB_PORT_INVALID -1 UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t, const ucc_base_context_params_t *params, @@ -210,7 +211,8 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context) if (!ctx->is_imported) { status = ucc_tl_mlx5_ib_ctx_pd_init(ctx); if (UCC_OK != status) { - goto err_ib_ctx_pd_init; + ctx->ib_port = TL_MLX5_IB_PORT_INVALID; + goto start_bcast; } if (UCC_SBGP_NOT_EXISTS == sbgp->status) { goto topo_ppn_1; @@ -228,21 +230,20 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context) tl_debug(context->lib, "failed to create tmp file for socket path"); sock_path[0] = '\0'; } - sbcast_data->ib_port = ctx->ib_port; memcpy(sbcast_data->sock_path, sock_path, sizeof(sock_path)); } +start_bcast: + sbcast_data->ib_port = ctx->ib_port; steam = core_ctx->service_team; - s.map = sbgp->map; s.myrank = sbgp->group_rank; - status = UCC_TL_TEAM_IFACE(steam)->scoll.bcast( + status = UCC_TL_TEAM_IFACE(steam)->scoll.bcast( &steam->super, sbcast_data, sbcast_data_length, PD_OWNER_RANK, s, &req); if (UCC_OK != status) { tl_debug(context->lib, "failed to start mlx5 ctx bcast"); goto err; } - while (UCC_INPROGRESS == (status = ucc_collective_test(&req->super))) { ucc_context_progress(core_ctx); } @@ -256,9 +257,15 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context) ctx->ib_port = sbcast_data->ib_port; memcpy(sock_path, sbcast_data->sock_path, sizeof(sock_path)); + if (ctx->ib_port == TL_MLX5_IB_PORT_INVALID) { + tl_debug(context->lib, "invalid ib port received"); + status = UCC_ERR_NO_RESOURCE; + goto err_ib_ctx_pd_init; + } + if (strlen(sock_path) == 0) { tl_debug(context->lib, "failed to share ctx and pd"); - status = UCC_ERR_NO_MESSAGE; + status = UCC_ERR_NO_RESOURCE; goto err; } status = ucc_tl_mlx5_share_ctx_pd(ctx, sock_path, sbgp->group_size, diff --git a/src/components/tl/mlx5/tl_mlx5_pd.c b/src/components/tl/mlx5/tl_mlx5_pd.c index a553dbc5f5..bf98352883 100644 --- a/src/components/tl/mlx5/tl_mlx5_pd.c +++ b/src/components/tl/mlx5/tl_mlx5_pd.c @@ -263,7 +263,8 @@ ucc_status_t ucc_tl_mlx5_share_ctx_pd(ucc_tl_mlx5_context_t *ctx, } static void ucc_tl_mlx5_context_barrier(ucc_context_oob_coll_t *oob, - ucc_base_lib_t *lib) + 
ucc_context_t *core_ctx,
+                                         ucc_base_lib_t *lib)
 {
     char *rbuf;
     char  sbuf;
@@ -284,6 +285,7 @@ static void ucc_tl_mlx5_context_barrier(ucc_context_oob_coll_t *oob,
         oob->allgather(&sbuf, rbuf, sizeof(char), oob->coll_info, &req)) {
         ucc_assert(req != NULL);
         while (UCC_OK != (status = oob->req_test(req))) {
+            ucc_context_progress(core_ctx);
             if (status < 0) {
                 tl_debug(lib, "failed to test oob req");
                 break;
@@ -303,7 +305,8 @@ ucc_status_t ucc_tl_mlx5_remove_shared_ctx_pd(ucc_tl_mlx5_context_t *ctx)
     if (ctx->shared_pd && ctx->is_imported) {
         ibv_unimport_pd(ctx->shared_pd);
     }
-    ucc_tl_mlx5_context_barrier(&UCC_TL_CTX_OOB(ctx), lib);
+    ucc_tl_mlx5_context_barrier(&UCC_TL_CTX_OOB(ctx),
+                                ctx->super.super.ucc_context, lib);
     if (ctx->shared_pd && !ctx->is_imported) {
         err = ibv_dealloc_pd(ctx->shared_pd);
         if (err) {
diff --git a/src/components/tl/mlx5/tl_mlx5_rcache.c b/src/components/tl/mlx5/tl_mlx5_rcache.c
index 1414c82d15..d6f2aa47d8 100644
--- a/src/components/tl/mlx5/tl_mlx5_rcache.c
+++ b/src/components/tl/mlx5/tl_mlx5_rcache.c
@@ -63,8 +63,6 @@ ucc_status_t tl_mlx5_rcache_create(ucc_tl_mlx5_context_t *ctx)
     ucc_rcache_params_t rcache_params;
 
     rcache_params.region_struct_size = sizeof(ucc_tl_mlx5_rcache_region_t);
-    rcache_params.alignment          = UCS_PGT_ADDR_ALIGN;
-    rcache_params.max_alignment      = ucc_get_page_size();
     rcache_params.ucm_event_priority = 1000;
     rcache_params.context            = (void *)ctx;
     rcache_params.ops                = &ucc_rcache_ops;
diff --git a/src/components/tl/mlx5/tl_mlx5_team.c b/src/components/tl/mlx5/tl_mlx5_team.c
index 712691078f..b326166674 100644
--- a/src/components/tl/mlx5/tl_mlx5_team.c
+++ b/src/components/tl/mlx5/tl_mlx5_team.c
@@ -66,7 +66,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_team_t, ucc_base_context_t *tl_context,
     }
 
     self->a2a = NULL;
-    status    = ucc_tl_mlx5_team_init_alltoall(self);
+    status = ucc_tl_mlx5_team_init_alltoall(self);
     if (UCC_OK != status) {
         return status;
     }
@@ -105,9 +105,8 @@ ucc_status_t ucc_tl_mlx5_team_create_test(ucc_base_team_t *team)
 {
     ucc_tl_mlx5_team_t *tl_team   = ucc_derived_of(team, ucc_tl_mlx5_team_t);
     ucc_team_t         *core_team = UCC_TL_CORE_TEAM(tl_team);
-    ucc_subset_t        subset    = {.map.type   = UCC_EP_MAP_FULL,
-                                     .map.ep_num = core_team->size,
-                                     .myrank     = core_team->rank};
+    ucc_subset_t        subset    = {.map    = UCC_TL_TEAM_MAP(tl_team),
+                                     .myrank = UCC_TL_TEAM_RANK(tl_team)};
     ucc_status_t        status    = UCC_OK;
 
     switch (tl_team->state) {
diff --git a/src/components/tl/nccl/tl_nccl.c b/src/components/tl/nccl/tl_nccl.c
index 8e71cdc1e2..46fdcff8e3 100644
--- a/src/components/tl/nccl/tl_nccl.c
+++ b/src/components/tl/nccl/tl_nccl.c
@@ -39,12 +39,17 @@ static ucs_config_field_t ucc_tl_nccl_context_config_table[] = {
      UCS_CONFIG_TYPE_ENUM(ucc_tl_nccl_completion_sync_names)
     },
 
-    {"BLOCKING", "1",
-     "If set to 0 will use non-blocking mode communicator behavior, "
-     "if set to 1 will use blocking mode",
+    {"BLOCKING", "yes",
+     "If set to 'no', the communicator will use non-blocking behavior; "
+     "if set to 'yes', it will use blocking mode",
      ucs_offsetof(ucc_tl_nccl_context_config_t, nccl_cfg_blocking),
      UCS_CONFIG_TYPE_BOOL},
 
+    {"LAZY_INIT", "yes",
+     "Initialize NCCL communicator on first collective",
+     ucc_offsetof(ucc_tl_nccl_context_config_t, nccl_lazy_init),
+     UCC_CONFIG_TYPE_BOOL},
+
     {NULL}};
 
 UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_nccl_lib_t, ucc_base_lib_t,
diff --git a/src/components/tl/nccl/tl_nccl.h b/src/components/tl/nccl/tl_nccl.h
index 06f32c0371..b922601812 100644
--- a/src/components/tl/nccl/tl_nccl.h
+++ b/src/components/tl/nccl/tl_nccl.h
@@ -45,6 +45,15 @@
 #define NCCL_VERSION_COMM_INIT_NB 
NCCL_VERSION(2,14,3) #define NCCL_USE_NON_BLOCKING NCCL_VERSION_CODE >= NCCL_VERSION_COMM_INIT_NB +enum { + TL_NCCL_COMM_STATE_ERROR, + TL_NCCL_COMM_STATE_OOB, + TL_NCCL_COMM_STATE_INIT_TEAM, + TL_NCCL_COMM_STATE_INIT_COMM, + TL_NCCL_COMM_STATE_DESTROY_COMM, + TL_NCCL_COMM_STATE_READY, +}; + typedef struct ucc_tl_nccl_iface { ucc_tl_iface_t super; } ucc_tl_nccl_iface_t; @@ -66,6 +75,7 @@ typedef struct ucc_tl_nccl_context_config { ucc_tl_context_config_t super; ucc_tl_nccl_completion_sync_type_t sync_type; int nccl_cfg_blocking; + int nccl_lazy_init; } ucc_tl_nccl_context_config_t; typedef struct ucc_tl_nccl_lib { @@ -85,7 +95,7 @@ UCC_CLASS_DECLARE(ucc_tl_nccl_context_t, const ucc_base_context_params_t *, typedef struct ucc_tl_nccl_team { ucc_tl_team_t super; - ucc_status_t comm_state; + int comm_state; ncclUniqueId *unique_id; void *oob_req; ncclComm_t nccl_comm; @@ -146,6 +156,8 @@ static inline ucc_status_t ucc_tl_nccl_check_nb(ncclResult_t *nccl_status, // NO return UCC_OK; } +ucc_status_t ucc_tl_nccl_comm_init(ucc_tl_nccl_team_t *team); + #define NCCLCHECK_GOTO(_cmd, _label, _st, _lib, _task_st, _comm, _check_nb) \ do { \ ncclResult_t e = _cmd; \ diff --git a/src/components/tl/nccl/tl_nccl_coll.c b/src/components/tl/nccl/tl_nccl_coll.c index 8a225c268b..ee3d523b0b 100644 --- a/src/components/tl/nccl/tl_nccl_coll.c +++ b/src/components/tl/nccl/tl_nccl_coll.c @@ -131,6 +131,7 @@ ucc_status_t ucc_tl_nccl_init_task(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_tl_nccl_task_t **coll_task) { + ucc_tl_nccl_team_t *nccl_team = ucc_derived_of(team, ucc_tl_nccl_team_t); ucc_tl_nccl_context_t *nccl_ctx = ucc_derived_of(team->context, ucc_tl_nccl_context_t); ucc_tl_nccl_task_t *task; @@ -143,6 +144,13 @@ ucc_status_t ucc_tl_nccl_init_task(ucc_base_coll_args_t *coll_args, return UCC_ERR_NOT_SUPPORTED; } + if (ucc_unlikely(nccl_team->comm_state != TL_NCCL_COMM_STATE_READY)) { + status = ucc_tl_nccl_comm_init(nccl_team); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + } + task = ucc_mpool_get(&nccl_ctx->req_mp); if (ucc_unlikely(!task)) { tl_error(team->context->lib, "failed to get task from mpool"); @@ -206,7 +214,7 @@ ucc_status_t ucc_tl_nccl_coll_finalize(ucc_coll_task_t *coll_task) ucc_status_t status = UCC_OK; if (ucc_unlikely(task->super.super.status != UCC_OK)) { - team->comm_state = task->super.super.status; + team->comm_state = TL_NCCL_COMM_STATE_ERROR; } tl_debug(UCC_TASK_LIB(task), "finalizing coll task %p", task); ucc_tl_nccl_free_task(task); diff --git a/src/components/tl/nccl/tl_nccl_team.c b/src/components/tl/nccl/tl_nccl_team.c index af2aff2ac6..bf8caf7e53 100644 --- a/src/components/tl/nccl/tl_nccl_team.c +++ b/src/components/tl/nccl/tl_nccl_team.c @@ -15,14 +15,17 @@ UCC_CLASS_INIT_FUNC(ucc_tl_nccl_team_t, ucc_base_context_t *tl_context, const ucc_base_team_params_t *params) { - ucc_tl_nccl_context_t *ctx = - ucc_derived_of(tl_context, ucc_tl_nccl_context_t); + ucc_tl_nccl_context_t *ctx = ucc_derived_of(tl_context, + ucc_tl_nccl_context_t); + ucc_team_oob_coll_t *oob; ucc_status_t status; ucc_rank_t size; - UCC_CLASS_CALL_SUPER_INIT(ucc_tl_team_t, &ctx->super, params); + UCC_CLASS_CALL_SUPER_INIT(ucc_tl_team_t, &ctx->super, params); + oob = &(UCC_TL_TEAM_OOB(self)); size = UCC_TL_TEAM_SIZE(self); - self->comm_state = UCC_OK; + self->stream = NULL; + self->nccl_comm = NULL; self->unique_id = ucc_malloc(sizeof(ncclUniqueId) * (size + 1), "tl_nccl_unique_id"); if (!self->unique_id) { @@ -31,6 +34,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_nccl_team_t, 
ucc_base_context_t *tl_context, sizeof(ncclUniqueId) * (size + 1)); return UCC_ERR_NO_MEMORY; } + if (UCC_TL_TEAM_RANK(self) == 0) { ncclResult_t st; st = ncclGetUniqueId(&self->unique_id[size]); @@ -39,14 +43,16 @@ UCC_CLASS_INIT_FUNC(ucc_tl_nccl_team_t, ucc_base_context_t *tl_context, memset(&self->unique_id[size], 0, sizeof(ncclUniqueId)); } } - status = UCC_TL_TEAM_OOB(self).allgather( - &self->unique_id[size], self->unique_id, - sizeof(ncclUniqueId), UCC_TL_TEAM_OOB(self).coll_info, - &self->oob_req); + + status = oob->allgather(&self->unique_id[size], + self->unique_id, sizeof(ncclUniqueId), + oob->coll_info, &self->oob_req); if (status != UCC_OK) { tl_error(ctx->super.super.lib, "failed to start oob allgather"); goto free_unique_id; } + self->comm_state = TL_NCCL_COMM_STATE_OOB; + return UCC_OK; free_unique_id: @@ -69,15 +75,17 @@ ucc_status_t ucc_tl_nccl_team_destroy(ucc_base_team_t *tl_team) #if NCCL_USE_NON_BLOCKING ncclResult_t nccl_status, st; - if (team->nccl_comm && team->comm_state == UCC_INPROGRESS) { + if (team->comm_state == TL_NCCL_COMM_STATE_DESTROY_COMM) { goto check_finalize; } #endif + if (team->stream) { + cudaStreamDestroy(team->stream); + team->stream = NULL; + } if (team->nccl_comm) { - if (team->comm_state != UCC_OK && team->comm_state != UCC_INPROGRESS) { - /* if communication error was detected ncclCommAbort should be used - since ncclCommDestroy could block */ + if (team->comm_state == TL_NCCL_COMM_STATE_ERROR) { ncclCommAbort(team->nccl_comm); } else { #if NCCL_USE_NON_BLOCKING @@ -91,7 +99,7 @@ ucc_status_t ucc_tl_nccl_team_destroy(ucc_base_team_t *tl_team) ncclCommAbort(team->nccl_comm); return UCC_ERR_NO_MESSAGE; } else if (nccl_status == ncclInProgress) { - team->comm_state = UCC_INPROGRESS; + team->comm_state = TL_NCCL_COMM_STATE_DESTROY_COMM; return UCC_INPROGRESS; } else { ncclCommDestroy(team->nccl_comm); @@ -101,95 +109,125 @@ ucc_status_t ucc_tl_nccl_team_destroy(ucc_base_team_t *tl_team) ncclCommDestroy(team->nccl_comm); #endif } - cudaStreamDestroy(team->stream); } UCC_CLASS_DELETE_FUNC_NAME(ucc_tl_nccl_team_t)(tl_team); return UCC_OK; } -ucc_status_t ucc_tl_nccl_team_create_test(ucc_base_team_t *tl_team) +ucc_status_t ucc_tl_nccl_comm_init(ucc_tl_nccl_team_t *team) { - ucc_tl_nccl_team_t *team = ucc_derived_of(tl_team, ucc_tl_nccl_team_t); + ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_status_t status; ncclResult_t nccl_status; - ncclUniqueId errorid; - #if NCCL_USE_NON_BLOCKING ncclConfig_t nccl_cfg = NCCL_CONFIG_INITIALIZER; - ncclResult_t st; - - if (team->comm_state == UCC_INPROGRESS) { - goto ncclInitStage; - } + ncclResult_t async_status; #endif - status = UCC_TL_TEAM_OOB(team).req_test(team->oob_req); - if (status == UCC_INPROGRESS) { - return UCC_INPROGRESS; - } - if (status != UCC_OK) { - UCC_TL_TEAM_OOB(team).req_free(team->oob_req); - tl_error(tl_team->context->lib, "oob req test failed"); - goto free_unique_id; - } - status = UCC_TL_TEAM_OOB(team).req_free(team->oob_req); - if (status != UCC_OK) { - tl_error(tl_team->context->lib, "oob req free failed"); - goto free_unique_id; - } - /* check unique id is valid */ - memset(&errorid, 0, sizeof(errorid)); - if (!memcmp(&errorid, team->unique_id, sizeof(errorid))) { - tl_error(tl_team->context->lib, "incorrect unique id"); - goto free_unique_id; + if (team->comm_state == TL_NCCL_COMM_STATE_READY) { + return UCC_OK; + } else if (team->comm_state == TL_NCCL_COMM_STATE_ERROR) { + return UCC_ERR_NOT_SUPPORTED; + } else if (team->comm_state == 
TL_NCCL_COMM_STATE_INIT_COMM) { +#if NCCL_USE_NON_BLOCKING + goto nccl_async_init; +#else + ucc_assert_always(0); +#endif } CUDA_CHECK_GOTO(cudaStreamCreateWithFlags(&team->stream, - cudaStreamNonBlocking), free_unique_id, status); + cudaStreamNonBlocking), + exit_err, status); #if NCCL_USE_NON_BLOCKING - nccl_cfg.blocking = UCC_TL_NCCL_TEAM_CTX(team)->cfg.nccl_cfg_blocking; - nccl_status = ncclCommInitRankConfig(&team->nccl_comm, - UCC_TL_TEAM_SIZE(team), - team->unique_id[0], - UCC_TL_TEAM_RANK(team), - &nccl_cfg); - if (nccl_status != ncclInProgress && nccl_status != ncclSuccess) { - goto free_stream; + /* + * if the NCCL communicator is initialized during the first collective init, + * a.k.a. lazy init, we need to use blocking init so that a failure can + * correctly fall back to another TL + */ + nccl_cfg.blocking = (UCC_TL_NCCL_TEAM_CTX(team)->cfg.nccl_cfg_blocking || + UCC_TL_NCCL_TEAM_CTX(team)->cfg.nccl_lazy_init) ? 1 : 0; + + nccl_status = ncclCommInitRankConfig(&team->nccl_comm, tsize, + team->unique_id[0], trank, &nccl_cfg); + if ((nccl_status != ncclInProgress) && (nccl_status != ncclSuccess)) { + goto nccl_comm_init_err; } -ncclInitStage: - st = ncclCommGetAsyncError(team->nccl_comm, &nccl_status); - if (st != ncclSuccess) { - nccl_status = st; +nccl_async_init: + nccl_status = ncclCommGetAsyncError(team->nccl_comm, &async_status); + if (nccl_status != ncclSuccess) { + goto nccl_comm_init_err; } - if (nccl_status == ncclInProgress){ - team->comm_state = UCC_INPROGRESS; - return UCC_INPROGRESS; + if (async_status == ncclInProgress) { + team->comm_state = TL_NCCL_COMM_STATE_INIT_COMM; + return UCC_INPROGRESS; } #else - nccl_status = ncclCommInitRank(&team->nccl_comm, UCC_TL_TEAM_SIZE(team), - team->unique_id[0], UCC_TL_TEAM_RANK(team)); -#endif + nccl_status = ncclCommInitRank(&team->nccl_comm, tsize, team->unique_id[0], + trank); if (nccl_status != ncclSuccess) { - goto free_stream; + goto nccl_comm_init_err; } - ucc_free(team->unique_id); - tl_debug(tl_team->context->lib, "initialized tl team: %p", team); +#endif + + team->comm_state = TL_NCCL_COMM_STATE_READY; return UCC_OK; -free_stream: - tl_debug(tl_team->context->lib, "NCCL error %d %s", nccl_status, - ncclGetErrorString(nccl_status)); - status = UCC_ERR_NO_MESSAGE; -#if NCCL_USE_NON_BLOCKING - ncclCommAbort(team->nccl_comm); -#endif - cudaStreamDestroy(team->stream); -free_unique_id: - ucc_free(team->unique_id); +nccl_comm_init_err: + tl_debug(team->super.super.context->lib, "NCCL error %d %s", + nccl_status, ncclGetErrorString(nccl_status)); + if (nccl_status == ncclInvalidUsage) { + /* + * handles the case when trying to initialize multiple ranks + * on the same GPU.
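In practice this is hit when more ranks than GPUs land on a node and + * several processes select the same CUDA device; NCCL is assumed to + * reject such an init with ncclInvalidUsage.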
Return "not supported" and fall back to another TL + */ + status = UCC_ERR_NOT_SUPPORTED; + } else { + status = UCC_ERR_NO_RESOURCE; + } + team->comm_state = TL_NCCL_COMM_STATE_ERROR; + +exit_err: return status; } +ucc_status_t ucc_tl_nccl_team_create_test(ucc_base_team_t *tl_team) +{ + ucc_tl_nccl_team_t *team = ucc_derived_of(tl_team, ucc_tl_nccl_team_t); + ucc_team_oob_coll_t *oob = &(UCC_TL_TEAM_OOB(team)); + ncclUniqueId errorid; + ucc_status_t status; + + if (team->comm_state == TL_NCCL_COMM_STATE_OOB) { + status = oob->req_test(team->oob_req); + if (status == UCC_INPROGRESS) { + return UCC_INPROGRESS; + } + + oob->req_free(team->oob_req); + if (status != UCC_OK) { + tl_error(tl_team->context->lib, "oob req test failed"); + return status; + } + + /* check unique id is valid */ + memset(&errorid, 0, sizeof(errorid)); + if (!memcmp(&errorid, team->unique_id, sizeof(errorid))) { + tl_error(tl_team->context->lib, "incorrect unique id"); + return UCC_ERR_NO_MESSAGE; + } + + team->comm_state = TL_NCCL_COMM_STATE_INIT_TEAM; + } + + if (UCC_TL_NCCL_TEAM_CTX(team)->cfg.nccl_lazy_init) { + return UCC_OK; + } + + return ucc_tl_nccl_comm_init(team); +} + ucc_status_t ucc_tl_nccl_coll_init(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task_h) diff --git a/src/components/tl/sharp/tl_sharp.h b/src/components/tl/sharp/tl_sharp.h index cc44e9e1f4..adfbc86036 100644 --- a/src/components/tl/sharp/tl_sharp.h +++ b/src/components/tl/sharp/tl_sharp.h @@ -108,6 +108,10 @@ typedef struct ucc_tl_sharp_task { ucc_tl_sharp_reg_t *s_mem_h; ucc_tl_sharp_reg_t *r_mem_h; } allreduce; + struct { + ucc_tl_sharp_reg_t *s_mem_h; + ucc_tl_sharp_reg_t *r_mem_h; + } reduce_scatter; struct { ucc_tl_sharp_reg_t *mem_h; } bcast; @@ -131,9 +135,16 @@ ucc_status_t sharp_status_to_ucc_status(int status); (ucc_derived_of((_task)->super.team->context->lib, ucc_tl_sharp_lib_t)) #define TASK_ARGS(_task) (_task)->super.bargs.args -#define UCC_TL_SHARP_SUPPORTED_COLLS \ +#define UCC_TL_BASIC_SHARP_SUPPORTED_COLLS \ (UCC_COLL_TYPE_ALLREDUCE | UCC_COLL_TYPE_BARRIER | UCC_COLL_TYPE_BCAST) +#if HAVE_DECL_SHARP_COLL_DO_REDUCE_SCATTER +#define UCC_TL_SHARP_SUPPORTED_COLLS \ + (UCC_TL_BASIC_SHARP_SUPPORTED_COLLS | UCC_COLL_TYPE_REDUCE_SCATTER) +#else +#define UCC_TL_SHARP_SUPPORTED_COLLS (UCC_TL_BASIC_SHARP_SUPPORTED_COLLS) +#endif + UCC_CLASS_DECLARE(ucc_tl_sharp_team_t, ucc_base_context_t *, const ucc_base_team_params_t *); diff --git a/src/components/tl/sharp/tl_sharp_coll.c b/src/components/tl/sharp/tl_sharp_coll.c index d246fcc563..5884e18918 100644 --- a/src/components/tl/sharp/tl_sharp_coll.c +++ b/src/components/tl/sharp/tl_sharp_coll.c @@ -26,9 +26,9 @@ enum sharp_datatype ucc_to_sharp_dtype[] = { [UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT64)] = SHARP_DTYPE_DOUBLE, [UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT128)] = SHARP_DTYPE_NULL, #if SHARP_API > SHARP_VERSION(3, 0) - [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = SHARP_DTYPE_UNKNOWN, - [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = SHARP_DTYPE_UNKNOWN, - [UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_UNKNOWN, + [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN, + [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN, + [UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN, #else [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = SHARP_DTYPE_NULL, [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = SHARP_DTYPE_NULL, @@ -308,6 +308,100 @@ ucc_status_t ucc_tl_sharp_bcast_start(ucc_coll_task_t *coll_task) return
ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); } +#if HAVE_DECL_SHARP_COLL_DO_REDUCE_SCATTER +ucc_status_t ucc_tl_sharp_reduce_scatter_start(ucc_coll_task_t *coll_task) +{ + ucc_tl_sharp_task_t *task = ucc_derived_of(coll_task, ucc_tl_sharp_task_t); + ucc_tl_sharp_team_t *team = TASK_TEAM(task); + ucc_coll_args_t *args = &TASK_ARGS(task); + size_t count = args->dst.info.count; + ucc_datatype_t dt = args->dst.info.datatype; + struct sharp_coll_reduce_spec reduce_spec; + enum sharp_datatype sharp_type; + enum sharp_reduce_op op_type; + size_t src_data_size, dst_data_size; + int ret; + + UCC_TL_SHARP_PROFILE_REQUEST_EVENT(coll_task, "sharp_reduce_scatter_start", + 0); + + sharp_type = ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(dt)]; + op_type = ucc_to_sharp_reduce_op[args->op]; + src_data_size = ucc_dt_size(dt) * count * UCC_TL_TEAM_SIZE(team); + dst_data_size = ucc_dt_size(dt) * count; + + if (!UCC_IS_INPLACE(*args)) { + ucc_tl_sharp_mem_register(TASK_CTX(task), team, args->src.info.buffer, + src_data_size, &task->reduce_scatter.s_mem_h); + } + ucc_tl_sharp_mem_register(TASK_CTX(task), team, args->dst.info.buffer, + dst_data_size, &task->reduce_scatter.r_mem_h); + + if (!UCC_IS_INPLACE(*args)) { + reduce_spec.sbuf_desc.buffer.ptr = args->src.info.buffer; + reduce_spec.sbuf_desc.buffer.mem_handle = + task->reduce_scatter.s_mem_h->mr; + reduce_spec.sbuf_desc.mem_type = + ucc_to_sharp_memtype[args->src.info.mem_type]; + } else { + reduce_spec.sbuf_desc.buffer.ptr = args->dst.info.buffer; + reduce_spec.sbuf_desc.buffer.mem_handle = + task->reduce_scatter.r_mem_h->mr; + reduce_spec.sbuf_desc.mem_type = + ucc_to_sharp_memtype[args->dst.info.mem_type]; + } + + reduce_spec.sbuf_desc.buffer.length = src_data_size; + reduce_spec.sbuf_desc.type = SHARP_DATA_BUFFER; + reduce_spec.rbuf_desc.buffer.ptr = args->dst.info.buffer; + reduce_spec.rbuf_desc.buffer.length = dst_data_size; + reduce_spec.rbuf_desc.buffer.mem_handle = task->reduce_scatter.r_mem_h->mr; + reduce_spec.rbuf_desc.type = SHARP_DATA_BUFFER; + reduce_spec.rbuf_desc.mem_type = + ucc_to_sharp_memtype[args->dst.info.mem_type]; + reduce_spec.aggr_mode = SHARP_AGGREGATION_NONE; + reduce_spec.length = count; + reduce_spec.dtype = sharp_type; + reduce_spec.op = op_type; + reduce_spec.offset = 0; + + ret = sharp_coll_do_reduce_scatter_nb(team->sharp_comm, &reduce_spec, + &task->req_handle); + if (ret != SHARP_COLL_SUCCESS) { + tl_error(UCC_TASK_LIB(task), + "sharp_coll_do_reduce_scatter_nb failed:%s", + sharp_coll_strerror(ret)); + coll_task->status = sharp_status_to_ucc_status(ret); + return ucc_task_complete(coll_task); + } + coll_task->status = UCC_INPROGRESS; + + return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); +} + +ucc_status_t ucc_tl_sharp_reduce_scatter_init(ucc_tl_sharp_task_t *task) +{ + ucc_coll_args_t *args = &TASK_ARGS(task); + + if (!ucc_coll_args_is_predefined_dt(args, UCC_RANK_INVALID)) { + return UCC_ERR_NOT_SUPPORTED; + } + + if ((!UCC_IS_INPLACE(*args) && + ucc_to_sharp_memtype[args->src.info.mem_type] == SHARP_MEM_TYPE_LAST) || + ucc_to_sharp_memtype[args->dst.info.mem_type] == SHARP_MEM_TYPE_LAST || + ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(args->dst.info.datatype)] == + SHARP_DTYPE_NULL || + ucc_to_sharp_reduce_op[args->op] == SHARP_OP_NULL) { + return UCC_ERR_NOT_SUPPORTED; + } + + task->super.post = ucc_tl_sharp_reduce_scatter_start; + task->super.progress = ucc_tl_sharp_collective_progress; + return UCC_OK; +} +#endif + ucc_status_t
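/* a note on the spec above, sketching the intended semantics rather than restating the SHARP docs: sbuf_desc covers the whole count * team_size source buffer, rbuf_desc the per-rank slice, and reduce_spec.length is the per-rank element count, so with offset 0 the switch-side reduction presumably scatters one contiguous block to each rank */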
ucc_tl_sharp_allreduce_init(ucc_tl_sharp_task_t *task) { ucc_coll_args_t *args = &TASK_ARGS(task); diff --git a/src/components/tl/sharp/tl_sharp_coll.h b/src/components/tl/sharp/tl_sharp_coll.h index 4b0dba17b6..6557dc56e8 100644 --- a/src/components/tl/sharp/tl_sharp_coll.h +++ b/src/components/tl/sharp/tl_sharp_coll.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -10,7 +10,7 @@ #include "tl_sharp.h" /* need to query for datatype support at runtime */ -#define SHARP_DTYPE_UNKNOWN -1 +#define SHARP_DTYPE_UNKNOWN 0xFFFF extern enum sharp_datatype ucc_to_sharp_dtype[]; @@ -20,4 +20,7 @@ ucc_status_t ucc_tl_sharp_barrier_init(ucc_tl_sharp_task_t *task); ucc_status_t ucc_tl_sharp_bcast_init(ucc_tl_sharp_task_t *task); +#if HAVE_DECL_SHARP_COLL_DO_REDUCE_SCATTER +ucc_status_t ucc_tl_sharp_reduce_scatter_init(ucc_tl_sharp_task_t *task); +#endif #endif diff --git a/src/components/tl/sharp/tl_sharp_context.c b/src/components/tl/sharp/tl_sharp_context.c index 6e0477680a..72461066b3 100644 --- a/src/components/tl/sharp/tl_sharp_context.c +++ b/src/components/tl/sharp/tl_sharp_context.c @@ -269,12 +269,10 @@ ucc_status_t ucc_tl_sharp_rcache_create(struct sharp_coll_context *context, { ucc_rcache_params_t rcache_params; - rcache_params.alignment = 64; rcache_params.ucm_event_priority = 1000; rcache_params.max_regions = ULONG_MAX; rcache_params.max_size = SIZE_MAX; rcache_params.region_struct_size = sizeof(ucc_tl_sharp_rcache_region_t); - rcache_params.max_alignment = ucc_get_page_size(); rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED | UCM_EVENT_MEM_TYPE_FREE; rcache_params.context = context; @@ -436,7 +434,7 @@ ucc_status_t ucc_tl_sharp_context_create_epilog(ucc_base_context_t *context) if (lib->cfg.use_internal_oob) { sharp_ctx->oob_ctx.subset = set; } else { - sharp_ctx->oob_ctx.oob = &UCC_TL_CTX_OOB(sharp_ctx); + sharp_ctx->oob_ctx.oob = &UCC_TL_CTX_OOB(sharp_ctx); } status = ucc_topo_init(set, core_ctx->topo, &topo); diff --git a/src/components/tl/sharp/tl_sharp_team.c b/src/components/tl/sharp/tl_sharp_team.c index fe4a5875fb..a8bd380936 100644 --- a/src/components/tl/sharp/tl_sharp_team.c +++ b/src/components/tl/sharp/tl_sharp_team.c @@ -117,7 +117,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context, if (sharp_caps.support_mask.dtypes & UCC_BIT(SHARP_DTYPE_BFLOAT16)) { tl_debug(ctx->super.super.lib, "enabling support for UCC_DT_BFLOAT16"); - ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = UCC_DT_BFLOAT16; + ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_BFLOAT16; } else { tl_debug(ctx->super.super.lib, "disabling support for UCC_DT_BFLOAT16"); ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_NULL; @@ -234,6 +234,11 @@ ucc_status_t ucc_tl_sharp_coll_init(ucc_base_coll_args_t *coll_args, case UCC_COLL_TYPE_BCAST: status = ucc_tl_sharp_bcast_init(task); break; +#if HAVE_DECL_SHARP_COLL_DO_REDUCE_SCATTER + case UCC_COLL_TYPE_REDUCE_SCATTER: + status = ucc_tl_sharp_reduce_scatter_init(task); + break; +#endif default: tl_debug(UCC_TASK_LIB(task), "collective %d is not supported by sharp tl", diff --git a/src/components/tl/ucc_tl.c b/src/components/tl/ucc_tl.c index dcbb2b6d71..3134c9fd14 100644 --- a/src/components/tl/ucc_tl.c +++ b/src/components/tl/ucc_tl.c @@ -242,6 +242,11 @@ ucc_status_t 
ucc_tl_team_create_multiple(ucc_team_multiple_req_t *req) } req->descs[*id].status = UCC_TL_CTX_IFACE(req->descs[*id].ctx) ->team.create_test(&req->descs[*id].team->super); + if (req->descs[*id].status < 0) { + /* if team creation failed in create_test, the team resources need to be cleaned up */ + UCC_TL_CTX_IFACE(req->descs[*id].ctx)->team.destroy( + &req->descs[*id].team->super); + } return UCC_INPROGRESS; } diff --git a/src/components/tl/ucc_tl.h b/src/components/tl/ucc_tl.h index 53e62052dc..75a5e3e1a0 100644 --- a/src/components/tl/ucc_tl.h +++ b/src/components/tl/ucc_tl.h @@ -138,8 +138,18 @@ typedef struct ucc_tl_lib_attr { #define UCC_TL_TEAM_IFACE(_tl_team) \ (ucc_derived_of((_tl_team)->super.context->lib, ucc_tl_lib_t))->iface +/** + * Get TL team lib + * @param [in] _tl_team pointer to TL team object + * @return pointer to TL lib object + */ #define UCC_TL_TEAM_LIB(_tl_team) (_tl_team)->super.super.context->lib +/** + * Get TL team context + * @param [in] _tl_team pointer to TL team object + * @return pointer to TL context object + */ #define UCC_TL_TEAM_CTX(_tl_team) (_tl_team)->super.super.context #define UCC_TL_CORE_CTX(_tl_team) ((_tl_team)->super.super.context->ucc_context) diff --git a/src/components/tl/ucp/Makefile.am b/src/components/tl/ucp/Makefile.am index 4d684adfb5..30d00633da 100644 --- a/src/components/tl/ucp/Makefile.am +++ b/src/components/tl/ucp/Makefile.am @@ -32,7 +32,8 @@ alltoallv = \ alltoallv/alltoallv.h \ alltoallv/alltoallv.c \ alltoallv/alltoallv_pairwise.c \ - alltoallv/alltoallv_hybrid.c + alltoallv/alltoallv_hybrid.c \ + alltoallv/alltoallv_onesided.c allreduce = \ allreduce/allreduce.h \ @@ -40,6 +41,7 @@ allreduce = \ allreduce/allreduce_knomial.c \ allreduce/allreduce_sliding_window.c \ allreduce/allreduce_sliding_window_setup.c \ + allreduce/allreduce_dbt.c \ allreduce/allreduce_sra_knomial.c barrier = \ @@ -51,7 +53,8 @@ bcast = \ bcast/bcast.h \ bcast/bcast.c \ bcast/bcast_knomial.c \ - bcast/bcast_sag_knomial.c + bcast/bcast_sag_knomial.c \ + bcast/bcast_dbt.c fanin = \ fanin/fanin.h \ @@ -74,7 +77,8 @@ gatherv = \ reduce = \ reduce/reduce.h \ reduce/reduce.c \ - reduce/reduce_knomial.c + reduce/reduce_knomial.c \ + reduce/reduce_dbt.c reduce_scatter = \ reduce_scatter/reduce_scatter.h \ diff --git a/src/components/tl/ucp/allgather/allgather.c b/src/components/tl/ucp/allgather/allgather.c index 90b06e99ee..926b732e55 100644 --- a/src/components/tl/ucp/allgather/allgather.c +++ b/src/components/tl/ucp/allgather/allgather.c @@ -38,7 +38,14 @@ char *ucc_tl_ucp_allgather_score_str_get(ucc_tl_ucp_team_t *team) ?
UCC_TL_UCP_ALLGATHER_ALG_RING : UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR; char *str = ucc_malloc(max_size * sizeof(char)); + ucc_sbgp_t *sbgp; + if (team->cfg.use_reordering) { + sbgp = ucc_topo_get_sbgp(team->topo, UCC_SBGP_FULL_HOST_ORDERED); + if (!ucc_ep_map_is_identity(&sbgp->map)) { + algo_num = UCC_TL_UCP_ALLGATHER_ALG_RING; + } + } ucc_snprintf_safe(str, max_size, UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR, algo_num); return str; diff --git a/src/components/tl/ucp/allgather/allgather_neighbor.c b/src/components/tl/ucp/allgather/allgather_neighbor.c index 771ba2d3b8..534c197e4e 100644 --- a/src/components/tl/ucp/allgather/allgather_neighbor.c +++ b/src/components/tl/ucp/allgather/allgather_neighbor.c @@ -15,7 +15,9 @@ static ucc_rank_t get_recv_from_rank(ucc_rank_t rank, ucc_rank_t size, int i) { const int i_parity = i % 2; - ucc_rank_t offset_at_step[2], recv_data_from; + int offset_at_step[2]; + ucc_rank_t recv_data_from; + if (rank % 2) { recv_data_from = (rank - 1 + size) % size; offset_at_step[0] = (-2); diff --git a/src/components/tl/ucp/allgather/allgather_ring.c b/src/components/tl/ucp/allgather/allgather_ring.c index 93d7b95fc4..07178aea25 100644 --- a/src/components/tl/ucp/allgather/allgather_ring.c +++ b/src/components/tl/ucp/allgather/allgather_ring.c @@ -108,7 +108,7 @@ ucc_status_t ucc_tl_ucp_allgather_ring_start(ucc_coll_task_t *coll_task) ucc_status_t ucc_tl_ucp_allgather_ring_init_common(ucc_tl_ucp_task_t *task) { - ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); ucc_sbgp_t *sbgp; if (!ucc_coll_args_is_predefined_dt(&TASK_ARGS(task), UCC_RANK_INVALID)) { diff --git a/src/components/tl/ucp/allreduce/allreduce.c b/src/components/tl/ucp/allreduce/allreduce.c index 1b01cb5455..1149d382fe 100644 --- a/src/components/tl/ucp/allreduce/allreduce.c +++ b/src/components/tl/ucp/allreduce/allreduce.c @@ -24,6 +24,11 @@ ucc_base_coll_alg_info_t {.id = UCC_TL_UCP_ALLREDUCE_ALG_SLIDING_WINDOW, .name = "sliding_window", .desc = "sliding window allreduce (optimized for running on DPU)"}, + [UCC_TL_UCP_ALLREDUCE_ALG_DBT] = + {.id = UCC_TL_UCP_ALLREDUCE_ALG_DBT, + .name = "dbt", + .desc = "allreduce over double binary tree where a leaf in one tree " + "is an intermediate node in the other (optimized for BW)"}, [UCC_TL_UCP_ALLREDUCE_ALG_LAST] = { .id = 0, .name = NULL, .desc = NULL}}; diff --git a/src/components/tl/ucp/allreduce/allreduce.h b/src/components/tl/ucp/allreduce/allreduce.h index 5e545b6135..3ec7b3f94c 100644 --- a/src/components/tl/ucp/allreduce/allreduce.h +++ b/src/components/tl/ucp/allreduce/allreduce.h @@ -12,6 +12,7 @@ enum { UCC_TL_UCP_ALLREDUCE_ALG_KNOMIAL, UCC_TL_UCP_ALLREDUCE_ALG_SRA_KNOMIAL, UCC_TL_UCP_ALLREDUCE_ALG_SLIDING_WINDOW, + UCC_TL_UCP_ALLREDUCE_ALG_DBT, UCC_TL_UCP_ALLREDUCE_ALG_LAST }; @@ -102,8 +103,8 @@ typedef struct ucc_tl_ucp_allreduce_sw_host_allgather { } ucc_tl_ucp_allreduce_sw_host_allgather; ucc_status_t ucc_tl_ucp_allreduce_knomial_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * team, - ucc_coll_task_t ** task_h); + ucc_base_team_t *team, + ucc_coll_task_t **task_h); ucc_status_t ucc_tl_ucp_allreduce_sliding_window_init(ucc_base_coll_args_t *coll_args, @@ -142,15 +143,22 @@ ucc_tl_ucp_allreduce_sliding_window_finalize(ucc_coll_task_t *task); ucc_status_t ucc_tl_ucp_allreduce_knomial_finalize(ucc_coll_task_t *task); -ucc_status_t -ucc_tl_ucp_allreduce_sra_knomial_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * team, - ucc_coll_task_t ** task_h); +ucc_status_t
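/* dbt composes the two collectives declared further below: a reduce over double binary trees to rank 0 followed by a bcast from rank 0 over the same trees (see allreduce_dbt.c) */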
ucc_tl_ucp_allreduce_sra_knomial_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); ucc_status_t ucc_tl_ucp_allreduce_sra_knomial_start(ucc_coll_task_t *task); ucc_status_t ucc_tl_ucp_allreduce_sra_knomial_progress(ucc_coll_task_t *task); +ucc_status_t ucc_tl_ucp_allreduce_dbt_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); + +ucc_status_t ucc_tl_ucp_allreduce_dbt_start(ucc_coll_task_t *task); + +ucc_status_t ucc_tl_ucp_allreduce_dbt_progress(ucc_coll_task_t *task); + static inline int ucc_tl_ucp_allreduce_alg_from_str(const char *str) { int i; diff --git a/src/components/tl/ucp/allreduce/allreduce_dbt.c b/src/components/tl/ucp/allreduce/allreduce_dbt.c new file mode 100644 index 0000000000..709f4e5f43 --- /dev/null +++ b/src/components/tl/ucp/allreduce/allreduce_dbt.c @@ -0,0 +1,94 @@ +/** + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. + */ + +#include "config.h" +#include "tl_ucp.h" +#include "allreduce.h" +#include "../reduce/reduce.h" +#include "../bcast/bcast.h" + +ucc_status_t ucc_tl_ucp_allreduce_dbt_start(ucc_coll_task_t *coll_task) +{ + ucc_schedule_t *schedule = ucc_derived_of(coll_task, ucc_schedule_t); + ucc_coll_args_t *args = &schedule->super.bargs.args; + ucc_coll_task_t *reduce_task, *bcast_task; + + reduce_task = schedule->tasks[0]; + reduce_task->bargs.args.src.info.buffer = args->src.info.buffer; + reduce_task->bargs.args.dst.info.buffer = args->dst.info.buffer; + reduce_task->bargs.args.src.info.count = args->src.info.count; + reduce_task->bargs.args.dst.info.count = args->dst.info.count; + + bcast_task = schedule->tasks[1]; + bcast_task->bargs.args.src.info.buffer = args->dst.info.buffer; + bcast_task->bargs.args.src.info.count = args->dst.info.count; + + UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allreduce_dbt_start", 0); + return ucc_schedule_start(coll_task); +} + +ucc_status_t ucc_tl_ucp_allreduce_dbt_finalize(ucc_coll_task_t *coll_task) +{ + ucc_schedule_t *schedule = ucc_derived_of(coll_task, ucc_schedule_t); + ucc_status_t status; + + UCC_TL_UCP_PROFILE_REQUEST_EVENT(schedule, "ucp_allreduce_dbt_done", 0); + status = ucc_schedule_finalize(coll_task); + ucc_tl_ucp_put_schedule(schedule); + return status; +} + +ucc_status_t ucc_tl_ucp_allreduce_dbt_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); + ucc_base_coll_args_t args = *coll_args; + ucc_schedule_t *schedule; + ucc_coll_task_t *reduce_task, *bcast_task; + ucc_status_t status; + + if (UCC_IS_INPLACE(args.args)) { + return UCC_ERR_NOT_SUPPORTED; + } + + status = ucc_tl_ucp_get_schedule(tl_team, coll_args, + (ucc_tl_ucp_schedule_t **)&schedule); + if (ucc_unlikely(UCC_OK != status)) { + return status; + } + + args.args.root = 0; + UCC_CHECK_GOTO(ucc_tl_ucp_reduce_dbt_init(&args, team, &reduce_task), + out, status); + UCC_CHECK_GOTO(ucc_schedule_add_task(schedule, reduce_task), + out, status); + UCC_CHECK_GOTO(ucc_event_manager_subscribe(&schedule->super, + UCC_EVENT_SCHEDULE_STARTED, + reduce_task, + ucc_task_start_handler), + out, status); + + UCC_CHECK_GOTO(ucc_tl_ucp_bcast_dbt_init(&args, team, &bcast_task), + out, status); + UCC_CHECK_GOTO(ucc_schedule_add_task(schedule, bcast_task), + out, status); + UCC_CHECK_GOTO(ucc_event_manager_subscribe(reduce_task, UCC_EVENT_COMPLETED, + bcast_task, + ucc_task_start_handler), + out, status); + + 
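/* the schedule now encodes allreduce = reduce(root 0) followed by + * bcast(root 0): the reduce task is started together with the schedule, + * while the bcast task is chained on the reduce task's completion event */ +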
schedule->super.post = ucc_tl_ucp_allreduce_dbt_start; + schedule->super.progress = NULL; + schedule->super.finalize = ucc_tl_ucp_allreduce_dbt_finalize; + *task_h = &schedule->super; + + return UCC_OK; + +out: + ucc_tl_ucp_put_schedule(schedule); + return status; +} diff --git a/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c b/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c index d51ee23802..d24eca786c 100644 --- a/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c +++ b/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c @@ -11,6 +11,7 @@ #include "coll_patterns/sra_knomial.h" #include "utils/ucc_math.h" #include "utils/ucc_coll_utils.h" +#include "components/mc/ucc_mc.h" #include "../reduce_scatter/reduce_scatter.h" #include "../allgather/allgather.h" @@ -53,41 +54,40 @@ ucc_tl_ucp_allreduce_sra_knomial_frag_finalize(ucc_coll_task_t *task) return status; } -static ucc_status_t ucc_tl_ucp_allreduce_sra_knomial_frag_setup( - ucc_schedule_pipelined_t *schedule_p, ucc_schedule_t *frag, int frag_num) +static ucc_status_t +ucc_tl_ucp_allreduce_sra_knomial_frag_setup(ucc_schedule_pipelined_t *schedule_p, + ucc_schedule_t *frag, int frag_num) { - ucc_coll_args_t *args = &schedule_p->super.super.bargs.args; - ucc_datatype_t dt = args->dst.info.datatype; - size_t dt_size = ucc_dt_size(dt); - ucc_coll_args_t *targs; + ucc_coll_args_t *args = &schedule_p->super.super.bargs.args; + ucc_datatype_t dt = args->dst.info.datatype; + size_t dt_size = ucc_dt_size(dt); int n_frags = schedule_p->super.n_tasks; size_t frag_count = ucc_buffer_block_count(args->dst.info.count, n_frags, frag_num); size_t offset = ucc_buffer_block_offset(args->dst.info.count, n_frags, frag_num); + ucc_coll_args_t *targs; - targs = &frag->tasks[0]->bargs.args; //REDUCE_SCATTER - targs->src.info.buffer = - PTR_OFFSET(args->src.info.buffer, offset * dt_size); - targs->dst.info.buffer = - PTR_OFFSET(args->dst.info.buffer, offset * dt_size); - targs->src.info.count = frag_count; - targs->dst.info.count = frag_count; + targs = &frag->tasks[0]->bargs.args; /* REDUCE_SCATTER */ + targs->src.info.buffer = PTR_OFFSET(args->src.info.buffer, offset * dt_size); + targs->src.info.count = frag_count; + targs->dst.info.buffer = PTR_OFFSET(args->dst.info.buffer, offset * dt_size); + targs->dst.info.count = frag_count; - targs = &frag->tasks[1]->bargs.args; //ALLGATHER + targs = &frag->tasks[1]->bargs.args; /* ALLGATHER */ targs->src.info.buffer = NULL; - targs->dst.info.buffer = - PTR_OFFSET(args->dst.info.buffer, offset * dt_size); - targs->src.info.count = 0; - targs->dst.info.count = frag_count; + targs->src.info.count = 0; + targs->dst.info.buffer = PTR_OFFSET(args->dst.info.buffer, offset * dt_size); + targs->dst.info.count = frag_count; return UCC_OK; } -static ucc_status_t ucc_tl_ucp_allreduce_sra_knomial_frag_init( - ucc_base_coll_args_t *coll_args, - ucc_schedule_pipelined_t *sp, //NOLINT - ucc_base_team_t *team, ucc_schedule_t **frag_p) +static ucc_status_t +ucc_tl_ucp_allreduce_sra_knomial_frag_init(ucc_base_coll_args_t *coll_args, + ucc_schedule_pipelined_t *sp, //NOLINT + ucc_base_team_t *team, + ucc_schedule_t **frag_p) { ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); ucc_datatype_t dtype = coll_args->args.dst.info.datatype; @@ -166,55 +166,84 @@ ucc_status_t ucc_tl_ucp_allreduce_sra_knomial_start(ucc_coll_task_t *task) return ucc_schedule_pipelined_post(task); } -ucc_status_t -ucc_tl_ucp_allreduce_sra_knomial_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t *team, - 
ucc_coll_task_t **task_h) +static void +ucc_tl_ucp_allreduce_sra_knomial_get_pipeline_params(ucc_tl_ucp_team_t *team, + ucc_coll_args_t *args, + ucc_pipeline_params_t *pp) { - ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); - ucc_tl_ucp_lib_config_t *cfg = &tl_team->cfg; - int n_frags, pipeline_depth; - ucc_schedule_pipelined_t *schedule_p; - ucc_status_t status; - ucc_base_coll_args_t bargs; - size_t max_frag_count, dt_size; + ucc_tl_ucp_lib_config_t *cfg = &team->cfg; - dt_size = ucc_dt_size(coll_args->args.dst.info.datatype); - status = ucc_tl_ucp_get_schedule(tl_team, coll_args, - (ucc_tl_ucp_schedule_t **)&schedule_p); - if (ucc_unlikely(UCC_OK != status)) { - return status; + if (!ucc_pipeline_params_is_auto(&cfg->allreduce_sra_kn_pipeline)) { + *pp = cfg->allreduce_sra_kn_pipeline; + return; } - bargs = *coll_args; - if (bargs.mask & UCC_BASE_CARGS_MAX_FRAG_COUNT) { - max_frag_count = bargs.max_frag_count; + if ((args->src.info.mem_type == UCC_MEMORY_TYPE_CUDA) && + (UCC_IS_INPLACE(*args))) { + ucc_mc_attr_t mc_attr; + mc_attr.field_mask = UCC_MC_ATTR_FIELD_FAST_ALLOC_SIZE; + ucc_mc_get_attr(&mc_attr, UCC_MEMORY_TYPE_CUDA); + pp->threshold = mc_attr.fast_alloc_size; + pp->n_frags = 2; + pp->frag_size = mc_attr.fast_alloc_size; + pp->order = UCC_PIPELINE_PARALLEL; + pp->pdepth = 2; } else { - max_frag_count = coll_args->args.dst.info.count; + pp->threshold = SIZE_MAX; + pp->n_frags = 0; + pp->frag_size = 0; + pp->pdepth = 1; + pp->order = UCC_PIPELINE_PARALLEL; + } +} - ucc_pipeline_nfrags_pdepth(&cfg->allreduce_sra_kn_pipeline, - max_frag_count * dt_size, &n_frags, - &pipeline_depth); +ucc_status_t +ucc_tl_ucp_allreduce_sra_knomial_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); + ucc_coll_args_t *args = &coll_args->args; + size_t dt_size = ucc_dt_size(args->dst.info.datatype); + int n_frags, pipeline_depth; + ucc_schedule_pipelined_t *schedule_p; + ucc_status_t st; + ucc_base_coll_args_t bargs; + size_t max_frag_count; + ucc_pipeline_params_t pipeline_params; + + st = ucc_tl_ucp_get_schedule(tl_team, coll_args, + (ucc_tl_ucp_schedule_t **)&schedule_p); + if (ucc_unlikely(UCC_OK != st)) { + return st; + } + bargs = *coll_args; + max_frag_count = (bargs.mask & UCC_BASE_CARGS_MAX_FRAG_COUNT) ? 
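/* an upper layer (e.g. a pipelined CL schedule) may cap the fragment count via UCC_BASE_CARGS_MAX_FRAG_COUNT; otherwise fragment over the full destination count */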
+ bargs.max_frag_count: args->dst.info.count; + ucc_tl_ucp_allreduce_sra_knomial_get_pipeline_params(tl_team, args, + &pipeline_params); + ucc_pipeline_nfrags_pdepth(&pipeline_params, max_frag_count * dt_size, + &n_frags, &pipeline_depth); if (n_frags > 1) { - bargs.mask |= UCC_BASE_CARGS_MAX_FRAG_COUNT; - bargs.max_frag_count = - ucc_buffer_block_count(max_frag_count, n_frags, 0); + bargs.mask |= UCC_BASE_CARGS_MAX_FRAG_COUNT; + bargs.max_frag_count = ucc_buffer_block_count(max_frag_count, n_frags, 0); } - status = ucc_schedule_pipelined_init( - &bargs, team, ucc_tl_ucp_allreduce_sra_knomial_frag_init, - ucc_tl_ucp_allreduce_sra_knomial_frag_setup, pipeline_depth, n_frags, - cfg->allreduce_sra_kn_pipeline.order, schedule_p); - if (UCC_OK != status) { + st = ucc_schedule_pipelined_init(&bargs, team, + ucc_tl_ucp_allreduce_sra_knomial_frag_init, + ucc_tl_ucp_allreduce_sra_knomial_frag_setup, + pipeline_depth, n_frags, + pipeline_params.order, schedule_p); + if (ucc_unlikely(UCC_OK != st)) { tl_error(team->context->lib, "failed to init pipelined schedule"); ucc_tl_ucp_put_schedule(&schedule_p->super); - return status; + return st; } - schedule_p->super.super.finalize = - ucc_tl_ucp_allreduce_sra_knomial_finalize; - schedule_p->super.super.post = ucc_tl_ucp_allreduce_sra_knomial_start; - *task_h = &schedule_p->super.super; + + schedule_p->super.super.finalize = ucc_tl_ucp_allreduce_sra_knomial_finalize; + schedule_p->super.super.post = ucc_tl_ucp_allreduce_sra_knomial_start; + *task_h = &schedule_p->super.super; return UCC_OK; } diff --git a/src/components/tl/ucp/alltoall/alltoall.c b/src/components/tl/ucp/alltoall/alltoall.c index faa888dcc0..3803d96426 100644 --- a/src/components/tl/ucp/alltoall/alltoall.c +++ b/src/components/tl/ucp/alltoall/alltoall.c @@ -56,8 +56,8 @@ ucc_status_t ucc_tl_ucp_alltoall_init(ucc_tl_ucp_task_t *task) } ucc_status_t ucc_tl_ucp_alltoall_pairwise_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t *team, - ucc_coll_task_t **task_h) + ucc_base_team_t *team, + ucc_coll_task_t **task_h) { ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); ucc_tl_ucp_task_t *task; @@ -72,8 +72,8 @@ ucc_status_t ucc_tl_ucp_alltoall_pairwise_init(ucc_base_coll_args_t *coll_args, } ucc_status_t ucc_tl_ucp_alltoall_onesided_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * team, - ucc_coll_task_t ** task_h) + ucc_base_team_t *team, + ucc_coll_task_t **task_h) { ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); ucc_tl_ucp_task_t *task; diff --git a/src/components/tl/ucp/alltoall/alltoall_bruck.c b/src/components/tl/ucp/alltoall/alltoall_bruck.c index 984b900b9c..4424437f8a 100644 --- a/src/components/tl/ucp/alltoall/alltoall_bruck.c +++ b/src/components/tl/ucp/alltoall/alltoall_bruck.c @@ -12,10 +12,15 @@ #include "coll_patterns/bruck_alltoall.h" #define RADIX 2 +#define SAVE_STATE(_phase) \ + do { \ + task->alltoall_bruck.phase = _phase; \ + } while (0) enum { PHASE_MERGE, - PHASE_SENDRECV + PHASE_SENDRECV, + PHASE_BCOPY }; static inline int msb_pos_for_level(unsigned int nthbit, ucc_rank_t number) @@ -33,7 +38,8 @@ static inline int msb_pos_for_level(unsigned int nthbit, ucc_rank_t number) return msb_set; } -static inline int find_seg_index(ucc_rank_t seg_index, int level, int nsegs_per_rblock) +static inline int find_seg_index(ucc_rank_t seg_index, int level, + int nsegs_per_rblock) { int block, blockseg; @@ -53,7 +59,8 @@ static inline int find_seg_index(ucc_rank_t seg_index, int level, int nsegs_per_ return block * nsegs_per_rblock 
+ blockseg; } -ucc_status_t ucc_tl_ucp_alltoall_bruck_backward_rotation(void *dst, void *src, +ucc_status_t ucc_tl_ucp_alltoall_bruck_backward_rotation(void *dst, + void *src, ucc_rank_t trank, ucc_rank_t tsize, size_t seg_size) @@ -107,18 +114,29 @@ void ucc_tl_ucp_alltoall_bruck_progress(ucc_coll_task_t *coll_task) ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_coll_args_t *args = &TASK_ARGS(task); void *scratch = task->alltoall_bruck.scratch_mc_header->addr; - void *mergebuf = args->dst.info.buffer; + void *mergebuf = task->alltoall_bruck.dst; const ucc_rank_t nrecv_segs = tsize / 2; const size_t seg_size = ucc_dt_size(args->src.info.datatype) * args->src.info.count / tsize; - void *data; + ucc_memory_type_t smtype = args->src.info.mem_type; + ucc_memory_type_t dmtype = args->dst.info.mem_type; ucc_rank_t sendto, recvfrom, step, index; + void *data; ucc_rank_t level, snd_count; int send_buffer_index; - ucc_status_t st; - - if (task->alltoall_bruck.phase == PHASE_SENDRECV) { + ucc_status_t status; + ucc_ee_executor_t *exec; + ucc_ee_executor_task_args_t eargs; + + EXEC_TASK_TEST(task->alltoall_bruck.phase, + "failed to copy data from user buffer to scratch", + task->alltoall_bruck.etask); + switch (task->alltoall_bruck.phase) { + case PHASE_SENDRECV: goto ALLTOALL_BRUCK_PHASE_SENDRECV; + case PHASE_BCOPY: + task->super.status = UCC_OK; + goto out; } step = 1 << (task->alltoall_bruck.iteration - 1); @@ -133,16 +151,16 @@ void ucc_tl_ucp_alltoall_bruck_progress(ucc_coll_task_t *coll_task) index = GET_NEXT_BRUCK_NUM(index, RADIX, step)) { send_buffer_index = find_seg_index(index, level + 1, nrecv_segs); if (send_buffer_index == -1) { - data = PTR_OFFSET(args->src.info.buffer, + data = PTR_OFFSET(task->alltoall_bruck.src, ((index + trank) % tsize) * seg_size); } else { data = PTR_OFFSET(scratch, send_buffer_index * seg_size); } - st = ucc_mc_memcpy(PTR_OFFSET(mergebuf, seg_size * snd_count), - data, seg_size, UCC_MEMORY_TYPE_HOST, - UCC_MEMORY_TYPE_HOST); - if (ucc_unlikely(UCC_OK != st)) { - task->super.status = st; + status = ucc_mc_memcpy(PTR_OFFSET(mergebuf, seg_size * snd_count), + data, seg_size, UCC_MEMORY_TYPE_HOST, + UCC_MEMORY_TYPE_HOST); + if (ucc_unlikely(UCC_OK != status)) { + task->super.status = status; return; } snd_count++; @@ -158,36 +176,88 @@ void ucc_tl_ucp_alltoall_bruck_progress(ucc_coll_task_t *coll_task) task, out); ALLTOALL_BRUCK_PHASE_SENDRECV: if (ucc_tl_ucp_test(task) == UCC_INPROGRESS) { - task->alltoall_bruck.phase = PHASE_SENDRECV; + SAVE_STATE(PHASE_SENDRECV); return; } task->alltoall_bruck.iteration++; step = 1 << (task->alltoall_bruck.iteration - 1); } - st = ucc_mc_memcpy(PTR_OFFSET(args->dst.info.buffer, trank * seg_size), - PTR_OFFSET(args->src.info.buffer, trank * seg_size), - seg_size, UCC_MEMORY_TYPE_HOST, UCC_MEMORY_TYPE_HOST); - if (ucc_unlikely(st != UCC_OK)) { - task->super.status = st; + status = ucc_mc_memcpy(PTR_OFFSET(task->alltoall_bruck.dst, trank * seg_size), + PTR_OFFSET(task->alltoall_bruck.src, trank * seg_size), + seg_size, UCC_MEMORY_TYPE_HOST, UCC_MEMORY_TYPE_HOST); + if (ucc_unlikely(status != UCC_OK)) { + task->super.status = status; return; } - task->super.status = - ucc_tl_ucp_alltoall_bruck_backward_rotation(args->dst.info.buffer, - scratch, trank, tsize, - seg_size); + status = ucc_tl_ucp_alltoall_bruck_backward_rotation(mergebuf, scratch, + trank, tsize, + seg_size); + if (ucc_unlikely(status != UCC_OK)) { + task->super.status = status; + return; + } + + if (smtype != UCC_MEMORY_TYPE_HOST || dmtype != UCC_MEMORY_TYPE_HOST) { + 
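/* non-host (e.g. GPU) buffers: the backward-rotated result currently lives in host scratch, so finish with a single executor copy back to the user destination and complete in PHASE_BCOPY once it lands */ +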
task->alltoall_bruck.phase = PHASE_BCOPY; + status = ucc_coll_task_get_executor(&task->super, &exec); + if (ucc_unlikely(status != UCC_OK)) { + task->super.status = status; + return; + } + + eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY; + eargs.copy.src = mergebuf; + eargs.copy.dst = args->dst.info.buffer; + eargs.copy.len = seg_size * tsize; + status = ucc_ee_executor_task_post(exec, &eargs, + &task->alltoall_bruck.etask); + if (ucc_unlikely(status != UCC_OK)) { + task->super.status = status; + return; + } + EXEC_TASK_TEST(PHASE_BCOPY, "failed to copy data to user buffer", + task->alltoall_bruck.etask); + } + + task->super.status = UCC_OK; out: return; } ucc_status_t ucc_tl_ucp_alltoall_bruck_start(ucc_coll_task_t *coll_task) { - ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t); - ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_coll_args_t *args = &TASK_ARGS(task); + size_t size = ucc_dt_size(args->src.info.datatype) * + args->src.info.count; + ucc_ee_executor_t *exec; + ucc_ee_executor_task_args_t eargs; + ucc_status_t status; + ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); task->alltoall_bruck.iteration = 1; task->alltoall_bruck.phase = PHASE_MERGE; - ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); + task->alltoall_bruck.etask = NULL; + + if ((args->src.info.mem_type != UCC_MEMORY_TYPE_HOST) || + (args->dst.info.mem_type != UCC_MEMORY_TYPE_HOST)) { + status = ucc_coll_task_get_executor(&task->super, &exec); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + + eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY; + eargs.copy.src = args->src.info.buffer; + eargs.copy.dst = task->alltoall_bruck.src; + eargs.copy.len = size; + status = ucc_ee_executor_task_post(exec, &eargs, + &task->alltoall_bruck.etask); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + } return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); } @@ -199,25 +269,28 @@ ucc_status_t ucc_tl_ucp_alltoall_bruck_init(ucc_base_coll_args_t *coll_args, ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(tl_team); ucc_coll_args_t *args = &coll_args->args; - size_t seg_size = ucc_dt_size(args->src.info.datatype) * - args->src.info.count / tsize; + size_t ssize = ucc_dt_size(args->src.info.datatype) * + args->src.info.count; + size_t seg_size = ssize / tsize; + int is_bcopy = 0; size_t scratch_size; ucc_tl_ucp_task_t *task; ucc_status_t status; - if ((coll_args->args.src.info.mem_type != UCC_MEMORY_TYPE_HOST) || - (coll_args->args.dst.info.mem_type != UCC_MEMORY_TYPE_HOST)) { - status = UCC_ERR_NOT_SUPPORTED; - goto out; - } ALLTOALL_TASK_CHECK(coll_args->args, tl_team); - task = ucc_tl_ucp_init_task(coll_args, team); task->super.post = ucc_tl_ucp_alltoall_bruck_start; task->super.progress = ucc_tl_ucp_alltoall_bruck_progress; task->super.finalize = ucc_tl_ucp_alltoall_bruck_finalize; + task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; scratch_size = lognum(tsize) * ucc_div_round_up(tsize, 2) * seg_size; + if ((coll_args->args.src.info.mem_type != UCC_MEMORY_TYPE_HOST) || + (coll_args->args.dst.info.mem_type != UCC_MEMORY_TYPE_HOST)) { + is_bcopy = 1; + scratch_size += 2 * ssize; + } + status = ucc_mc_alloc(&task->alltoall_bruck.scratch_mc_header, scratch_size, UCC_MEMORY_TYPE_HOST); if (ucc_unlikely(status != UCC_OK)) { @@ -226,6 +299,17 @@ ucc_status_t ucc_tl_ucp_alltoall_bruck_init(ucc_base_coll_args_t 
*coll_args, + if (is_bcopy) { + task->alltoall_bruck.src = + PTR_OFFSET(task->alltoall_bruck.scratch_mc_header->addr, + lognum(tsize) * ucc_div_round_up(tsize, 2) * seg_size); + task->alltoall_bruck.dst = + PTR_OFFSET(task->alltoall_bruck.src, ssize); + } else { + task->alltoall_bruck.src = args->src.info.buffer; + task->alltoall_bruck.dst = args->dst.info.buffer; + } + *task_h = &task->super; return UCC_OK; diff --git a/src/components/tl/ucp/alltoall/alltoall_onesided.c b/src/components/tl/ucp/alltoall/alltoall_onesided.c index 99c56d281c..856b392534 100644 --- a/src/components/tl/ucp/alltoall/alltoall_onesided.c +++ b/src/components/tl/ucp/alltoall/alltoall_onesided.c @@ -55,9 +55,7 @@ void ucc_tl_ucp_alltoall_onesided_progress(ucc_coll_task_t *ctask) ucc_rank_t gsize = UCC_TL_TEAM_SIZE(team); long * pSync = TASK_ARGS(task).global_work_buffer; - if ((*pSync < gsize) || - (task->onesided.put_completed < task->onesided.put_posted)) { - ucp_worker_progress(UCC_TL_UCP_TEAM_CTX(team)->worker.ucp_worker); + if (ucc_tl_ucp_test_onesided(task, gsize) == UCC_INPROGRESS) { return; } diff --git a/src/components/tl/ucp/alltoallv/alltoallv.c b/src/components/tl/ucp/alltoallv/alltoallv.c index bc21df9f10..063cbd22bf 100644 --- a/src/components/tl/ucp/alltoallv/alltoallv.c +++ b/src/components/tl/ucp/alltoallv/alltoallv.c @@ -19,6 +19,10 @@ ucc_base_coll_alg_info_t {.id = UCC_TL_UCP_ALLTOALLV_ALG_HYBRID, .name = "hybrid", .desc = "hybrid a2av alg "}, + [UCC_TL_UCP_ALLTOALLV_ALG_ONESIDED] = + {.id = UCC_TL_UCP_ALLTOALLV_ALG_ONESIDED, + .name = "onesided", + .desc = "O(N) onesided alltoallv"}, [UCC_TL_UCP_ALLTOALLV_ALG_LAST] = { .id = 0, .name = NULL, .desc = NULL}}; diff --git a/src/components/tl/ucp/alltoallv/alltoallv.h b/src/components/tl/ucp/alltoallv/alltoallv.h index 5aef136564..a501cc4205 100644 --- a/src/components/tl/ucp/alltoallv/alltoallv.h +++ b/src/components/tl/ucp/alltoallv/alltoallv.h @@ -13,6 +13,7 @@ enum { UCC_TL_UCP_ALLTOALLV_ALG_PAIRWISE, UCC_TL_UCP_ALLTOALLV_ALG_HYBRID, + UCC_TL_UCP_ALLTOALLV_ALG_ONESIDED, UCC_TL_UCP_ALLTOALLV_ALG_LAST }; @@ -32,6 +33,9 @@ ucc_status_t ucc_tl_ucp_alltoallv_hybrid_init(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task_h); +ucc_status_t ucc_tl_ucp_alltoallv_onesided_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); ucc_status_t ucc_tl_ucp_alltoallv_pairwise_init_common(ucc_tl_ucp_task_t *task); diff --git a/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c b/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c index 61b130eaa5..7b8c7b7b67 100644 --- a/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c +++ b/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c @@ -510,9 +510,11 @@ ucc_status_t post_recv(ucc_rank_t recvfrom, ucc_rank_t tsize, size_t dt_size, /* check if we have space for maximum receive.
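(i.e. meta->offset * dt_size + step_buf_size must still fit within tmp_buf_size).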
If not, recycle */ if (meta->offset * dt_size + step_buf_size > tmp_buf_size) { - new_offset = receive_buffer_recycler(tsize, (int *)op_metadata, (int *)op_metadata + tsize, - seg_st, p_tmp_recv_region, dt_size, BytesForPacking, - step, user_rbuf, rdisps, trank, radix, node_edge_id); + new_offset = receive_buffer_recycler(tsize, (unsigned int *)op_metadata, + (int *)op_metadata + tsize, + seg_st, p_tmp_recv_region, dt_size, + BytesForPacking, step, user_rbuf, + rdisps, trank, radix, node_edge_id); meta->offset = new_offset; } ucc_assert(meta->offset * dt_size + step_buf_size <= tmp_buf_size); @@ -595,8 +597,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step, temp_offset = PTR_OFFSET(temp_offset, cur_buf_length * dt_size); } else { /* data will be sent pairwise */ - ((int *)op_metadata)[i] = COUNT_DIRECT; - ((int *)op_metadata)[i + tsize] = COUNT_DIRECT; + ((int *)op_metadata)[i] = (int)COUNT_DIRECT; + ((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT; if (i < (step * radix)) { int pairwise_src = (trank - i + tsize) % tsize; if (rcounts[pairwise_src] > 0) { @@ -636,8 +638,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step, next_p = tsize; } } else { - ((int *)op_metadata)[i] = COUNT_DIRECT; - ((int *)op_metadata)[i + tsize] = COUNT_DIRECT; + ((int *)op_metadata)[i] = (int)COUNT_DIRECT; + ((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT; if (i < (step * radix)) { int pairwise_src = (trank - i + tsize) % tsize; if (rcounts[pairwise_src] > 0) { @@ -709,7 +711,7 @@ ucc_status_t pairwise_manager(ucc_rank_t trank, ucc_rank_t tsize, int *r_disps = (int*)TASK_ARGS(task).dst.info_v.displacements; int *scounts = (int*)TASK_ARGS(task).src.info_v.counts; int *rcounts = (int*)TASK_ARGS(task).dst.info_v.counts; - int* cur = &task->alltoallv_hybrid.cur_out; + ucc_rank_t *cur = &task->alltoallv_hybrid.cur_out; int chunk_num_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_pairwise_num_posts; int chunk_byte_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_chunk_byte_limit; ucc_status_t status; diff --git a/src/components/tl/ucp/alltoallv/alltoallv_onesided.c b/src/components/tl/ucp/alltoallv/alltoallv_onesided.c new file mode 100644 index 0000000000..bb6fa14b3e --- /dev/null +++ b/src/components/tl/ucp/alltoallv/alltoallv_onesided.c @@ -0,0 +1,104 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. 
+ */ + +#include "config.h" +#include "tl_ucp.h" +#include "alltoallv.h" +#include "core/ucc_progress_queue.h" +#include "utils/ucc_math.h" +#include "tl_ucp_sendrecv.h" + +ucc_status_t ucc_tl_ucp_alltoallv_onesided_start(ucc_coll_task_t *ctask) +{ + ucc_tl_ucp_task_t *task = ucc_derived_of(ctask, ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ptrdiff_t src = (ptrdiff_t)TASK_ARGS(task).src.info_v.buffer; + ptrdiff_t dest = (ptrdiff_t)TASK_ARGS(task).dst.info_v.buffer; + ucc_rank_t grank = UCC_TL_TEAM_RANK(team); + ucc_rank_t gsize = UCC_TL_TEAM_SIZE(team); + long *pSync = TASK_ARGS(task).global_work_buffer; + ucc_aint_t *s_disp = TASK_ARGS(task).src.info_v.displacements; + ucc_aint_t *d_disp = TASK_ARGS(task).dst.info_v.displacements; + size_t sdt_size = ucc_dt_size(TASK_ARGS(task).src.info_v.datatype); + size_t rdt_size = ucc_dt_size(TASK_ARGS(task).dst.info_v.datatype); + ucc_rank_t peer; + size_t sd_disp, dd_disp, data_size; + + ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); + + /* perform a put to each member peer using the peer's index in the + * destination displacement. */ + for (peer = (grank + 1) % gsize; task->onesided.put_posted < gsize; + peer = (peer + 1) % gsize) { + sd_disp = + ucc_coll_args_get_displacement(&TASK_ARGS(task), s_disp, peer) * + sdt_size; + dd_disp = + ucc_coll_args_get_displacement(&TASK_ARGS(task), d_disp, peer) * + rdt_size; + data_size = + ucc_coll_args_get_count(&TASK_ARGS(task), + TASK_ARGS(task).src.info_v.counts, peer) * + sdt_size; + + UCPCHECK_GOTO(ucc_tl_ucp_put_nb(PTR_OFFSET(src, sd_disp), + PTR_OFFSET(dest, dd_disp), + data_size, peer, team, task), + task, out); + UCPCHECK_GOTO(ucc_tl_ucp_atomic_inc(pSync, peer, team), task, out); + } + return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); +out: + return task->super.status; +} + +void ucc_tl_ucp_alltoallv_onesided_progress(ucc_coll_task_t *ctask) +{ + ucc_tl_ucp_task_t *task = ucc_derived_of(ctask, ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_rank_t gsize = UCC_TL_TEAM_SIZE(team); + long *pSync = TASK_ARGS(task).global_work_buffer; + + if (ucc_tl_ucp_test_onesided(task, gsize) == UCC_INPROGRESS) { + return; + } + + pSync[0] = 0; + task->super.status = UCC_OK; +} + +ucc_status_t ucc_tl_ucp_alltoallv_onesided_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); + ucc_tl_ucp_task_t *task; + ucc_status_t status; + + ALLTOALLV_TASK_CHECK(coll_args->args, tl_team); + if (!(coll_args->args.mask & UCC_COLL_ARGS_FIELD_GLOBAL_WORK_BUFFER)) { + tl_error(UCC_TL_TEAM_LIB(tl_team), + "global work buffer not provided nor associated with team"); + status = UCC_ERR_NOT_SUPPORTED; + goto out; + } + if (coll_args->args.mask & UCC_COLL_ARGS_FIELD_FLAGS) { + if (!(coll_args->args.flags & UCC_COLL_ARGS_FLAG_MEM_MAPPED_BUFFERS)) { + tl_error(UCC_TL_TEAM_LIB(tl_team), + "non memory mapped buffers are not supported"); + status = UCC_ERR_NOT_SUPPORTED; + goto out; + } + } + + task = ucc_tl_ucp_init_task(coll_args, team); + *task_h = &task->super; + task->super.post = ucc_tl_ucp_alltoallv_onesided_start; + task->super.progress = ucc_tl_ucp_alltoallv_onesided_progress; + status = UCC_OK; +out: + return status; +} diff --git a/src/components/tl/ucp/bcast/bcast.c b/src/components/tl/ucp/bcast/bcast.c index 6a1d5b7720..b3b98e7779 100644 --- a/src/components/tl/ucp/bcast/bcast.c +++ b/src/components/tl/ucp/bcast/bcast.c @@ -19,6 +19,11 @@ 
ucc_base_coll_alg_info_t .name = "sag_knomial", .desc = "recursive knomial scatter followed by knomial " "allgather (optimized for BW)"}, + [UCC_TL_UCP_BCAST_ALG_DBT] = + {.id = UCC_TL_UCP_BCAST_ALG_DBT, + .name = "dbt", + .desc = "bcast over double binary tree where a leaf in one tree " + "is an intermediate node in the other (optimized for BW)"}, [UCC_TL_UCP_BCAST_ALG_LAST] = { .id = 0, .name = NULL, .desc = NULL}}; @@ -36,8 +41,8 @@ ucc_status_t ucc_tl_ucp_bcast_init(ucc_tl_ucp_task_t *task) } ucc_status_t ucc_tl_ucp_bcast_knomial_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * team, - ucc_coll_task_t ** task_h) + ucc_base_team_t *team, + ucc_coll_task_t **task_h) { ucc_tl_ucp_task_t *task; ucc_status_t status; diff --git a/src/components/tl/ucp/bcast/bcast.h b/src/components/tl/ucp/bcast/bcast.h index 3ea567fb9c..baaa40c313 100644 --- a/src/components/tl/ucp/bcast/bcast.h +++ b/src/components/tl/ucp/bcast/bcast.h @@ -11,6 +11,7 @@ enum { UCC_TL_UCP_BCAST_ALG_KNOMIAL, UCC_TL_UCP_BCAST_ALG_SAG_KNOMIAL, + UCC_TL_UCP_BCAST_ALG_DBT, UCC_TL_UCP_BCAST_ALG_LAST }; @@ -47,4 +48,8 @@ ucc_status_t ucc_tl_ucp_bcast_sag_knomial_init(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task_h); +ucc_status_t ucc_tl_ucp_bcast_dbt_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); + #endif diff --git a/src/components/tl/ucp/bcast/bcast_dbt.c b/src/components/tl/ucp/bcast/bcast_dbt.c new file mode 100644 index 0000000000..4e1f77594f --- /dev/null +++ b/src/components/tl/ucp/bcast/bcast_dbt.c @@ -0,0 +1,242 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. + */ + +#include "config.h" +#include "tl_ucp.h" +#include "bcast.h" +#include "core/ucc_progress_queue.h" +#include "tl_ucp_sendrecv.h" + +enum { + RECV, + SEND_T1, + SEND_T2, + TEST, +}; + +#define UCC_BCAST_DBT_CHECK_STATE(_p) \ + case _p: \ + goto _p; + +#define UCC_BCAST_DBT_GOTO_STATE(_state) \ + do { \ + switch (_state) { \ + UCC_BCAST_DBT_CHECK_STATE(SEND_T1); \ + UCC_BCAST_DBT_CHECK_STATE(SEND_T2); \ + UCC_BCAST_DBT_CHECK_STATE(TEST); \ + }; \ + } while (0) + +static void recv_completion_common(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, /* NOLINT */ + void *user_data) +{ + ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data; + if (ucc_unlikely(UCS_OK != status)) { + tl_error(UCC_TASK_LIB(task), "failure in recv completion %s", + ucs_status_string(status)); + task->super.status = ucs_status_to_ucc_status(status); + } + task->tagged.recv_completed++; + if (request) { + ucp_request_free(request); + } +} + +static void recv_completion_1(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, /* NOLINT */ + void *user_data) +{ + ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data; + + task->bcast_dbt.t1.recv++; + recv_completion_common(request, status, info, user_data); +} + +static void recv_completion_2(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, /* NOLINT */ + void *user_data) +{ + ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data; + + task->bcast_dbt.t2.recv++; + recv_completion_common(request, status, info, user_data); +} + +void ucc_tl_ucp_bcast_dbt_progress(ucc_coll_task_t *coll_task) +{ + ucc_tl_ucp_task_t *task = + ucc_derived_of(coll_task, ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_rank_t rank = UCC_TL_TEAM_RANK(team); + ucc_dbt_single_tree_t
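/* the two mirrored binary trees: a rank that is a leaf in t1 is an interior node in t2, so both halves of the message can stream in parallel over disjoint parent/child links */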
t1 = task->bcast_dbt.t1; + ucc_dbt_single_tree_t t2 = task->bcast_dbt.t2; + void *buffer = args->src.info.buffer; + ucc_memory_type_t mtype = args->src.info.mem_type; + ucc_datatype_t dt = args->src.info.datatype; + size_t count = args->src.info.count; + size_t count_t1 = (count % 2) ? count / 2 + 1 + : count / 2; + size_t data_size_t1 = count_t1 * ucc_dt_size(dt); + size_t data_size_t2 = count / 2 * ucc_dt_size(dt); + ucc_rank_t coll_root = (ucc_rank_t)args->root; + ucp_tag_recv_nbx_callback_t cb[2] = {recv_completion_1, + recv_completion_2}; + uint32_t i; + + UCC_BCAST_DBT_GOTO_STATE(task->bcast_dbt.state); + + if (rank != t1.root && rank != coll_root) { + UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(buffer, data_size_t1, mtype, + t1.parent, team, task, cb[0], + (void *)task), + task, out); + } + + if (rank != t2.root && rank != coll_root) { + UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(PTR_OFFSET(buffer, data_size_t1), + data_size_t2, mtype, t2.parent, team, + task, cb[1], (void *)task), + task, out); + } + task->bcast_dbt.state = SEND_T1; + +SEND_T1: + if ((coll_root == rank) || (task->bcast_dbt.t1.recv > 0)) { + for (i = 0; i < 2; i++) { + if ((t1.children[i] != UCC_RANK_INVALID) && + (t1.children[i] != coll_root)) { + UCPCHECK_GOTO(ucc_tl_ucp_send_nb(buffer, data_size_t1, mtype, + t1.children[i], team, task), + task, out); + } + } + } else { + goto out; + } + task->bcast_dbt.state = SEND_T2; + +SEND_T2: + if ((coll_root == rank) || (task->bcast_dbt.t2.recv > 0)) { + for (i = 0; i < 2; i++) { + if ((t2.children[i] != UCC_RANK_INVALID) && + (t2.children[i] != coll_root)) { + UCPCHECK_GOTO(ucc_tl_ucp_send_nb(PTR_OFFSET(buffer, + data_size_t1), + data_size_t2, mtype, + t2.children[i], team, task), + task, out); + } + } + } else { + goto out; + } + +TEST: + if (UCC_INPROGRESS == ucc_tl_ucp_test_send(task)) { + task->bcast_dbt.state = TEST; + return; + } + + task->super.status = UCC_OK; + UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_bcast_dbt_done", 0); + +out: + return; +} + +ucc_status_t ucc_tl_ucp_bcast_dbt_start(ucc_coll_task_t *coll_task) +{ + ucc_tl_ucp_task_t *task = + ucc_derived_of(coll_task, ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_status_t status = UCC_OK; + ucc_rank_t rank = UCC_TL_TEAM_RANK(team); + void *buffer = args->src.info.buffer; + ucc_memory_type_t mtype = args->src.info.mem_type; + ucc_datatype_t dt = args->src.info.datatype; + size_t count = args->src.info.count; + size_t count_t1 = (count % 2) ? 
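/* ceil(count/2): with an odd count the extra element goes to tree t1,
 * so t1 carries the first half of the buffer and t2 the remainder */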
count / 2 + 1
+                                            : count / 2;
+    size_t             data_size_t1 = count_t1 * ucc_dt_size(dt);
+    size_t             data_size_t2 = count / 2 * ucc_dt_size(dt);
+    ucc_rank_t         coll_root    = (ucc_rank_t)args->root;
+    ucc_rank_t         t1_root      = task->bcast_dbt.t1.root;
+    ucc_rank_t         t2_root      = task->bcast_dbt.t2.root;
+    ucp_tag_recv_nbx_callback_t cb[2] = {recv_completion_1,
+                                         recv_completion_2};
+
+    task->bcast_dbt.t1.recv = 0;
+    task->bcast_dbt.t2.recv = 0;
+    ucc_tl_ucp_task_reset(task, UCC_INPROGRESS);
+
+    if (rank == coll_root && coll_root != t1_root) {
+        status = ucc_tl_ucp_send_nb(buffer, data_size_t1, mtype, t1_root, team,
+                                    task);
+        if (UCC_OK != status) {
+            return status;
+        }
+    }
+
+    if (rank == coll_root && coll_root != t2_root) {
+        status = ucc_tl_ucp_send_nb(PTR_OFFSET(buffer, data_size_t1),
+                                    data_size_t2, mtype, t2_root, team, task);
+        if (UCC_OK != status) {
+            return status;
+        }
+    }
+
+    if (rank != coll_root && rank == t1_root) {
+        status = ucc_tl_ucp_recv_cb(buffer, data_size_t1, mtype, coll_root,
+                                    team, task, cb[0], (void *)task);
+        if (UCC_OK != status) {
+            return status;
+        }
+    }
+
+    if (rank != coll_root && rank == t2_root) {
+        status = ucc_tl_ucp_recv_cb(PTR_OFFSET(buffer, data_size_t1),
+                                    data_size_t2, mtype, coll_root, team, task,
+                                    cb[1], (void *)task);
+        if (UCC_OK != status) {
+            return status;
+        }
+    }
+
+    task->bcast_dbt.state = RECV;
+    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_bcast_dbt_start", 0);
+    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
+}
+
+ucc_status_t ucc_tl_ucp_bcast_dbt_finalize(ucc_coll_task_t *coll_task)
+{
+    return ucc_tl_ucp_coll_finalize(coll_task);
+}
+
+ucc_status_t ucc_tl_ucp_bcast_dbt_init(
+    ucc_base_coll_args_t *coll_args, ucc_base_team_t *team,
+    ucc_coll_task_t **task_h)
+{
+    ucc_tl_ucp_team_t *tl_team;
+    ucc_tl_ucp_task_t *task;
+    ucc_rank_t         rank, size;
+
+    task                 = ucc_tl_ucp_init_task(coll_args, team);
+    task->super.post     = ucc_tl_ucp_bcast_dbt_start;
+    task->super.progress = ucc_tl_ucp_bcast_dbt_progress;
+    task->super.finalize = ucc_tl_ucp_bcast_dbt_finalize;
+    tl_team              = TASK_TEAM(task);
+    rank                 = UCC_TL_TEAM_RANK(tl_team);
+    size                 = UCC_TL_TEAM_SIZE(tl_team);
+    ucc_dbt_build_trees(rank, size, &task->bcast_dbt.t1,
+                        &task->bcast_dbt.t2);
+
+    *task_h = &task->super;
+    return UCC_OK;
+}
diff --git a/src/components/tl/ucp/bcast/bcast_sag_knomial.c b/src/components/tl/ucp/bcast/bcast_sag_knomial.c
index 1fa56a7367..3f4a6919f6 100644
--- a/src/components/tl/ucp/bcast/bcast_sag_knomial.c
+++ b/src/components/tl/ucp/bcast/bcast_sag_knomial.c
@@ -70,8 +70,8 @@ ucc_tl_ucp_bcast_sag_knomial_finalize(ucc_coll_task_t *coll_task)
 
 ucc_status_t
 ucc_tl_ucp_bcast_sag_knomial_init(ucc_base_coll_args_t *coll_args,
-                                  ucc_base_team_t *team,
-                                  ucc_coll_task_t **task_h)
+                                  ucc_base_team_t      *team,
+                                  ucc_coll_task_t     **task_h)
 {
     ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t);
     size_t             count   = coll_args->args.src.info.count;
diff --git a/src/components/tl/ucp/reduce/reduce.c b/src/components/tl/ucp/reduce/reduce.c
index 82a9380083..039f9f393b 100644
--- a/src/components/tl/ucp/reduce/reduce.c
+++ b/src/components/tl/ucp/reduce/reduce.c
@@ -13,6 +13,11 @@ ucc_base_coll_alg_info_t
          .name = "knomial",
          .desc = "reduce over knomial tree with arbitrary radix "
                  "(optimized for latency)"},
+        [UCC_TL_UCP_REDUCE_ALG_DBT] =
+            {.id   = UCC_TL_UCP_REDUCE_ALG_DBT,
+             .name = "dbt",
+             .desc = "reduce over double binary tree where a leaf in one tree "
+                     "will be intermediate in other (optimized for BW)"},
         [UCC_TL_UCP_REDUCE_ALG_LAST] = {
             .id = 0, .name = NULL, .desc =
NULL}}; @@ -66,3 +71,16 @@ ucc_status_t ucc_tl_ucp_reduce_init(ucc_tl_ucp_task_t *task) return status; } + +ucc_status_t ucc_tl_ucp_reduce_knomial_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_task_t *task; + ucc_status_t status; + + task = ucc_tl_ucp_init_task(coll_args, team); + status = ucc_tl_ucp_reduce_init(task); + *task_h = &task->super; + return status; +} diff --git a/src/components/tl/ucp/reduce/reduce.h b/src/components/tl/ucp/reduce/reduce.h index e26c4fdf23..98bc183ff3 100644 --- a/src/components/tl/ucp/reduce/reduce.h +++ b/src/components/tl/ucp/reduce/reduce.h @@ -9,12 +9,16 @@ enum { UCC_TL_UCP_REDUCE_ALG_KNOMIAL, + UCC_TL_UCP_REDUCE_ALG_DBT, UCC_TL_UCP_REDUCE_ALG_LAST }; extern ucc_base_coll_alg_info_t ucc_tl_ucp_reduce_algs[UCC_TL_UCP_REDUCE_ALG_LAST + 1]; +#define UCC_TL_UCP_REDUCE_DEFAULT_ALG_SELECT_STR \ + "reduce:0-inf:@0" + /* A set of convenience macros used to implement sw based progress of the reduce algorithm that uses kn pattern */ enum { @@ -36,12 +40,32 @@ enum { }; \ } while (0) + +static inline int ucc_tl_ucp_reduce_alg_from_str(const char *str) +{ + int i; + for (i = 0; i < UCC_TL_UCP_REDUCE_ALG_LAST; i++) { + if (0 == strcasecmp(str, ucc_tl_ucp_reduce_algs[i].name)) { + break; + } + } + return i; +} + ucc_status_t ucc_tl_ucp_reduce_init(ucc_tl_ucp_task_t *task); +ucc_status_t ucc_tl_ucp_reduce_knomial_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); + ucc_status_t ucc_tl_ucp_reduce_knomial_start(ucc_coll_task_t *task); void ucc_tl_ucp_reduce_knomial_progress(ucc_coll_task_t *task); ucc_status_t ucc_tl_ucp_reduce_knomial_finalize(ucc_coll_task_t *task); +ucc_status_t ucc_tl_ucp_reduce_dbt_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); + #endif diff --git a/src/components/tl/ucp/reduce/reduce_dbt.c b/src/components/tl/ucp/reduce/reduce_dbt.c new file mode 100644 index 0000000000..08e8774974 --- /dev/null +++ b/src/components/tl/ucp/reduce/reduce_dbt.c @@ -0,0 +1,358 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. 
+ */
+
+#include "config.h"
+#include "tl_ucp.h"
+#include "reduce.h"
+#include "core/ucc_progress_queue.h"
+#include "tl_ucp_sendrecv.h"
+#include "utils/ucc_dt_reduce.h"
+
+enum {
+    RECV,
+    REDUCE,
+    TEST,
+    TEST_ROOT,
+};
+
+#define UCC_REDUCE_DBT_CHECK_STATE(_p)                                         \
+    case _p:                                                                   \
+        goto _p;
+
+#define UCC_REDUCE_DBT_GOTO_STATE(_state)                                      \
+    do {                                                                       \
+        switch (_state) {                                                      \
+            UCC_REDUCE_DBT_CHECK_STATE(REDUCE);                                \
+            UCC_REDUCE_DBT_CHECK_STATE(TEST);                                  \
+            UCC_REDUCE_DBT_CHECK_STATE(TEST_ROOT);                             \
+        };                                                                     \
+    } while (0)
+
+static void recv_completion_common(void *request, ucs_status_t status,
+                                   const ucp_tag_recv_info_t *info, /* NOLINT */
+                                   void *user_data)
+{
+    ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data;
+    if (ucc_unlikely(UCS_OK != status)) {
+        tl_error(UCC_TASK_LIB(task), "failure in recv completion %s",
+                 ucs_status_string(status));
+        task->super.status = ucs_status_to_ucc_status(status);
+    }
+    task->tagged.recv_completed++;
+    if (request) {
+        ucp_request_free(request);
+    }
+}
+
+static void recv_completion_1(void *request, ucs_status_t status,
+                              const ucp_tag_recv_info_t *info, /* NOLINT */
+                              void *user_data)
+{
+    ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data;
+
+    task->reduce_dbt.trees[0].recv++;
+    recv_completion_common(request, status, info, user_data);
+}
+
+static void recv_completion_2(void *request, ucs_status_t status,
+                              const ucp_tag_recv_info_t *info, /* NOLINT */
+                              void *user_data)
+{
+    ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data;
+
+    task->reduce_dbt.trees[1].recv++;
+    recv_completion_common(request, status, info, user_data);
+}
+
+static inline void single_tree_reduce(ucc_tl_ucp_task_t *task, void *sbuf,
+                                      void *rbuf, int n_children, size_t count,
+                                      size_t data_size, ucc_datatype_t dt,
+                                      ucc_coll_args_t *args, int is_avg)
+{
+    ucc_status_t status;
+
+    status = ucc_dt_reduce_strided(
+        sbuf, rbuf, rbuf,
+        n_children, count, data_size,
+        dt, args,
+        is_avg ? UCC_EEE_TASK_FLAG_REDUCE_WITH_ALPHA : 0,
+        AVG_ALPHA(task), task->reduce_dbt.executor,
+        &task->reduce_dbt.etask);
+
+    if (ucc_unlikely(UCC_OK != status)) {
+        tl_error(UCC_TASK_LIB(task),
+                 "failed to perform dt reduction");
+        task->super.status = status;
+        return;
+    }
+    EXEC_TASK_WAIT(task->reduce_dbt.etask);
+}
+
+void ucc_tl_ucp_reduce_dbt_progress(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_ucp_task_t          *task      = ucc_derived_of(coll_task,
+                                                           ucc_tl_ucp_task_t);
+    ucc_tl_ucp_team_t          *team      = TASK_TEAM(task);
+    ucc_coll_args_t            *args      = &TASK_ARGS(task);
+    ucc_dbt_single_tree_t      *trees     = task->reduce_dbt.trees;
+    ucc_rank_t                  rank      = UCC_TL_TEAM_RANK(team);
+    ucc_rank_t                  coll_root = (ucc_rank_t)args->root;
+    int                         is_root   = rank == coll_root;
+    ucp_tag_recv_nbx_callback_t cb[2]     = {recv_completion_1,
+                                             recv_completion_2};
+    void                       *sbuf[2], *rbuf[2];
+    uint32_t                    i, j, k;
+    ucc_memory_type_t           mtype;
+    ucc_datatype_t              dt;
+    size_t                      count, data_size, data_size_t1;
+    size_t                      counts[2];
+    int                         avg_pre_op, avg_post_op;
+
+    if (is_root) {
+        mtype = args->dst.info.mem_type;
+        dt    = args->dst.info.datatype;
+        count = args->dst.info.count;
+    } else {
+        mtype = args->src.info.mem_type;
+        dt    = args->src.info.datatype;
+        count = args->src.info.count;
+    }
+
+    counts[0]    = (count % 2) ? count / 2 + 1 : count / 2;
+    counts[1]    = count / 2;
+    data_size    = count * ucc_dt_size(dt);
+    data_size_t1 = counts[0] * ucc_dt_size(dt);
+    avg_pre_op   = ((args->op == UCC_OP_AVG) &&
+                    UCC_TL_UCP_TEAM_LIB(team)->cfg.reduce_avg_pre_op);
+    avg_post_op  = ((args->op == UCC_OP_AVG) &&
+                    !UCC_TL_UCP_TEAM_LIB(team)->cfg.reduce_avg_pre_op);
+
+    rbuf[0] = task->reduce_dbt.scratch;
+    rbuf[1] = PTR_OFFSET(rbuf[0], data_size_t1 * 2);
+    sbuf[0] = avg_pre_op ? PTR_OFFSET(rbuf[0], data_size * 2)
+                         : args->src.info.buffer;
+    sbuf[1] = PTR_OFFSET(sbuf[0], data_size_t1);
+
+    UCC_REDUCE_DBT_GOTO_STATE(task->reduce_dbt.state);
+    for (i = 0; i < 2; i++) {
+        j = 0;
+        for (k = 0; k < 2; k++) {
+            if (trees[i].children[k] != UCC_RANK_INVALID) {
+                UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(
+                                  PTR_OFFSET(rbuf[i],
+                                             counts[i] * ucc_dt_size(dt) * j),
+                                  counts[i] * ucc_dt_size(dt), mtype,
+                                  trees[i].children[k], team, task, cb[i],
+                                  (void *)task),
+                              task, out);
+                j++;
+            }
+        }
+    }
+    task->reduce_dbt.state = REDUCE;
+
+REDUCE:
+    for (i = 0; i < 2; i++) {
+        if (trees[i].recv == trees[i].n_children &&
+            !task->reduce_dbt.reduction_comp[i]) {
+            if (trees[i].n_children > 0) {
+                single_tree_reduce(task, sbuf[i], rbuf[i], trees[i].n_children,
+                                   counts[i], counts[i] * ucc_dt_size(dt), dt,
+                                   args, avg_post_op && trees[i].root == rank);
+            }
+            task->reduce_dbt.reduction_comp[i] = 1;
+        }
+    }
+
+    for (i = 0; i < 2; i++) {
+        if (rank != trees[i].root && task->reduce_dbt.reduction_comp[i] &&
+            !task->reduce_dbt.send_comp[i]) {
+            UCPCHECK_GOTO(ucc_tl_ucp_send_nb((trees[i].n_children > 0) ? rbuf[i]
+                                                                       : sbuf[i],
+                                             counts[i] * ucc_dt_size(dt),
+                                             mtype, trees[i].parent, team,
+                                             task),
+                          task, out);
+            task->reduce_dbt.send_comp[i] = 1;
+        }
+    }
+
+    if (!task->reduce_dbt.reduction_comp[0] ||
+        !task->reduce_dbt.reduction_comp[1]) {
+        return;
+    }
+TEST:
+    if (UCC_INPROGRESS == ucc_tl_ucp_test_send(task)) {
+        task->reduce_dbt.state = TEST;
+        return;
+    }
+
+    /* tree roots send to coll root */
+    for (i = 0; i < 2; i++) {
+        if (rank == trees[i].root && !is_root) {
+            UCPCHECK_GOTO(ucc_tl_ucp_send_nb(rbuf[i],
+                                             counts[i] * ucc_dt_size(dt),
+                                             mtype, coll_root, team, task),
+                          task, out);
+        }
+    }
+
+    task->reduce_dbt.reduction_comp[0] = trees[0].recv;
+    task->reduce_dbt.reduction_comp[1] = trees[1].recv;
+
+    for (i = 0; i < 2; i++) {
+        if (is_root && rank != trees[i].root) {
+            UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(PTR_OFFSET(args->dst.info.buffer,
+                                                 i * counts[0] * ucc_dt_size(dt)),
+                                             counts[i] * ucc_dt_size(dt),
+                                             mtype, trees[i].root, team, task,
+                                             cb[i], (void *)task),
+                          task, out);
+            task->reduce_dbt.reduction_comp[i]++;
+        }
+    }
+
+TEST_ROOT:
+    if (UCC_INPROGRESS == ucc_tl_ucp_test_send(task) ||
+        task->reduce_dbt.reduction_comp[0] != trees[0].recv ||
+        task->reduce_dbt.reduction_comp[1] != trees[1].recv) {
+        task->reduce_dbt.state = TEST_ROOT;
+        return;
+    }
+
+    for (i = 0; i < 2; i++) {
+        if (is_root && rank == trees[i].root) {
+            UCPCHECK_GOTO(ucc_mc_memcpy(PTR_OFFSET(args->dst.info.buffer,
+                                            i * counts[(i + 1) % 2] * ucc_dt_size(dt)),
+                                        rbuf[i], counts[i] * ucc_dt_size(dt),
+                                        mtype, mtype), task, out);
+        }
+    }
+
+    task->super.status = UCC_OK;
+    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_reduce_dbt_done", 0);
+out:
+    return;
+}
+
+ucc_status_t ucc_tl_ucp_reduce_dbt_start(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_ucp_task_t *task      = ucc_derived_of(coll_task,
+                                                  ucc_tl_ucp_task_t);
+    ucc_tl_ucp_team_t *team      = TASK_TEAM(task);
+    ucc_coll_args_t   *args      = &TASK_ARGS(task);
+    ucc_rank_t         rank      = UCC_TL_TEAM_RANK(team);
+    ucc_rank_t         team_size = UCC_TL_TEAM_SIZE(team);
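/*
 * How the AVG handling below works: UCC_OP_AVG is realized as a sum plus a
 * scaling step, applied either before the reduction (pre-op: every rank
 * scales its own contribution) or after it (post-op: the tree roots scale
 * the reduced halves via AVG_ALPHA), selected by the reduce_avg_pre_op
 * config. In the pre-op branch the source buffer is passed to
 * ucc_dt_reduce() as both operands, yielding 2*x, hence the alpha of
 * 1.0 / (team_size * 2); the net contribution is x / team_size.
 */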
int avg_pre_op = + UCC_TL_UCP_TEAM_LIB(TASK_TEAM(task))->cfg.reduce_avg_pre_op; + ucc_datatype_t dt; + size_t count, data_size; + ucc_status_t status; + + task->reduce_dbt.trees[0].recv = 0; + task->reduce_dbt.trees[1].recv = 0; + task->reduce_dbt.reduction_comp[0] = 0; + task->reduce_dbt.reduction_comp[1] = 0; + task->reduce_dbt.send_comp[0] = 0; + task->reduce_dbt.send_comp[1] = 0; + + ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); + + if (args->root == rank) { + count = args->dst.info.count; + dt = args->dst.info.datatype; + } else { + count = args->src.info.count; + dt = args->src.info.datatype; + } + data_size = count * ucc_dt_size(dt); + + status = ucc_coll_task_get_executor(&task->super, + &task->reduce_dbt.executor); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + + if (UCC_IS_INPLACE(*args) && (rank == args->root)) { + args->src.info.buffer = args->dst.info.buffer; + } + + if (avg_pre_op && args->op == UCC_OP_AVG) { + /* In case of avg_pre_op, each process must divide itself by team_size */ + status = + ucc_dt_reduce(args->src.info.buffer, args->src.info.buffer, + PTR_OFFSET(task->reduce_dbt.scratch, data_size * 2), + count, dt, args, UCC_EEE_TASK_FLAG_REDUCE_WITH_ALPHA, + 1.0 / (double)(team_size * 2), + task->reduce_dbt.executor, &task->reduce_dbt.etask); + if (ucc_unlikely(UCC_OK != status)) { + tl_error(UCC_TASK_LIB(task), + "failed to perform dt reduction"); + return status; + } + EXEC_TASK_WAIT(task->reduce_dbt.etask, status); + } + + task->reduce_dbt.state = RECV; + UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_reduce_dbt_start", 0); + return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); +} + +ucc_status_t ucc_tl_ucp_reduce_dbt_finalize(ucc_coll_task_t *coll_task) +{ + ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t); + + if (task->reduce_dbt.scratch_mc_header) { + ucc_mc_free(task->reduce_dbt.scratch_mc_header); + } + + return ucc_tl_ucp_coll_finalize(coll_task); +} + +ucc_status_t ucc_tl_ucp_reduce_dbt_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_team_t *tl_team; + ucc_tl_ucp_task_t *task; + ucc_rank_t rank, size; + ucc_memory_type_t mtype; + ucc_datatype_t dt; + size_t count; + size_t data_size; + ucc_status_t status; + + task = ucc_tl_ucp_init_task(coll_args, team); + task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; + task->super.post = ucc_tl_ucp_reduce_dbt_start; + task->super.progress = ucc_tl_ucp_reduce_dbt_progress; + task->super.finalize = ucc_tl_ucp_reduce_dbt_finalize; + tl_team = TASK_TEAM(task); + rank = UCC_TL_TEAM_RANK(tl_team); + size = UCC_TL_TEAM_SIZE(tl_team); + ucc_dbt_build_trees(rank, size, &task->reduce_dbt.trees[0], + &task->reduce_dbt.trees[1]); + + if (coll_args->args.root == rank) { + count = coll_args->args.dst.info.count; + dt = coll_args->args.dst.info.datatype; + mtype = coll_args->args.dst.info.mem_type; + } else { + count = coll_args->args.src.info.count; + dt = coll_args->args.src.info.datatype; + mtype = coll_args->args.src.info.mem_type; + } + data_size = count * ucc_dt_size(dt); + task->reduce_dbt.scratch_mc_header = NULL; + status = ucc_mc_alloc(&task->reduce_dbt.scratch_mc_header, 3 * data_size, + mtype); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + task->reduce_dbt.scratch = task->reduce_dbt.scratch_mc_header->addr; + *task_h = &task->super; + return UCC_OK; +} diff --git a/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c 
b/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c index 11a2abc859..ca5457dfb4 100644 --- a/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c +++ b/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c @@ -22,7 +22,8 @@ static inline void get_sbuf_rbuf(ucc_tl_ucp_task_t *task, size_t block_count, size_t dt_size = ucc_dt_size(args->dst.info.datatype); void *scratch = task->reduce_scatter_kn.scratch; ucc_knomial_pattern_t *p = &task->reduce_scatter_kn.p; - size_t offset, local_seg_offset, local_seg_count; + size_t offset, local_seg_count; + ptrdiff_t local_seg_offset; if (ucc_knomial_pattern_loop_first_iteration(p)) { *sbuf = ((KN_NODE_PROXY == p->node_type) || UCC_IS_INPLACE(*args)) diff --git a/src/components/tl/ucp/tl_ucp.c b/src/components/tl/ucp/tl_ucp.c index 83fa7dceeb..1ee970e715 100644 --- a/src/components/tl/ucp/tl_ucp.c +++ b/src/components/tl/ucp/tl_ucp.c @@ -126,7 +126,7 @@ ucc_config_field_t ucc_tl_ucp_lib_config_table[] = { ucc_offsetof(ucc_tl_ucp_lib_config_t, allreduce_sra_kn_radix), UCC_CONFIG_TYPE_UINT_RANGED}, - {"ALLREDUCE_SRA_KN_PIPELINE", "n", + {"ALLREDUCE_SRA_KN_PIPELINE", "auto", "Pipelining settings for SRA Knomial allreduce algorithm", ucc_offsetof(ucc_tl_ucp_lib_config_t, allreduce_sra_kn_pipeline), UCC_CONFIG_TYPE_PIPELINE_PARAMS}, diff --git a/src/components/tl/ucp/tl_ucp_coll.c b/src/components/tl/ucp/tl_ucp_coll.c index e3dd1782af..0efd285db7 100644 --- a/src/components/tl/ucp/tl_ucp_coll.c +++ b/src/components/tl/ucp/tl_ucp_coll.c @@ -42,6 +42,10 @@ const ucc_tl_ucp_default_alg_desc_t .select_str = UCC_TL_UCP_BCAST_DEFAULT_ALG_SELECT_STR, .str_get_fn = NULL }, + { + .select_str = UCC_TL_UCP_REDUCE_DEFAULT_ALG_SELECT_STR, + .str_get_fn = NULL + }, { .select_str = UCC_TL_UCP_REDUCE_SCATTER_DEFAULT_ALG_SELECT_STR, .str_get_fn = NULL @@ -223,6 +227,8 @@ static inline int alg_id_from_str(ucc_coll_type_t coll_type, const char *str) return ucc_tl_ucp_alltoallv_alg_from_str(str); case UCC_COLL_TYPE_BCAST: return ucc_tl_ucp_bcast_alg_from_str(str); + case UCC_COLL_TYPE_REDUCE: + return ucc_tl_ucp_reduce_alg_from_str(str); case UCC_COLL_TYPE_REDUCE_SCATTER: return ucc_tl_ucp_reduce_scatter_alg_from_str(str); case UCC_COLL_TYPE_REDUCE_SCATTERV: @@ -239,6 +245,7 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, ucc_base_coll_init_fn_t *init) { ucc_status_t status = UCC_OK; + if (alg_id_str) { alg_id = alg_id_from_str(coll_type, alg_id_str); } @@ -271,6 +278,9 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, case UCC_TL_UCP_ALLREDUCE_ALG_SLIDING_WINDOW: *init = ucc_tl_ucp_allreduce_sliding_window_init; break; + case UCC_TL_UCP_ALLREDUCE_ALG_DBT: + *init = ucc_tl_ucp_allreduce_dbt_init; + break; default: status = UCC_ERR_INVALID_PARAM; break; @@ -284,6 +294,9 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, case UCC_TL_UCP_BCAST_ALG_SAG_KNOMIAL: *init = ucc_tl_ucp_bcast_sag_knomial_init; break; + case UCC_TL_UCP_BCAST_ALG_DBT: + *init = ucc_tl_ucp_bcast_dbt_init; + break; default: status = UCC_ERR_INVALID_PARAM; break; @@ -313,11 +326,27 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, case UCC_TL_UCP_ALLTOALLV_ALG_HYBRID: *init = ucc_tl_ucp_alltoallv_hybrid_init; break; + case UCC_TL_UCP_ALLTOALLV_ALG_ONESIDED: + *init = ucc_tl_ucp_alltoallv_onesided_init; + break; default: status = UCC_ERR_INVALID_PARAM; break; }; break; + case UCC_COLL_TYPE_REDUCE: + switch (alg_id) { + case UCC_TL_UCP_REDUCE_ALG_KNOMIAL: + *init = 
ucc_tl_ucp_reduce_knomial_init; + break; + case UCC_TL_UCP_REDUCE_ALG_DBT: + *init = ucc_tl_ucp_reduce_dbt_init; + break; + default: + status = UCC_ERR_INVALID_PARAM; + break; + }; + break; case UCC_COLL_TYPE_REDUCE_SCATTER: switch (alg_id) { case UCC_TL_UCP_REDUCE_SCATTER_ALG_RING: diff --git a/src/components/tl/ucp/tl_ucp_coll.h b/src/components/tl/ucp/tl_ucp_coll.h index a4def89286..cb4df40bc5 100644 --- a/src/components/tl/ucp/tl_ucp_coll.h +++ b/src/components/tl/ucp/tl_ucp_coll.h @@ -11,12 +11,13 @@ #include "tl_ucp.h" #include "schedule/ucc_schedule_pipelined.h" #include "coll_patterns/recursive_knomial.h" +#include "coll_patterns/double_binary_tree.h" #include "components/mc/base/ucc_mc_base.h" #include "components/ec/ucc_ec.h" #include "tl_ucp_tag.h" #define UCC_UUNITS_AUTO_RADIX 4 -#define UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR 7 +#define UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR 8 ucc_status_t ucc_tl_ucp_team_default_score_str_alloc(ucc_tl_ucp_team_t *team, char *default_select_str[UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR]); @@ -47,6 +48,7 @@ void ucc_tl_ucp_team_default_score_str_free( return; \ } \ ucc_ee_executor_task_finalize(_etask); \ + _etask = NULL; \ if (ucc_unlikely(status < 0)) { \ tl_error(UCC_TASK_LIB(task), _errmsg); \ task->super.status = status; \ @@ -209,6 +211,11 @@ typedef struct ucc_tl_ucp_task { ucc_rank_t dist; uint32_t radix; } bcast_kn; + struct { + ucc_dbt_single_tree_t t1; + ucc_dbt_single_tree_t t2; + int state; + } bcast_dbt; struct { ucc_rank_t dist; ucc_rank_t max_dist; @@ -220,6 +227,16 @@ typedef struct ucc_tl_ucp_task { ucc_ee_executor_task_t *etask; ucc_ee_executor_t *executor; } reduce_kn; + struct { + int state; + ucc_dbt_single_tree_t trees[2]; + int reduction_comp[2]; + int send_comp[2]; + void *scratch; + ucc_mc_buffer_header_t *scratch_mc_header; + ucc_ee_executor_task_t *etask; + ucc_ee_executor_t *executor; + } reduce_dbt; struct { ucc_rank_t dist; ucc_rank_t max_dist; @@ -245,6 +262,9 @@ typedef struct ucc_tl_ucp_task { } alltoallv_hybrid; struct { ucc_mc_buffer_header_t *scratch_mc_header; + ucc_ee_executor_task_t *etask; + void *src; + void *dst; ucc_rank_t iteration; int phase; } alltoall_bruck; @@ -391,6 +411,9 @@ static inline ucc_status_t ucc_tl_ucp_test(ucc_tl_ucp_task_t *task) #define UCC_TL_UCP_TASK_RECV_COMPLETE(_task) \ (((_task)->tagged.recv_posted == (_task)->tagged.recv_completed)) +#define UCC_TL_UCP_TASK_SEND_COMPLETE(_task) \ + (((_task)->tagged.send_posted == (_task)->tagged.send_completed)) + static inline ucc_status_t ucc_tl_ucp_test_recv(ucc_tl_ucp_task_t *task) { int polls = 0; @@ -407,6 +430,22 @@ static inline ucc_status_t ucc_tl_ucp_test_recv(ucc_tl_ucp_task_t *task) return UCC_INPROGRESS; } +static inline ucc_status_t ucc_tl_ucp_test_send(ucc_tl_ucp_task_t *task) +{ + int polls = 0; + + if (UCC_TL_UCP_TASK_SEND_COMPLETE(task)) { + return UCC_OK; + } + while (polls++ < task->n_polls) { + if (UCC_TL_UCP_TASK_SEND_COMPLETE(task)) { + return UCC_OK; + } + ucp_worker_progress(UCC_TL_UCP_TASK_TEAM(task)->worker->ucp_worker); + } + return UCC_INPROGRESS; +} + #define UCC_TL_UCP_TASK_RING_P2P_COMPLETE(_task) \ ((((_task)->tagged.send_posted - (_task)->tagged.send_completed) <= 1) && \ ((_task)->tagged.recv_posted == (_task)->tagged.recv_completed)) @@ -427,6 +466,32 @@ static inline ucc_status_t ucc_tl_ucp_test_ring(ucc_tl_ucp_task_t *task) return UCC_INPROGRESS; } +#define UCC_TL_UCP_TASK_ONESIDED_P2P_COMPLETE(_task) \ + (((_task)->onesided.put_posted == (_task)->onesided.put_completed) && \ + ((_task)->onesided.get_posted == 
(_task)->onesided.get_completed)) + +#define UCC_TL_UCP_TASK_ONESIDED_SYNC_COMPLETE(_task, _end) \ + (*((long *)(TASK_ARGS(_task).global_work_buffer)) == _end) + +static inline ucc_status_t ucc_tl_ucp_test_onesided(ucc_tl_ucp_task_t *task, + int sync_end) +{ + int polls = 0; + + if (UCC_TL_UCP_TASK_ONESIDED_P2P_COMPLETE(task) && + UCC_TL_UCP_TASK_ONESIDED_SYNC_COMPLETE(task, sync_end)) { + return UCC_OK; + } + while (polls++ < task->n_polls) { + if (UCC_TL_UCP_TASK_ONESIDED_P2P_COMPLETE(task) && + UCC_TL_UCP_TASK_ONESIDED_SYNC_COMPLETE(task, sync_end)) { + return UCC_OK; + } + ucp_worker_progress(UCC_TL_UCP_TASK_TEAM(task)->worker->ucp_worker); + } + return UCC_INPROGRESS; +} + ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, ucc_coll_type_t coll_type, ucc_memory_type_t mem_type, diff --git a/src/components/tl/ucp/tl_ucp_context.c b/src/components/tl/ucp/tl_ucp_context.c index e00109ad95..6da05132ba 100644 --- a/src/components/tl/ucp/tl_ucp_context.c +++ b/src/components/tl/ucp/tl_ucp_context.c @@ -162,12 +162,13 @@ UCC_CLASS_INIT_FUNC(ucc_tl_ucp_context_t, "failed to read ucp configuration", err_cfg_read, self); ucp_params.field_mask = - UCP_PARAM_FIELD_FEATURES | UCP_PARAM_FIELD_TAG_SENDER_MASK; + UCP_PARAM_FIELD_FEATURES | UCP_PARAM_FIELD_TAG_SENDER_MASK | UCP_PARAM_FIELD_NAME; ucp_params.features = UCP_FEATURE_TAG | UCP_FEATURE_AM; if (params->params.mask & UCC_CONTEXT_PARAM_FIELD_MEM_PARAMS) { ucp_params.features |= UCP_FEATURE_RMA | UCP_FEATURE_AMO64; } ucp_params.tag_sender_mask = UCC_TL_UCP_TAG_SENDER_MASK; + ucp_params.name = "UCC_UCP_CONTEXT"; if (params->estimated_num_ppn > 0) { ucp_params.field_mask |= UCP_PARAM_FIELD_ESTIMATED_NUM_PPN; diff --git a/src/components/tl/ucp/tl_ucp_sendrecv.h b/src/components/tl/ucp/tl_ucp_sendrecv.h index 9f234cb039..ab815bad71 100644 --- a/src/components/tl/ucp/tl_ucp_sendrecv.h +++ b/src/components/tl/ucp/tl_ucp_sendrecv.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) Meta Platforms, Inc. and affiliates. 2022. * * See file LICENSE for terms. 
@@ -254,16 +254,18 @@ ucc_tl_ucp_resolve_p2p_by_va(ucc_tl_ucp_team_t *team, void *va, ucp_ep_h *ep, keys = PTR_OFFSET(base_offset, (section_offset * 3)); for (int i = 0; i < ctx->n_rinfo_segs; i++) { - if ((uint64_t)va >= (uint64_t)team->va_base[i] && - (uint64_t)va < (uint64_t)team->va_base[i] + team->base_length[i]) { + uint64_t base = (uint64_t)team->va_base[i]; + uint64_t end = base + team->base_length[i]; + if ((uint64_t)va >= base && + (uint64_t)va < end) { *segment = i; break; } key_offset += key_sizes[i]; } - if (0 > *segment) { + if (ucc_unlikely(0 > *segment)) { tl_error(UCC_TL_TEAM_LIB(team), - "attempt to perform one-sided operation on non-registered memory"); + "attempt to perform one-sided operation on non-registered memory %p", va); return UCC_ERR_NOT_FOUND; } if (ucc_unlikely(NULL == UCC_TL_UCP_REMOTE_RKEY(ctx, peer, *segment))) { diff --git a/src/core/ucc_constructor.c b/src/core/ucc_constructor.c index 2cabdc2f32..c113d2ea56 100644 --- a/src/core/ucc_constructor.c +++ b/src/core/ucc_constructor.c @@ -15,6 +15,7 @@ #include "utils/profile/ucc_profile.h" #include "ucc/api/ucc_version.h" #include +#include static ucc_status_t ucc_check_config_file(void) { @@ -93,100 +94,106 @@ static ucc_status_t init_lib_paths(void) UCC_CONFIG_REGISTER_TABLE(ucc_global_config_table, "UCC global", NULL, ucc_global_config, &ucc_config_global_list) +static pthread_mutex_t ucc_constructor_mutex = PTHREAD_MUTEX_INITIALIZER; + ucc_status_t ucc_constructor(void) { - ucc_global_config_t *cfg = &ucc_global_config; - ucc_status_t status; + ucc_global_config_t *cfg = &ucc_global_config; + ucc_status_t status = UCC_OK; Dl_info dl_info; int ret; - if (!cfg->initialized) { - cfg->initialized = 1; - status = ucc_config_parser_fill_opts( - &ucc_global_config, UCC_CONFIG_GET_TABLE(ucc_global_config_table), - "UCC_", 1); - if (UCC_OK != status) { - ucc_error("failed to parse global options"); - return status; - } + pthread_mutex_lock(&ucc_constructor_mutex); + if (cfg->initialized) { + goto exit_unlock_mutex; + } - if (UCC_OK != (status = init_lib_paths())) { - ucc_error("failed to init ucc components path"); - return status; - } + cfg->initialized = 1; + status = ucc_config_parser_fill_opts( + &ucc_global_config, UCC_CONFIG_GET_TABLE(ucc_global_config_table), + "UCC_", 1); + if (UCC_OK != status) { + ucc_error("failed to parse global options"); + goto exit_unlock_mutex; + } - status = ucc_check_config_file(); - if (UCC_OK != status && UCC_ERR_NOT_FOUND != status) { - /* bail only in case of real error */ - return status; - } + if (UCC_OK != (status = init_lib_paths())) { + ucc_error("failed to init ucc components path"); + goto exit_unlock_mutex; + } - status = ucc_components_load("cl", &cfg->cl_framework); - if (UCC_OK != status) { - ucc_error("no CL components were found in the " - "ucc modules dir: %s", - cfg->component_path); - return status; - } - status = ucc_component_check_scores_uniq(&cfg->cl_framework); - if (UCC_OK != status) { - ucc_error("CLs must have distinct uniq default scores"); - return status; - } - status = ucc_components_load("tl", &cfg->tl_framework); - if (UCC_OK != status) { - /* not critical - some CLs may operate w/o use of TL */ - ucc_debug("no TL components were found in the " - "ucc modules dir: %s", - cfg->component_path); - } - status = ucc_component_check_scores_uniq(&cfg->tl_framework); - if (UCC_OK != status) { - ucc_error("TLs must have distinct uniq default scores"); - return status; - } - status = ucc_components_load("mc", &cfg->mc_framework); - if (UCC_OK != status) { 
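        /* unlike the TL case above, memory components are mandatory: without
         * at least one MC, UCC cannot allocate or copy collective buffers */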
- ucc_error("no memory components were found in the " - "ucc modules dir: %s", + status = ucc_check_config_file(); + if (UCC_OK != status && UCC_ERR_NOT_FOUND != status) { + /* bail only in case of real error */ + goto exit_unlock_mutex; + } + + status = ucc_components_load("cl", &cfg->cl_framework); + if (UCC_OK != status) { + ucc_error("no CL components were found in the " + "ucc modules dir: %s", cfg->component_path); + goto exit_unlock_mutex; + } + status = ucc_component_check_scores_uniq(&cfg->cl_framework); + if (UCC_OK != status) { + ucc_error("CLs must have distinct uniq default scores"); + goto exit_unlock_mutex; + } + status = ucc_components_load("tl", &cfg->tl_framework); + if (UCC_OK != status) { + /* not critical - some CLs may operate w/o use of TL */ + ucc_debug("no TL components were found in the " + "ucc modules dir: %s", cfg->component_path); + } + status = ucc_component_check_scores_uniq(&cfg->tl_framework); + if (UCC_OK != status) { + ucc_error("TLs must have distinct uniq default scores"); + goto exit_unlock_mutex; + } + status = ucc_components_load("mc", &cfg->mc_framework); + if (UCC_OK != status) { + ucc_error("no memory components were found in the " + "ucc modules dir: %s", cfg->component_path); + goto exit_unlock_mutex; + } + status = ucc_components_load("ec", &cfg->ec_framework); + if (status != UCC_OK) { + if (status == UCC_ERR_NOT_FOUND) { + ucc_info("no execution components were found in the " + "ucc modules dir: %s. " + "Triggered operations might not work", cfg->component_path); - return status; - } - status = ucc_components_load("ec", &cfg->ec_framework); - if (status != UCC_OK) { - if (status == UCC_ERR_NOT_FOUND) { - ucc_info("no execution components were found in the " - "ucc modules dir: %s. " - "Triggered operations might not work", - cfg->component_path); - } else { - ucc_error("failed to load execution components %d (%s)", - status, ucc_status_string(status)); - return status; - } + } else { + ucc_error("failed to load execution components %d (%s)", + status, ucc_status_string(status)); + goto exit_unlock_mutex; } + } - if (UCC_OK != ucc_local_proc_info_init()) { - ucc_error("failed to initialize local proc info"); - return status; - } + if (UCC_OK != ucc_local_proc_info_init()) { + ucc_error("failed to initialize local proc info"); + goto exit_unlock_mutex; + } #ifdef HAVE_PROFILING - ucc_profile_init(cfg->profile_mode, cfg->profile_file, - cfg->profile_log_size); + ucc_profile_init(cfg->profile_mode, cfg->profile_file, + cfg->profile_log_size); #endif - if (ucc_global_config.log_component.log_level >= UCC_LOG_LEVEL_INFO) { - ret = dladdr(ucc_init_version, &dl_info); - if (ret == 0) { - ucc_error("failed to get ucc_init_version handler"); - return UCC_ERR_NO_MESSAGE; - } - ucc_info("version: %s, loaded from: %s, cfg file: %s", - ucc_get_version_string(), dl_info.dli_fname, - ucc_global_config.file_cfg ? - ucc_global_config.file_cfg->filename: "n/a"); + if (ucc_global_config.log_component.log_level >= UCC_LOG_LEVEL_INFO) { + ret = dladdr(ucc_init_version, &dl_info); + if (ret == 0) { + ucc_error("failed to get ucc_init_version handler"); + status = UCC_ERR_NO_RESOURCE; + goto exit_unlock_mutex; } + ucc_info("version: %s, loaded from: %s, cfg file: %s", + ucc_get_version_string(), dl_info.dli_fname, + ucc_global_config.file_cfg ? 
+ ucc_global_config.file_cfg->filename: "n/a"); } - return UCC_OK; + +exit_unlock_mutex: + pthread_mutex_unlock(&ucc_constructor_mutex); + return status; } __attribute__((destructor)) static void ucc_destructor(void) diff --git a/src/core/ucc_global_opts.h b/src/core/ucc_global_opts.h index 54079ad6fc..203ca65e9d 100644 --- a/src/core/ucc_global_opts.h +++ b/src/core/ucc_global_opts.h @@ -35,8 +35,8 @@ typedef struct ucc_global_config { /* Limit for profiling log size */ size_t profile_log_size; - char * cfg_filename; - ucc_file_config_t * file_cfg; + char *cfg_filename; + ucc_file_config_t *file_cfg; } ucc_global_config_t; extern ucc_global_config_t ucc_global_config; diff --git a/src/ucc/api/ucc.h b/src/ucc/api/ucc.h index c7c0ce10b0..a269dfb940 100644 --- a/src/ucc/api/ucc.h +++ b/src/ucc/api/ucc.h @@ -1337,7 +1337,7 @@ struct ucc_ep_map_cb { * @ingroup UCC_TEAM_DT */ typedef enum { - UCC_EP_MAP_FULL = 1, /*!< The ep range of the team spans all eps from a context*/ + UCC_EP_MAP_FULL = 1, /*!< The ep range of the team spans all eps from a context. */ UCC_EP_MAP_STRIDED = 2, /*!< The ep range of the team can be described by the 2 values: start, stride.*/ UCC_EP_MAP_ARRAY = 3, /*!< The ep range is given as an array of intergers that map the ep in the team to the team_context rank. */ diff --git a/src/utils/arch/cuda_def.h b/src/utils/arch/cuda_def.h index 7f690531e2..d758846c9d 100644 --- a/src/utils/arch/cuda_def.h +++ b/src/utils/arch/cuda_def.h @@ -74,6 +74,15 @@ static inline ucc_status_t cuda_error_to_ucc_status(cudaError_t cuda_status) } \ } while(0) +#define CUDADRV_CHECK(_cmd) \ + /* coverity[dead_error_line] */ \ + do { \ + ucc_status_t _cuda_status = CUDADRV_FUNC(_cmd); \ + if (ucc_unlikely(_cuda_status != UCC_OK)) { \ + return _cuda_status; \ + } \ + } while(0) + #define CUDA_CHECK_GOTO(_cmd, _label, _cuda_status) \ do { \ _cuda_status = CUDA_FUNC(_cmd); \ diff --git a/src/utils/ucc_coll_utils.c b/src/utils/ucc_coll_utils.c index 3921f1262e..75a49400e2 100644 --- a/src/utils/ucc_coll_utils.c +++ b/src/utils/ucc_coll_utils.c @@ -266,10 +266,11 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size, ucc_rank_t full_size, int need_free, int is64) { int is_const_stride = 0; - ucc_ep_map_t map = {0}; + ucc_ep_map_t map; int64_t stride; ucc_rank_t i; + map.type = (ucc_ep_map_type_t)0; map.ep_num = size; if (size > 1) { /* try to detect strided pattern */ @@ -303,6 +304,7 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size, map.array.map = (void *)(*array); map.array.elem_size = is64 ? 
sizeof(uint64_t) : sizeof(ucc_rank_t); } + return map; } @@ -359,6 +361,12 @@ void ucc_coll_args_str(const ucc_coll_args_t *args, ucc_rank_t trank, strncat(hdr, tmp, left); } + if (UCC_IS_PERSISTENT(*args)) { + ucc_snprintf_safe(tmp, sizeof(tmp), " persistent"); + left = COLL_ARGS_HEADER_STR_MAX_SIZE - strlen(hdr); + strncat(hdr, tmp, left); + } + if (ucc_coll_args_is_rooted(ct)) { ucc_snprintf_safe(tmp, sizeof(tmp), " root %u", root); left = COLL_ARGS_HEADER_STR_MAX_SIZE - strlen(hdr); @@ -636,6 +644,18 @@ ucc_ep_map_t ucc_ep_map_create_reverse(ucc_rank_t size) return map; } +int ucc_ep_map_is_identity(const ucc_ep_map_t *map) +{ + if ((map->type == UCC_EP_MAP_FULL) || + ((map->type == UCC_EP_MAP_STRIDED) && + (map->strided.start == 0) && + (map->strided.stride == 1))) { + return 1; + } else { + return 0; + } +} + static inline int ucc_ep_map_is_reverse(ucc_ep_map_t *map, int reversed_reordered_flag) { diff --git a/src/utils/ucc_coll_utils.h b/src/utils/ucc_coll_utils.h index 2d3a919f08..c5cb2ef392 100644 --- a/src/utils/ucc_coll_utils.h +++ b/src/utils/ucc_coll_utils.h @@ -71,7 +71,11 @@ #define UCC_COLL_ARGS_ACTIVE_SET(_args) \ ((_args)->mask & UCC_COLL_ARGS_FIELD_ACTIVE_SET) -#define UCC_MEM_TYPE_MASK_FULL -1 +#define UCC_MEM_TYPE_MASK_FULL (UCC_BIT(UCC_MEMORY_TYPE_HOST) | \ + UCC_BIT(UCC_MEMORY_TYPE_CUDA) | \ + UCC_BIT(UCC_MEMORY_TYPE_CUDA_MANAGED) | \ + UCC_BIT(UCC_MEMORY_TYPE_ROCM) | \ + UCC_BIT(UCC_MEMORY_TYPE_ROCM_MANAGED)) static inline int ucc_coll_args_is_reduction(ucc_coll_type_t ct) { @@ -119,29 +123,6 @@ ucc_coll_args_get_displacement(const ucc_coll_args_t *args, return ((uint32_t *)displacements)[idx]; } -static inline const char* ucc_mem_type_str(ucc_memory_type_t ct) -{ - switch((int)ct) { - case UCC_MEMORY_TYPE_HOST: - return "Host"; - case UCC_MEMORY_TYPE_CUDA: - return "Cuda"; - case UCC_MEMORY_TYPE_CUDA_MANAGED: - return "CudaManaged"; - case UCC_MEMORY_TYPE_ROCM: - return "Rocm"; - case UCC_MEMORY_TYPE_ROCM_MANAGED: - return "RocmManaged"; - case UCC_MEMORY_TYPE_ASYMMETRIC: - return "asymmetric"; - case UCC_MEMORY_TYPE_NOT_APPLY: - return "n/a"; - default: - break; - } - return "invalid"; -} - static inline size_t ucc_coll_args_get_total_count(const ucc_coll_args_t *args, const ucc_count_t *counts, ucc_rank_t size) @@ -244,6 +225,8 @@ ucc_status_t ucc_ep_map_create_nested(ucc_ep_map_t *base_map, ucc_ep_map_t *sub_map, ucc_ep_map_t *out); +int ucc_ep_map_is_identity(const ucc_ep_map_t *map); + void ucc_ep_map_destroy_nested(ucc_ep_map_t *out); void ucc_ep_map_destroy(ucc_ep_map_t *map); diff --git a/src/utils/ucc_compiler_def.h b/src/utils/ucc_compiler_def.h index 41d13ecb78..b204df67f3 100644 --- a/src/utils/ucc_compiler_def.h +++ b/src/utils/ucc_compiler_def.h @@ -26,6 +26,7 @@ #define ucc_snprintf_safe snprintf #define ucc_likely ucs_likely #define ucc_unlikely ucs_unlikely +#define ucc_string_split ucs_string_split /** * Prevent compiler from reordering instructions diff --git a/src/utils/ucc_log.h b/src/utils/ucc_log.h index 21ad88dd05..b480ee55ae 100644 --- a/src/utils/ucc_log.h +++ b/src/utils/ucc_log.h @@ -187,4 +187,27 @@ static inline const char* ucc_reduction_op_str(ucc_reduction_op_t op) } } +static inline const char* ucc_mem_type_str(ucc_memory_type_t ct) +{ + switch((int)ct) { + case UCC_MEMORY_TYPE_HOST: + return "Host"; + case UCC_MEMORY_TYPE_CUDA: + return "Cuda"; + case UCC_MEMORY_TYPE_CUDA_MANAGED: + return "CudaManaged"; + case UCC_MEMORY_TYPE_ROCM: + return "Rocm"; + case UCC_MEMORY_TYPE_ROCM_MANAGED: + return "RocmManaged"; + case 
UCC_MEMORY_TYPE_ASYMMETRIC: + return "asymmetric"; + case UCC_MEMORY_TYPE_NOT_APPLY: + return "n/a"; + default: + break; + } + return "invalid"; +} + #endif diff --git a/src/utils/ucc_parser.c b/src/utils/ucc_parser.c index fff69e47c6..6db8ef52f8 100644 --- a/src/utils/ucc_parser.c +++ b/src/utils/ucc_parser.c @@ -86,25 +86,28 @@ static inline int ucc_check_range(char *range_str, ucc_rank_t *begin, char **range = ucc_str_split(range_str, "-"); char *str_end; unsigned n_range; + long pbegin, pend; if (!range) { goto split_err; } n_range = ucc_str_split_count(range); - *begin = (size_t) strtol(range[0], &str_end, 10); - *end = *begin; + pbegin = strtol(range[0], &str_end, 10); + pend = pbegin; - if (n_range > 2 || *str_end != '\0' || *begin < 0) { + if (n_range > 2 || *str_end != '\0' || pbegin < 0) { goto val_err; } if (n_range == 2) { - *end = (size_t) strtol(range[1], &str_end, 10); - if (*str_end != '\0' || *end < 0) { + pend = strtol(range[1], &str_end, 10); + if (*str_end != '\0' || pend < 0) { goto val_err; } } + *begin = (ucc_rank_t)pbegin; + *end = (ucc_rank_t)pend; ucc_str_split_free(range); return 1; @@ -852,7 +855,7 @@ int ucc_config_sscanf_uint_ranged(const char *buf, void *dest, if (!r) { goto err_tokens; } - r->mtypes = -1; //mask all types + r->mtypes = UCC_MEM_TYPE_MASK_FULL; r->start = 0; r->end = SIZE_MAX; @@ -905,7 +908,7 @@ int ucc_config_sprintf_uint_ranged(char *buf, size_t max, const void *src, ucc_list_for_each(r, &s->ranges, list_elem) { ucs_memunits_to_str(r->start, tmp_start, tmp_max); ucs_memunits_to_str(r->end, tmp_end, tmp_max); - if (r->mtypes == -1) { + if (r->mtypes == UCC_MEM_TYPE_MASK_FULL) { ucc_snprintf_safe(buf, max, "%s-%s:%u", tmp_start, tmp_end, r->value); } else { diff --git a/src/utils/ucc_parser.h b/src/utils/ucc_parser.h index 17a64c3df4..517dd88be8 100644 --- a/src/utils/ucc_parser.h +++ b/src/utils/ucc_parser.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. 
*/ @@ -168,8 +168,13 @@ static inline ucc_status_t ucc_config_parser_set_value(void *opts, ucc_config_field_t *fields, const char *name, const char *value) { - ucs_status_t status = - ucs_config_parser_set_value(opts, fields, name, value); + ucs_status_t status; + +#if UCS_HAVE_PARSER_SET_VALUE_TABLE_PREFIX + status = ucs_config_parser_set_value(opts, fields, NULL, name, value); +#else + status = ucs_config_parser_set_value(opts, fields, name, value); +#endif return ucs_status_to_ucc_status(status); } @@ -263,8 +268,29 @@ int ucc_config_sprintf_uint_ranged(char *buf, size_t max, const void *src, ucs_status_t ucc_config_clone_uint_ranged(const void *src, void *dest, const void *arg); -void ucc_config_release_uint_ranged(void *ptr, const void *arg); +void ucc_config_release_uint_ranged(void *ptr, const void *arg); + +#ifdef UCS_HAVE_PARSER_CONFIG_DOC +#define UCC_CONFIG_TYPE_UINT_RANGED \ + { \ + ucc_config_sscanf_uint_ranged, ucc_config_sprintf_uint_ranged, \ + ucc_config_clone_uint_ranged, ucc_config_release_uint_ranged, \ + ucs_config_help_generic, ucs_config_doc_nop, \ + "[-:[mtype]:value," \ + "-:[mtype]:value,...,]default_value\n" \ + "# value and default_value can be \"auto\"" \ + } +#define UCC_CONFIG_TYPE_PIPELINE_PARAMS \ + { \ + ucc_config_sscanf_pipeline_params, ucc_config_sprintf_pipeline_params, \ + ucc_config_clone_pipeline_params, \ + ucc_config_release_pipeline_params, ucs_config_help_generic, \ + ucs_config_doc_nop, \ + "thresh=:fragsize=:nfrags=" \ + ":pdepth=:" \ + } +#else #define UCC_CONFIG_TYPE_UINT_RANGED \ { \ ucc_config_sscanf_uint_ranged, ucc_config_sprintf_uint_ranged, \ @@ -280,7 +306,8 @@ void ucc_config_release_uint_ranged(void *ptr, const void *arg); ucc_config_clone_pipeline_params, \ ucc_config_release_pipeline_params, ucs_config_help_generic, \ "thresh=:fragsize=:nfrags=" \ - ":pdepth=:" \ + ":pdepth=:" \ } +#endif #endif diff --git a/src/utils/ucc_rcache.h b/src/utils/ucc_rcache.h index dd1d6298e6..46993caacb 100644 --- a/src/utils/ucc_rcache.h +++ b/src/utils/ucc_rcache.h @@ -8,6 +8,7 @@ #include #include +#include //TODO: handle external events #define ucc_rcache_t ucs_rcache_t @@ -25,8 +26,17 @@ static inline ucc_status_t ucc_rcache_create(const ucc_rcache_params_t *params, const char *name, ucc_rcache_t **rcache_p) { +#ifndef UCS_HAVE_RCACHE_REGION_ALIGNMENT + ucc_rcache_params_t params_dup = *params; + params_dup.alignment = UCS_PGT_ADDR_ALIGN; + params_dup.max_alignment = ucc_get_page_size(); + + return ucs_status_to_ucc_status(ucs_rcache_create( + ¶ms_dup, name, NULL, rcache_p)); +#else return ucs_status_to_ucc_status(ucs_rcache_create( - params, name, NULL, rcache_p)); + params, name, NULL, rcache_p)); +#endif } /* [arg] parameter allows passing additional information from mem_reg callabck. 
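/*
 * The two UCS_HAVE_RCACHE_REGION_ALIGNMENT branches here and in
 * ucc_rcache_get() below bridge a UCX API change: newer UCX takes the
 * region alignment per ucs_rcache_get() call, while older UCX expects the
 * alignment limits in the create-time params. Either way the UCC-level
 * call looks the same, e.g. (the region release call name is assumed from
 * the usual ucs_rcache mapping):
 *
 *   ucc_rcache_region_t *region;
 *   if (UCC_OK == ucc_rcache_get(rcache, buf, len, NULL, &region)) {
 *       // ... use the registered memory ...
 *       ucc_rcache_region_put(rcache, region);
 *   }
 */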
@@ -36,9 +46,16 @@ static inline ucc_status_t ucc_rcache_get(ucc_rcache_t *rcache, void *address, size_t length, void *arg, ucc_rcache_region_t **region_p) { +#ifdef UCS_HAVE_RCACHE_REGION_ALIGNMENT return ucs_status_to_ucc_status(ucs_rcache_get( rcache, address, length, + ucc_get_page_size(), PROT_READ | PROT_WRITE, arg, region_p)); +#else + return ucs_status_to_ucc_status(ucs_rcache_get( + rcache, address, length, + PROT_READ | PROT_WRITE, arg, region_p)); +#endif } #endif diff --git a/test/gtest/coll/test_allreduce.cc b/test/gtest/coll/test_allreduce.cc index 3384f997e7..ef0a8aed24 100644 --- a/test/gtest/coll/test_allreduce.cc +++ b/test/gtest/coll/test_allreduce.cc @@ -331,6 +331,43 @@ TYPED_TEST(test_allreduce_alg, sra_knomial_pipelined) { } } +TYPED_TEST(test_allreduce_alg, dbt) { + int n_procs = 15; + ucc_job_env_t env = {{"UCC_CL_BASIC_TUNE", "inf"}, + {"UCC_TL_UCP_TUNE", "allreduce:@dbt:inf"}}; + UccJob job(n_procs, UccJob::UCC_JOB_CTX_GLOBAL, env); + UccTeam_h team = job.create_team(n_procs); + int repeat = 3; + UccCollCtxVec ctxs; + std::vector mt = {UCC_MEMORY_TYPE_HOST}; + + if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA)) { + mt.push_back(UCC_MEMORY_TYPE_CUDA); + } + if (UCC_OK == ucc_mc_available( UCC_MEMORY_TYPE_CUDA_MANAGED)) { + mt.push_back( UCC_MEMORY_TYPE_CUDA_MANAGED); + } + + for (auto count : {65536, 123567}) { + for (auto inplace : {TEST_NO_INPLACE, TEST_INPLACE}) { + for (auto m : mt) { + SET_MEM_TYPE(m); + this->set_inplace(inplace); + this->data_init(n_procs, TypeParam::dt, count, ctxs, true); + UccReq req(team, ctxs); + + for (auto i = 0; i < repeat; i++) { + req.start(); + req.wait(); + EXPECT_EQ(true, this->data_validate(ctxs)); + this->reset(ctxs); + } + this->data_fini(ctxs); + } + } + } +} + TYPED_TEST(test_allreduce_alg, rab) { int n_procs = 15; ucc_job_env_t env = {{"UCC_CL_HIER_TUNE", "allreduce:@rab:0-inf:inf"}, diff --git a/test/gtest/coll/test_bcast.cc b/test/gtest/coll/test_bcast.cc index ace5f50a9b..6d80816a31 100644 --- a/test/gtest/coll/test_bcast.cc +++ b/test/gtest/coll/test_bcast.cc @@ -8,6 +8,7 @@ using Param_0 = std::tuple; using Param_1 = std::tuple; +using Param_2 = std::tuple; class test_bcast : public UccCollArgs, public ucc::test { @@ -241,42 +242,49 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(1,3,65536), // count ::testing::Values(0,1))); // root -class test_bcast_alg : public test_bcast +class test_bcast_alg : public test_bcast, + public ::testing::WithParamInterface {}; -UCC_TEST_F(test_bcast_alg, 2step) { - int n_procs = 15; - ucc_job_env_t env = {{"UCC_CL_HIER_TUNE", "bcast:@2step:0-inf:inf"}, - {"UCC_CLS", "all"}}; - UccJob job(n_procs, UccJob::UCC_JOB_CTX_GLOBAL, env); - UccTeam_h team = job.create_team(n_procs); - int repeat = 1; +UCC_TEST_P(test_bcast_alg,) { + const ucc_memory_type_t mt = std::get<0>(GetParam()); + const ucc_job_env_t env = std::get<1>(GetParam()); + const int count = std::get<2>(GetParam()); + const int n_procs = std::get<3>(GetParam()); + UccJob job(n_procs, UccJob::UCC_JOB_CTX_GLOBAL, env); + UccTeam_h team = job.create_team(n_procs); + int repeat = 1; UccCollCtxVec ctxs; - std::vector mt = {UCC_MEMORY_TYPE_HOST}; - if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA)) { - mt.push_back(UCC_MEMORY_TYPE_CUDA); - } - if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA_MANAGED)) { - mt.push_back(UCC_MEMORY_TYPE_CUDA_MANAGED); - } - - for (auto count : {8, 65536}) { - for (int root = 0; root < n_procs; root++) { - for (auto m : mt) { - this->set_root(root); - SET_MEM_TYPE(m); - this->data_init(n_procs, 
UCC_DT_INT8, count, ctxs, false); - UccReq req(team, ctxs); - - for (auto i = 0; i < repeat; i++) { - req.start(); - req.wait(); - EXPECT_EQ(true, this->data_validate(ctxs)); - this->reset(ctxs); - } - this->data_fini(ctxs); - } + SET_MEM_TYPE(mt); + for (int root = 0; root < n_procs; root++) { + this->set_root(root); + this->data_init(n_procs, UCC_DT_INT8, count, ctxs, false); + UccReq req(team, ctxs); + + for (auto i = 0; i < repeat; i++) { + req.start(); + req.wait(); + EXPECT_EQ(true, this->data_validate(ctxs)); + this->reset(ctxs); } + this->data_fini(ctxs); } } + +ucc_job_env_t two_step_env = {{"UCC_CL_HIER_TUNE", "bcast:@2step:0-inf:inf"}, + {"UCC_CLS", "all"}}; +ucc_job_env_t dbt_env = {{"UCC_TL_UCP_TUNE", "bcast:@dbt:0-inf:inf"}, + {"UCC_CLS", "basic"}}; +INSTANTIATE_TEST_CASE_P( + , test_bcast_alg, + ::testing::Combine( +#ifdef HAVE_CUDA + ::testing::Values(UCC_MEMORY_TYPE_HOST, UCC_MEMORY_TYPE_CUDA, + UCC_MEMORY_TYPE_CUDA_MANAGED), +#else + ::testing::Values(UCC_MEMORY_TYPE_HOST), +#endif + ::testing::Values(two_step_env, dbt_env), //env + ::testing::Values(8, 65536), // count + ::testing::Values(15,16))); // n_procs diff --git a/test/gtest/coll/test_reduce.cc b/test/gtest/coll/test_reduce.cc index 393e97decc..0f8bfc034f 100644 --- a/test/gtest/coll/test_reduce.cc +++ b/test/gtest/coll/test_reduce.cc @@ -23,17 +23,9 @@ class test_reduce : public UccCollArgs, public testing::Test { ucc_coll_args_t *coll = (ucc_coll_args_t*) calloc(1, sizeof(ucc_coll_args_t)); - ctxs[r] = (gtest_ucc_coll_ctx_t*)calloc(1, - sizeof(gtest_ucc_coll_ctx_t)); - ctxs[r]->args = coll; - - coll->coll_type = UCC_COLL_TYPE_REDUCE; - coll->op = T::redop; - coll->root = root; - coll->src.info.mem_type = mem_type; - coll->src.info.count = (ucc_count_t)count; - coll->src.info.datatype = dt; - + ctxs[r] = (gtest_ucc_coll_ctx_t*)calloc(1, + sizeof(gtest_ucc_coll_ctx_t)); + ctxs[r]->args = coll; ctxs[r]->init_buf = ucc_malloc(ucc_dt_size(dt) * count, "init buf"); EXPECT_NE(ctxs[r]->init_buf, nullptr); @@ -48,6 +40,21 @@ class test_reduce : public UccCollArgs, public testing::Test { ptr[i] = (typename T::type)((i + r + 1) % 8); } + coll->coll_type = UCC_COLL_TYPE_REDUCE; + coll->op = T::redop; + coll->root = root; + if (r != root || !inplace) { + coll->src.info.mem_type = mem_type; + coll->src.info.count = (ucc_count_t)count; + coll->src.info.datatype = dt; + UCC_CHECK(ucc_mc_alloc(&ctxs[r]->src_mc_header, + ucc_dt_size(dt) * count, mem_type)); + coll->src.info.buffer = ctxs[r]->src_mc_header->addr; + UCC_CHECK(ucc_mc_memcpy(coll->src.info.buffer, + ctxs[r]->init_buf, + ucc_dt_size(dt) * count, mem_type, + UCC_MEMORY_TYPE_HOST)); + } if (r == root) { coll->dst.info.mem_type = mem_type; coll->dst.info.count = (ucc_count_t)count; @@ -65,15 +72,6 @@ class test_reduce : public UccCollArgs, public testing::Test { coll->mask |= UCC_COLL_ARGS_FIELD_FLAGS; coll->flags |= UCC_COLL_ARGS_FLAG_IN_PLACE; } - if (r != root || !inplace) { - UCC_CHECK(ucc_mc_alloc(&ctxs[r]->src_mc_header, - ucc_dt_size(dt) * count, mem_type)); - coll->src.info.buffer = ctxs[r]->src_mc_header->addr; - UCC_CHECK(ucc_mc_memcpy(coll->src.info.buffer, - ctxs[r]->init_buf, - ucc_dt_size(dt) * count, mem_type, - UCC_MEMORY_TYPE_HOST)); - } if (persistent) { coll->mask |= UCC_COLL_ARGS_FIELD_FLAGS; coll->flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; @@ -282,42 +280,58 @@ TYPED_TEST(test_reduce_cuda, multiple_inplace_managed) { template class test_reduce_avg_order : public test_reduce { }; +template class test_reduce_dbt : public test_reduce { +}; + +#define 
TEST_DECLARE_WITH_ENV(_env, _n_procs) \ + { \ + UccJob job(_n_procs, UccJob::UCC_JOB_CTX_GLOBAL, _env); \ + UccTeam_h team = job.create_team(_n_procs); \ + int repeat = 3; \ + UccCollCtxVec ctxs; \ + std::vector mt = {UCC_MEMORY_TYPE_HOST}; \ + if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA)) { \ + mt.push_back(UCC_MEMORY_TYPE_CUDA); \ + } \ + if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA_MANAGED)) { \ + mt.push_back(UCC_MEMORY_TYPE_CUDA_MANAGED); \ + } \ + for (auto count : {5, 256, 65536}) { \ + for (auto inplace : {TEST_NO_INPLACE, TEST_INPLACE}) { \ + for (auto m : mt) { \ + CHECK_TYPE_OP_SKIP(TypeParam::dt, TypeParam::redop, m); \ + SET_MEM_TYPE(m); \ + this->set_inplace(inplace); \ + this->data_init(_n_procs, TypeParam::dt, count, ctxs, true); \ + UccReq req(team, ctxs); \ + CHECK_REQ_NOT_SUPPORTED_SKIP(req, this->data_fini(ctxs)); \ + for (auto i = 0; i < repeat; i++) { \ + req.start(); \ + req.wait(); \ + EXPECT_EQ(true, this->data_validate(ctxs)); \ + this->reset(ctxs); \ + } \ + this->data_fini(ctxs); \ + } \ + } \ + } \ + } + TYPED_TEST_CASE(test_reduce_avg_order, CollReduceTypeOpsAvg); +TYPED_TEST_CASE(test_reduce_dbt, CollReduceTypeOpsHost); -TYPED_TEST(test_reduce_avg_order, avg_post_op) -{ - int n_procs = 15; - ucc_job_env_t env = {{"UCC_TL_UCP_REDUCE_AVG_PRE_OP", "0"}}; - UccJob job(n_procs, UccJob::UCC_JOB_CTX_GLOBAL, env); - UccTeam_h team = job.create_team(n_procs); - int repeat = 3; - UccCollCtxVec ctxs; - std::vector mt = {UCC_MEMORY_TYPE_HOST}; +ucc_job_env_t post_op_env = {{"UCC_TL_UCP_REDUCE_AVG_PRE_OP", "0"}}; +ucc_job_env_t reduce_dbt_env = {{"UCC_TL_UCP_TUNE", "reduce:@dbt:0-inf:inf"}, + {"UCC_CLS", "basic"}}; - if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA)) { - mt.push_back(UCC_MEMORY_TYPE_CUDA); - } - if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA_MANAGED)) { - mt.push_back(UCC_MEMORY_TYPE_CUDA_MANAGED); - } +TYPED_TEST(test_reduce_avg_order, avg_post_op) { + TEST_DECLARE_WITH_ENV(post_op_env, 15); +} - for (auto count : {4, 256, 65536}) { - for (auto inplace : {TEST_NO_INPLACE, TEST_INPLACE}) { - for (auto m : mt) { - CHECK_TYPE_OP_SKIP(TypeParam::dt, TypeParam::redop, m); - SET_MEM_TYPE(m); - this->set_inplace(inplace); - this->data_init(n_procs, TypeParam::dt, count, ctxs, true); - UccReq req(team, ctxs); - CHECK_REQ_NOT_SUPPORTED_SKIP(req, this->data_fini(ctxs)); - for (auto i = 0; i < repeat; i++) { - req.start(); - req.wait(); - EXPECT_EQ(true, this->data_validate(ctxs)); - this->reset(ctxs); - } - this->data_fini(ctxs); - } - } - } +TYPED_TEST(test_reduce_dbt, reduce_dbt_shift) { + TEST_DECLARE_WITH_ENV(reduce_dbt_env, 15); +} + +TYPED_TEST(test_reduce_dbt, reduce_dbt_mirror) { + TEST_DECLARE_WITH_ENV(reduce_dbt_env, 16); } diff --git a/test/gtest/core/test_mc_reduce.cc b/test/gtest/core/test_mc_reduce.cc index e528119835..674808ccdb 100644 --- a/test/gtest/core/test_mc_reduce.cc +++ b/test/gtest/core/test_mc_reduce.cc @@ -101,6 +101,7 @@ class test_mc_reduce : public testing::Test { std::cerr << "failed to destory cuda stream" << std::endl; return UCC_ERR_NO_MESSAGE; } + ee_context = NULL; } #endif return status; @@ -110,11 +111,11 @@ class test_mc_reduce : public testing::Test { { ucc_status_t status; - status = alloc_executor(mtype); + status = alloc_bufs(mtype, n); if (UCC_OK != status) { return status; } - return alloc_bufs(mtype, n); + return alloc_executor(mtype); } ucc_status_t alloc_bufs(ucc_memory_type_t mtype, size_t n) @@ -192,9 +193,6 @@ class test_mc_reduce : public testing::Test { virtual void TearDown() override { 
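        /* rationale suggested by the hunks below: the executor is now freed
         * inside each test body, before results are copied back, so pending
         * executor work is drained first rather than at TearDown() time */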
free_bufs(mem_type); - if (executor) { - free_executor(); - } ucc_mc_finalize(); } @@ -246,6 +244,9 @@ class test_mc_reduce : public testing::Test { GTEST_SKIP(); } ASSERT_EQ(status, UCC_OK); + if (executor) { + free_executor(); + } if (mt != UCC_MEMORY_TYPE_HOST) { ucc_mc_memcpy(this->res_h, this->res_d, this->COUNT * sizeof(*this->res_d), @@ -272,6 +273,9 @@ class test_mc_reduce : public testing::Test { GTEST_SKIP(); } ASSERT_EQ(status, UCC_OK); + if (executor) { + free_executor(); + } if (mt != UCC_MEMORY_TYPE_HOST) { ucc_mc_memcpy(this->res_h, this->res_d, this->COUNT * sizeof(*this->res_d), @@ -305,6 +309,9 @@ class test_mc_reduce : public testing::Test { GTEST_SKIP(); } ASSERT_EQ(status, UCC_OK); + if (executor) { + free_executor(); + } if (mt != UCC_MEMORY_TYPE_HOST) { ucc_mc_memcpy(this->res_h, this->res_d, this->COUNT * sizeof(*this->res_d), diff --git a/test/mpi/buffer.cc b/test/mpi/buffer.cc index 69c6d4bc58..f31f42c553 100644 --- a/test/mpi/buffer.cc +++ b/test/mpi/buffer.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -25,7 +25,7 @@ void init_buffer_host(void *buf, size_t count, int _value) } void init_buffer(void *_buf, size_t count, ucc_datatype_t dt, - ucc_memory_type_t mt, int value) + ucc_memory_type_t mt, int value, int offset) { void *buf = NULL; if (mt == UCC_MEMORY_TYPE_CUDA || mt == UCC_MEMORY_TYPE_ROCM) { @@ -37,6 +37,8 @@ void init_buffer(void *_buf, size_t count, ucc_datatype_t dt, std::cerr << "Unsupported mt\n"; MPI_Abort(MPI_COMM_WORLD, -1); } + + value += offset; switch(dt) { case UCC_DT_INT8: init_buffer_host(buf, count, value); diff --git a/test/mpi/main.cc b/test/mpi/main.cc index b719f366f0..f4a571fa14 100644 --- a/test/mpi/main.cc +++ b/test/mpi/main.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include "test_mpi.h" int test_rand_seed = -1; @@ -25,7 +26,7 @@ static std::vector colls = { UCC_COLL_TYPE_SCATTER, UCC_COLL_TYPE_SCATTERV}; static std::vector onesided_colls = { - UCC_COLL_TYPE_ALLTOALL}; + UCC_COLL_TYPE_ALLTOALL, UCC_COLL_TYPE_ALLTOALLV}; static std::vector mtypes = { UCC_MEMORY_TYPE_HOST}; @@ -82,7 +83,7 @@ static std::vector str_split(const char *value, const char *delimit return rst; } -void PrintHelp() +void print_help() { std::cout << "-c, --colls \n\tlist of collectives: " @@ -135,6 +136,23 @@ static ucc_test_mpi_team_t team_str_to_type(std::string team) throw std::string("incorrect team type: ") + team; } +static std::string team_type_to_str(ucc_test_mpi_team_t team) +{ + switch (team) { + case TEAM_WORLD: + return "world"; + case TEAM_SPLIT_HALF: + return "half"; + case TEAM_SPLIT_ODD_EVEN: + return "odd_even"; + case TEAM_REVERSE: + return "reverse"; + default: + break; + } + throw std::string("incorrect team type: "); +} + static ucc_coll_type_t coll_str_to_type(std::string coll) { if (coll == "barrier") { @@ -168,10 +186,8 @@ static ucc_coll_type_t coll_str_to_type(std::string coll) } else if (coll == "scatterv") { return UCC_COLL_TYPE_SCATTERV; } else { - std::cerr << "incorrect coll type: " << coll << std::endl; - PrintHelp(); + throw std::string("incorrect coll type: ") + coll; } - throw std::string("incorrect coll type: ") + coll; } static ucc_memory_type_t mtype_str_to_type(std::string mtype) @@ -394,18 +410,55 @@ int init_rand_seed(int user_seed) return seed; } -void PrintInfo() +void print_info() { int world_rank; - MPI_Comm_rank(MPI_COMM_WORLD, 
&world_rank); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); if (world_rank) { return; } - std::cout << "\n===== UCC MPI TEST INFO =======\n" - << " seed : " << std::to_string(test_rand_seed) << "\n" - << "===============================\n" - << std::endl; + + std::cout << "===== UCC MPI TEST INFO =======" << std::endl; + std::cout <<"seed: " << std::to_string(test_rand_seed) << std::endl; + std::cout <<"collectives: "; + for (const auto &c : colls) { + std::cout << ucc_coll_type_str(c); + if (c != colls.back()) { + std::cout << ", "; + } else { + std::cout << std::endl; + } + } + std::cout <<"data types: "; + for (const auto &d : dtypes) { + std::cout << ucc_datatype_str(d); + if (d != dtypes.back()) { + std::cout << ", "; + } else { + std::cout << std::endl; + } + } + + std::cout <<"memory types: "; + for (const auto &m : mtypes) { + std::cout << ucc_mem_type_str(m); + if (m != mtypes.back()) { + std::cout << ", "; + } else { + std::cout << std::endl; + } + } + + std::cout <<"teams: "; + for (const auto &t : teams) { + std::cout << team_type_to_str(t); + if (t != teams.back()) { + std::cout << ", "; + } else { + std::cout << std::endl; + } + } } void ProcessArgs(int argc, char** argv) @@ -521,8 +574,8 @@ void ProcessArgs(int argc, char** argv) int main(int argc, char *argv[]) { - int failed = 0; - int total_done_skipped_failed[4] = {0}; + int failed = 0; + int total_done_skipped_failed[ucc_ilog2(UCC_COLL_TYPE_LAST) + 1][4] = {0}; std::chrono::steady_clock::time_point begin; int size, required, provided, completed, rank; UccTestMpi *test; @@ -548,7 +601,7 @@ int main(int argc, char *argv[]) if (!err.empty() || show_help) { if (rank == 0) { std::cerr << "ParseArgs error:" << err << "\n\n"; - PrintHelp(); + print_help(); } goto mpi_exit; } @@ -589,7 +642,7 @@ int main(int argc, char *argv[]) test->set_max_size(test_max_size); test_rand_seed = init_rand_seed(test_rand_seed); - PrintInfo(); + print_info(); for (auto inpl : inplace) { for (auto pers : persistent) { @@ -625,19 +678,20 @@ int main(int argc, char *argv[]) } std::cout << std::flush; - total_done_skipped_failed[0] = test->results.size(); for (auto s : test->results) { - switch(s) { + int coll_num = ucc_ilog2(std::get<0>(s)); + switch(std::get<1>(s)) { case UCC_OK: - total_done_skipped_failed[1]++; + total_done_skipped_failed[coll_num][1]++; break; case UCC_ERR_NOT_IMPLEMENTED: case UCC_ERR_LAST: - total_done_skipped_failed[2]++; + total_done_skipped_failed[coll_num][2]++; break; default: - total_done_skipped_failed[3]++; + total_done_skipped_failed[coll_num][3]++; } + total_done_skipped_failed[coll_num][0]++; } MPI_Iallreduce(MPI_IN_PLACE, total_done_skipped_failed, sizeof(total_done_skipped_failed)/sizeof(int), @@ -650,21 +704,60 @@ int main(int argc, char *argv[]) if (0 == rank) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + ucc_coll_type_t coll_type; + int num_all = 0, num_skipped = 0, num_done =0, num_failed = 0; + std::ios iostate(nullptr); + + iostate.copyfmt(std::cout); std::cout << "\n===== UCC MPI TEST REPORT =====\n" << - " total tests : " << total_done_skipped_failed[0] << "\n" << - " passed : " << total_done_skipped_failed[1] << "\n" << - " skipped : " << total_done_skipped_failed[2] << "\n" << - " failed : " << total_done_skipped_failed[3] << "\n" << - " elapsed : " << + std::setw(22) << std::left << "collective" << + std::setw(10) << std::right << "tests" << + std::setw(10) << std::right << "passed" << + std::setw(10) << std::right << "failed" << + std::setw(10) << std::right << 
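/*
 * Why ucc_ilog2() can index the stats table above: UCC_COLL_TYPE_* values are
 * one-hot bit flags (the report loop below walks them with coll_type << 1),
 * so the log2 of a flag is a dense 0..N row index. Standalone sketch of that
 * mapping; the helper name is illustrative, not part of the patch:
 */
static inline int coll_type_to_row(unsigned flag)
{
    int row = 0;
    /* flag must be a power of two; shift until the set bit reaches bit 0 */
    while (flag >>= 1) {
        row++;
    }
    return row; /* e.g. coll_type_to_row(1u << 5) == 5 */
}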
"skipped" << std::endl; + + for (coll_type = (ucc_coll_type_t)1; + coll_type < UCC_COLL_TYPE_LAST; + coll_type = (ucc_coll_type_t)(coll_type << 1)) + { + int coll_num = ucc_ilog2(coll_type); + if (total_done_skipped_failed[coll_num][0] == 0) { + continue; + } + num_all += total_done_skipped_failed[coll_num][0]; + num_done += total_done_skipped_failed[coll_num][1]; + num_skipped += total_done_skipped_failed[coll_num][2]; + num_failed += total_done_skipped_failed[coll_num][3]; + std::cout << + std::setw(22) << std::left << ucc_coll_type_str(coll_type) << + std::setw(10) << std::right << total_done_skipped_failed[coll_num][0] << + std::setw(10) << std::right << total_done_skipped_failed[coll_num][1] << + std::setw(10) << std::right << total_done_skipped_failed[coll_num][3] << + std::setw(10) << std::right << total_done_skipped_failed[coll_num][2] << + std::endl; + + } + std::cout << + " \n===== UCC MPI TEST SUMMARY =====\n" << + "total tests: " << num_all << "\n" << + "passed: " << num_done << "\n" << + "skipped: " << num_skipped << "\n" << + "failed: " << num_failed << "\n" << + "elapsed: " << std::chrono::duration_cast(end - begin).count() << "s" << std::endl; + std::cout.copyfmt(iostate); /* check if all tests have been skipped */ - if (total_done_skipped_failed[0] == total_done_skipped_failed[2]) { + if (num_all == num_skipped) { std::cout << "\n All tests have been skipped, indicating most likely " "a problem\n"; failed = 1; } + + if (num_failed != 0) { + failed = 1; + } } test_exit: diff --git a/test/mpi/test_allgather.cc b/test/mpi/test_allgather.cc index 12b603e1cf..ebca8c4c95 100644 --- a/test/mpi/test_allgather.cc +++ b/test/mpi/test_allgather.cc @@ -53,6 +53,7 @@ ucc_status_t TestAllgather::set_input(int iter_persistent) int rank; void *buf, *check; + this->iter_persistent = iter_persistent; MPI_Comm_rank(team.comm, &rank); if (inplace) { buf = PTR_OFFSET(rbuf, rank * single_rank_size); @@ -70,18 +71,18 @@ ucc_status_t TestAllgather::set_input(int iter_persistent) ucc_status_t TestAllgather::check() { - int size, completed; + size_t dt_size, single_rank_count; + int size, i; + MPI_Comm_size(team.comm, &size); - size_t single_rank_count = args.dst.info.count / size; - MPI_Datatype mpi_dt = ucc_dt_to_mpi(dt); - MPI_Request req; + single_rank_count = args.dst.info.count / size; + dt_size = ucc_dt_size(dt); + for (i = 0; i < size; i++) { + init_buffer(PTR_OFFSET(check_buf, i * single_rank_count * dt_size), + single_rank_count, dt, UCC_MEMORY_TYPE_HOST, + i * (iter_persistent + 1)); + } - MPI_Iallgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, check_buf, - single_rank_count, mpi_dt, team.comm, &req); - do { - MPI_Test(&req, &completed, MPI_STATUS_IGNORE); - ucc_context_progress(team.ctx); - } while(!completed); return compare_buffers(rbuf, check_buf, single_rank_count * size, dt, mem_type); diff --git a/test/mpi/test_allgatherv.cc b/test/mpi/test_allgatherv.cc index 9554f1c616..a3bfa55d93 100644 --- a/test/mpi/test_allgatherv.cc +++ b/test/mpi/test_allgatherv.cc @@ -82,18 +82,17 @@ ucc_status_t TestAllgatherv::set_input(int iter_persistent) { size_t dt_size = ucc_dt_size(dt); int rank; - void *buf, *check; + void *buf; + this->iter_persistent = iter_persistent; MPI_Comm_rank(team.comm, &rank); if (inplace) { buf = PTR_OFFSET(rbuf, displacements[rank] * dt_size); } else { buf = sbuf; } - check = PTR_OFFSET(check_buf, displacements[rank] * dt_size); init_buffer(buf, counts[rank], dt, mem_type, rank * (iter_persistent + 1)); - UCC_CHECK(ucc_mc_memcpy(check, buf, counts[rank] * dt_size, - 
UCC_MEMORY_TYPE_HOST, mem_type)); + return UCC_OK; } @@ -108,23 +107,19 @@ TestAllgatherv::~TestAllgatherv() { ucc_status_t TestAllgatherv::check() { - MPI_Datatype mpi_dt = ucc_dt_to_mpi(dt); - int total_count = 0; - int size, rank, completed, i; - MPI_Request req; + int total_count = 0; + int size, i; MPI_Comm_size(team.comm, &size); - MPI_Comm_rank(team.comm, &rank); for (i = 0 ; i < size; i++) { total_count += counts[i]; } - MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, check_buf, - (int *)counts, (int *)displacements, mpi_dt, team.comm, - &req); - do { - MPI_Test(&req, &completed, MPI_STATUS_IGNORE); - ucc_context_progress(team.ctx); - } while(!completed); + + for (i = 0; i < size; i++) { + init_buffer(PTR_OFFSET(check_buf, displacements[i] * ucc_dt_size(dt)), + counts[i], dt, UCC_MEMORY_TYPE_HOST, + i * (iter_persistent + 1)); + } return compare_buffers(rbuf, check_buf, total_count, dt, mem_type); } diff --git a/test/mpi/test_alltoall.cc b/test/mpi/test_alltoall.cc index 7597353e6b..a92900265e 100644 --- a/test/mpi/test_alltoall.cc +++ b/test/mpi/test_alltoall.cc @@ -74,6 +74,7 @@ ucc_status_t TestAlltoall::set_input(int iter_persistent) void * buf; int rank, nprocs, completed; + this->iter_persistent = iter_persistent; MPI_Comm_rank(team.comm, &rank); MPI_Comm_size(team.comm, &nprocs); if (inplace) { @@ -99,19 +100,18 @@ ucc_status_t TestAlltoall::set_input(int iter_persistent) ucc_status_t TestAlltoall::check() { - int size, completed; - size_t single_rank_count; - MPI_Request req; + int size, rank, i; + size_t single_rank_count; + MPI_Comm_rank(team.comm, &rank); MPI_Comm_size(team.comm, &size); single_rank_count = args.src.info.count / size; - MPI_Ialltoall(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, check_buf, - single_rank_count, ucc_dt_to_mpi(dt), team.comm, &req); - do { - MPI_Test(&req, &completed, MPI_STATUS_IGNORE); - ucc_context_progress(team.ctx); - } while(!completed); + for (i = 0; i < size; i++) { + init_buffer(PTR_OFFSET(check_buf, i * single_rank_count * ucc_dt_size(dt)), + single_rank_count, dt, UCC_MEMORY_TYPE_HOST, + i * (iter_persistent + 1), single_rank_count * rank); + } return compare_buffers(rbuf, check_buf, single_rank_count * size, dt, mem_type); diff --git a/test/mpi/test_alltoallv.cc b/test/mpi/test_alltoallv.cc index c939ea3968..aaa65b7e10 100644 --- a/test/mpi/test_alltoallv.cc +++ b/test/mpi/test_alltoallv.cc @@ -25,22 +25,26 @@ TestAlltoallv::TestAlltoallv(ucc_test_team_t &_team, TestCaseParams ¶ms) : std::default_random_engine eng; size_t dt_size, count; int rank, nprocs, rank_count; - - dt = params.dt; - dt_size = ucc_dt_size(dt); - count = msgsize / dt_size; - sncounts = 0; - rncounts = 0; - scounts = NULL; - sdispls = NULL; - rcounts = NULL; - rdispls = NULL; - scounts64 = NULL; - sdispls64 = NULL; - rcounts64 = NULL; - rdispls64 = NULL; - count_bits = params.count_bits; - displ_bits = params.displ_bits; + bool is_onesided; + void *work_buf; + + dt = params.dt; + dt_size = ucc_dt_size(dt); + count = msgsize / dt_size; + sncounts = 0; + rncounts = 0; + scounts = NULL; + sdispls = NULL; + rcounts = NULL; + rdispls = NULL; + scounts64 = NULL; + sdispls64 = NULL; + rcounts64 = NULL; + rdispls64 = NULL; + count_bits = params.count_bits; + displ_bits = params.displ_bits; + is_onesided = (params.buffers != NULL); + work_buf = NULL; std::uniform_int_distribution urd(count / 2, count); eng.seed(test_rand_seed); @@ -56,6 +60,10 @@ TestAlltoallv::TestAlltoallv(ucc_test_team_t &_team, TestCaseParams ¶ms) : args.mask = UCC_COLL_ARGS_FIELD_FLAGS; args.flags |= 
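/*
 * Why TestAlltoall::check() above passes an offset: the block this rank
 * receives from peer i is the rank-th slice of peer i's send buffer, and
 * init_buffer() fills a pattern that advances per element, so the expected
 * slice is peer i's seed advanced by single_rank_count * rank elements.
 * Because init_buffer() simply applies `value += offset`, these two calls
 * fill identical bytes (sketch, assuming int32 data on host memory):
 */
static void offset_equivalence_demo(int32_t *a, int32_t *b, size_t n)
{
    init_buffer(a, n, UCC_DT_INT32, UCC_MEMORY_TYPE_HOST, 3, 5);
    init_buffer(b, n, UCC_DT_INT32, UCC_MEMORY_TYPE_HOST, 8);
    /* memcmp(a, b, n * sizeof(*a)) == 0 */
}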
UCC_COLL_ARGS_FLAG_CONTIG_SRC_BUFFER | UCC_COLL_ARGS_FLAG_CONTIG_DST_BUFFER; + if (is_onesided) { + args.mask |= UCC_COLL_ARGS_FIELD_GLOBAL_WORK_BUFFER; + args.flags |= UCC_COLL_ARGS_FLAG_MEM_MAPPED_BUFFERS; + } if (count_bits == TEST_FLAG_VSIZE_64BIT) { args.flags |= UCC_COLL_ARGS_FLAG_COUNT_64BIT; } @@ -92,14 +100,21 @@ TestAlltoallv::TestAlltoallv(ucc_test_team_t &_team, TestCaseParams ¶ms) : if (TEST_SKIP_NONE != skip_reduce(test_skip, team.comm)) { return; } - - UCC_CHECK(ucc_mc_alloc(&sbuf_mc_header, sncounts * dt_size, mem_type)); - UCC_CHECK(ucc_mc_alloc(&rbuf_mc_header, rncounts * dt_size, mem_type)); - sbuf = sbuf_mc_header->addr; - rbuf = rbuf_mc_header->addr; - check_buf = ucc_malloc((sncounts + rncounts) * dt_size, "check buf"); + check_buf = ucc_malloc(rncounts * dt_size, "check buf"); UCC_MALLOC_CHECK(check_buf); + if (!is_onesided) { + UCC_CHECK(ucc_mc_alloc(&sbuf_mc_header, sncounts * dt_size, mem_type)); + UCC_CHECK(ucc_mc_alloc(&rbuf_mc_header, rncounts * dt_size, mem_type)); + sbuf = sbuf_mc_header->addr; + rbuf = rbuf_mc_header->addr; + } else { + sbuf = params.buffers[MEM_SEND_SEGMENT]; + rbuf = params.buffers[MEM_RECV_SEGMENT]; + work_buf = params.buffers[MEM_WORK_SEGMENT]; + args.global_work_buffer = work_buf; + } + args.src.info_v.buffer = sbuf; args.src.info_v.datatype = dt; args.src.info_v.mem_type = mem_type; @@ -140,19 +155,40 @@ TestAlltoallv::TestAlltoallv(ucc_test_team_t &_team, TestCaseParams ¶ms) : args.src.info_v.displacements = (ucc_aint_t*)sdispls; args.dst.info_v.displacements = (ucc_aint_t*)rdispls; } + if (is_onesided) { + MPI_Datatype datatype; + size_t disp_size; + void *ldisp; + int alltoall_status; + + if (TEST_FLAG_VSIZE_64BIT == displ_bits) { + datatype = MPI_LONG; + disp_size = sizeof(uint64_t); + } else { + datatype = MPI_INT; + disp_size = sizeof(uint32_t); + } + ldisp = ucc_calloc(nprocs, disp_size, "displacements"); + UCC_MALLOC_CHECK(ldisp); + alltoall_status = MPI_Alltoall(args.dst.info_v.displacements, 1, + datatype, ldisp, 1, datatype, team.comm); + if (MPI_SUCCESS != alltoall_status) { + std::cerr << "*** MPI ALLTOALL FAILED" << std::endl; + MPI_Abort(MPI_COMM_WORLD, -1); + } + args.dst.info_v.displacements = (ucc_aint_t *)ldisp; + } UCC_CHECK(set_input()); UCC_CHECK_SKIP(ucc_collective_init(&args, &req, team.team), test_skip); } ucc_status_t TestAlltoallv::set_input(int iter_persistent) { - size_t dt_size = ucc_dt_size(dt); - int rank; + int rank; + this->iter_persistent = iter_persistent; MPI_Comm_rank(team.comm, &rank); init_buffer(sbuf, sncounts, dt, mem_type, rank * (iter_persistent + 1)); - UCC_CHECK(ucc_mc_memcpy(check_buf, sbuf, sncounts * dt_size, - UCC_MEMORY_TYPE_HOST, mem_type)); return UCC_OK; } @@ -171,20 +207,25 @@ TestAlltoallv::~TestAlltoallv() ucc_status_t TestAlltoallv::check() { - size_t dt_size = ucc_dt_size(dt); MPI_Request req; - int completed; - void *check; + int i, size, rank, completed; + + MPI_Comm_size(team.comm, &size); + MPI_Comm_rank(team.comm, &rank); - check = PTR_OFFSET(check_buf, sncounts * dt_size); - MPI_Ialltoallv(check_buf, scounts, sdispls, ucc_dt_to_mpi(dt), check, - rcounts, rdispls, ucc_dt_to_mpi(dt), team.comm, &req); + MPI_Ialltoall(sdispls, 1, MPI_INT, scounts, 1, MPI_INT, team.comm, &req); do { MPI_Test(&req, &completed, MPI_STATUS_IGNORE); ucc_context_progress(team.ctx); } while(!completed); - return compare_buffers(rbuf, check, rncounts, dt, mem_type); + for (i = 0; i < size; i++) { + init_buffer(PTR_OFFSET(check_buf, rdispls[i] * ucc_dt_size(dt)), + rcounts[i], dt, 
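/*
 * For the one-sided alltoallv path above, each rank must know the
 * displacement at which every peer expects its data, so the destination
 * displacement table is transposed across the team with MPI_Alltoall before
 * ucc_collective_init(). Standalone sketch of that exchange (32-bit
 * displacements; the 64-bit path uses MPI_LONG the same way):
 */
static void exchange_displs(const int *my_dst_displs, int *peer_displs,
                            MPI_Comm comm)
{
    /* afterwards, peer_displs[i] is the displacement rank i assigned to us */
    if (MPI_SUCCESS != MPI_Alltoall(my_dst_displs, 1, MPI_INT,
                                    peer_displs, 1, MPI_INT, comm)) {
        MPI_Abort(MPI_COMM_WORLD, -1);
    }
}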
 UCC_MEMORY_TYPE_HOST,
+                    i * (iter_persistent + 1), scounts[i]);
+    }
+
+    return compare_buffers(rbuf, check_buf, rncounts, dt, mem_type);
 }

 std::string TestAlltoallv::str()
diff --git a/test/mpi/test_bcast.cc b/test/mpi/test_bcast.cc
index 1a541bcfee..080cbb436f 100644
--- a/test/mpi/test_bcast.cc
+++ b/test/mpi/test_bcast.cc
@@ -45,6 +45,7 @@ ucc_status_t TestBcast::set_input(int iter_persistent)
     size_t count = msgsize / dt_size;
     int rank;

+    this->iter_persistent = iter_persistent;
     MPI_Comm_rank(team.comm, &rank);
     if (rank == root) {
         init_buffer(sbuf, count, dt, mem_type, rank * (iter_persistent + 1));
@@ -56,18 +57,12 @@ ucc_status_t TestBcast::check()
 {
-    size_t count = args.src.info.count;
-    MPI_Datatype mpi_dt = ucc_dt_to_mpi(dt);
-    int rank, completed;
-    MPI_Request req;
+    size_t count = args.src.info.count;
+    int rank;

     MPI_Comm_rank(team.comm, &rank);
-    MPI_Ibcast(check_buf, count, mpi_dt, root, team.comm, &req);
-    do {
-        MPI_Test(&req, &completed, MPI_STATUS_IGNORE);
-        ucc_context_progress(team.ctx);
-    } while(!completed);
-
+    init_buffer(check_buf, count, dt, UCC_MEMORY_TYPE_HOST,
+                root * (iter_persistent + 1));
     return (rank == root) ? UCC_OK :
         compare_buffers(sbuf, check_buf, count, dt, mem_type);
diff --git a/test/mpi/test_case.cc b/test/mpi/test_case.cc
index 43ad770e6e..7a37c2ec9c 100644
--- a/test/mpi/test_case.cc
+++ b/test/mpi/test_case.cc
@@ -149,7 +149,14 @@ test_skip_cause_t TestCase::skip_reduce(int skip_cond, test_skip_cause_t cause,
 {
     test_skip_cause_t test_skip;
     test_skip_cause_t skip = skip_cond ? cause : TestCase::test_skip;
-    MPI_Allreduce((void*)&skip, (void*)&test_skip, 1, MPI_INT, MPI_MAX, comm);
+    MPI_Request req;
+    int completed;
+
+    MPI_Iallreduce((void*)&skip, (void*)&test_skip, 1, MPI_INT, MPI_MAX, comm, &req);
+    do {
+        MPI_Test(&req, &completed, MPI_STATUS_IGNORE);
+        tc_progress_ctx();
+    } while(!completed);
     TestCase::test_skip = test_skip;
     return test_skip;
 }
diff --git a/test/mpi/test_mpi.cc b/test/mpi/test_mpi.cc
index 1d89779046..147ce1fd7d 100644
--- a/test/mpi/test_mpi.cc
+++ b/test/mpi/test_mpi.cc
@@ -96,7 +96,7 @@ UccTestMpi::UccTestMpi(int argc, char *argv[], ucc_thread_mode_t _tm,
     ucc_context_config_release(ctx_config);
     if (with_onesided) {
         prev_env = getenv("UCC_TL_UCP_TUNE");
-        setenv("UCC_TL_UCP_TUNE", "alltoall:0-inf:@onesided", 1);
+        setenv("UCC_TL_UCP_TUNE", "alltoall:0-inf:@onesided#alltoallv:0-inf:@onesided", 1);
         UCC_CHECK(ucc_lib_config_read(NULL, NULL, &lib_config));
         UCC_CHECK(ucc_init(&lib_params, lib_config, &onesided_lib));
         ucc_lib_config_release(lib_config);
@@ -474,7 +474,7 @@ void set_gpu_device(test_set_gpu_device_t set_device)
 #endif

-std::vector<ucc_status_t> UccTestMpi::exec_tests(
+std::vector<ucc_test_mpi_result_t> UccTestMpi::exec_tests(
     std::vector<std::shared_ptr<TestCase>> tcs, bool triggered,
     bool persistent)
 {
@@ -483,7 +483,7 @@ std::vector<ucc_status_t> UccTestMpi::exec_tests(
     ucc_status_t status;
     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

-    std::vector<ucc_status_t> rst;
+    std::vector<ucc_test_mpi_result_t> rst;

     for (i = 0; i < n_persistent; i++) {
         for (auto tc: tcs) {
@@ -501,7 +501,7 @@ std::vector<ucc_status_t> UccTestMpi::exec_tests(
                 std::cout << "SKIPPED: " << skip_str(tc->test_skip) << ": "
                           << tc->str() << " " << std::endl;
             }
-            rst.push_back(UCC_ERR_LAST);
+            rst.push_back(std::make_tuple(tc->args.coll_type, UCC_ERR_LAST));
             return rst;
         }
     }
@@ -528,14 +528,14 @@ std::vector<ucc_status_t> UccTestMpi::exec_tests(
             if (UCC_OK != status) {
                 std::cerr << "FAILURE in: " << tc->str() << std::endl;
             }
-            rst.push_back(status);
+            rst.push_back(std::make_tuple(tc->args.coll_type, status));
         }
     }
     return rst;
 }
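/*
 * With exec_tests() returning (coll_type, status) tuples, the report in
 * main.cc reduces to a tally into one row per collective. Sketch of that
 * accumulation, using the same column order as total_done_skipped_failed
 * (total, passed, skipped, failed) and assuming UCC test headers:
 */
static void tally_results(
    const std::vector<std::tuple<ucc_coll_type_t, ucc_status_t>> &results,
    int stats[][4])
{
    for (const auto &r : results) {
        int          row = ucc_ilog2(std::get<0>(r)); /* one-hot -> row index */
        ucc_status_t st  = std::get<1>(r);
        stats[row][0]++;
        if (st == UCC_OK) {
            stats[row][1]++;
        } else if (st == UCC_ERR_NOT_IMPLEMENTED || st == UCC_ERR_LAST) {
            stats[row][2]++; /* counted as skipped */
        } else {
            stats[row][3]++;
        }
    }
}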
 void UccTestMpi::run_all_at_team(ucc_test_team_t &team,
-                                 std::vector<ucc_status_t> &rst)
+                                 std::vector<ucc_test_mpi_result_t> &rst)
 {
     TestCaseParams params;
@@ -586,11 +586,13 @@ void UccTestMpi::run_all_at_team(ucc_test_team_t &team,
     for (auto r : roots) {
         for (auto mt: test_memtypes) {
             if (triggered && !ucc_coll_triggered_supported(mt)) {
-                rst.push_back(UCC_ERR_NOT_IMPLEMENTED);
+                rst.push_back(std::make_tuple(c, UCC_ERR_NOT_IMPLEMENTED));
                 continue;
             }
-            if (c == UCC_COLL_TYPE_ALLTOALL && team.ctx != ctx) {
+            if ((c == UCC_COLL_TYPE_ALLTOALL ||
+                 c == UCC_COLL_TYPE_ALLTOALLV) &&
+                team.ctx != ctx) {
                 /* onesided alltoall */
                 if (mt != UCC_MEMORY_TYPE_HOST) {
                     continue;
@@ -640,10 +642,10 @@ void UccTestMpi::run_all_at_team(ucc_test_team_t &team,
 }

 typedef struct ucc_test_thread {
-    pthread_t                 thread;
-    int                       id;
-    UccTestMpi *              test;
-    std::vector<ucc_status_t> rst;
+    pthread_t                          thread;
+    int                                id;
+    UccTestMpi *                       test;
+    std::vector<ucc_test_mpi_result_t> rst;
 } ucc_test_thread_t;

 static void *thread_start(void *arg)
diff --git a/test/mpi/test_mpi.h b/test/mpi/test_mpi.h
index fcc11b544b..391cb21996 100644
--- a/test/mpi/test_mpi.h
+++ b/test/mpi/test_mpi.h
@@ -145,7 +145,6 @@ static inline const char* skip_str(test_skip_cause_t s) {
     default:
         return "unknown";
     }
-    return NULL;
 }

 static inline const char* team_str(ucc_test_mpi_team_t t) {
@@ -268,7 +267,6 @@ class TestCase {
     size_t msgsize;
     bool inplace;
     bool persistent;
-    ucc_coll_args_t args;
     ucc_coll_req_h req;
     ucc_mc_buffer_header_t *sbuf_mc_header, *rbuf_mc_header;
     void *sbuf;
@@ -278,8 +276,9 @@ class TestCase {
     uint8_t progress_buf[1];
     size_t test_max_size;
     ucc_datatype_t dt;
-
+    int iter_persistent;
 public:
+    ucc_coll_args_t args;
     void mpi_progress(void);
     test_skip_cause_t test_skip;
     static std::shared_ptr<TestCase> init_single(
@@ -305,6 +304,7 @@ class TestCase {
                                   MPI_Comm comm);
 };

+typedef std::tuple<ucc_coll_type_t, ucc_status_t> ucc_test_mpi_result_t;
 class UccTestMpi {
     ucc_thread_mode_t tm;
     ucc_context_h ctx;
@@ -332,14 +332,15 @@ class UccTestMpi {
     std::vector gen_roots(ucc_test_team_t &team);
     std::vector counts_vsize;
     std::vector displs_vsize;
-    std::vector<ucc_status_t> exec_tests(
+    std::vector<ucc_test_mpi_result_t> exec_tests(
         std::vector<std::shared_ptr<TestCase>> tcs, bool triggered,
         bool persistent);
 public:
     std::vector<ucc_test_team_t> teams;
     std::vector<ucc_test_team_t> onesided_teams;
-    void run_all_at_team(ucc_test_team_t &team, std::vector<ucc_status_t> &rst);
-    std::vector<ucc_status_t> results;
+    void run_all_at_team(ucc_test_team_t &team,
+                         std::vector<ucc_test_mpi_result_t> &rst);
+    std::vector<ucc_test_mpi_result_t> results;
     UccTestMpi(int argc, char *argv[], ucc_thread_mode_t tm, int is_local,
                bool with_onesided);
     ~UccTestMpi();
@@ -379,6 +380,9 @@ class UccTestMpi {
                    bool is_onesided = false);
     void progress_ctx() {
         ucc_context_progress(ctx);
+        if (onesided_ctx) {
+            ucc_context_progress(onesided_ctx);
+        }
     }
 };
@@ -523,7 +527,7 @@ class TestScatterv : public TestCase {
 };

 void init_buffer(void *buf, size_t count, ucc_datatype_t dt,
-                 ucc_memory_type_t mt, int value);
+                 ucc_memory_type_t mt, int value, int offset = 0);
 ucc_status_t compare_buffers(void *rst, void *expected, size_t count,
                              ucc_datatype_t dt, ucc_memory_type_t mt);
diff --git a/tools/perf/ucc_pt_benchmark.cc b/tools/perf/ucc_pt_benchmark.cc
index c4ef8c6289..cbaa5d664a 100644
--- a/tools/perf/ucc_pt_benchmark.cc
+++ b/tools/perf/ucc_pt_benchmark.cc
@@ -18,54 +18,61 @@ ucc_pt_benchmark::ucc_pt_benchmark(ucc_pt_benchmark_config cfg,
 {
     switch (cfg.op_type) {
     case UCC_PT_OP_TYPE_ALLGATHER:
-        coll = new ucc_pt_coll_allgather(cfg.dt, cfg.mt, cfg.inplace, comm);
+        coll = new ucc_pt_coll_allgather(cfg.dt, cfg.mt, cfg.inplace,
+                                         cfg.persistent, comm);
         break;
     case UCC_PT_OP_TYPE_ALLGATHERV:
-        coll = new ucc_pt_coll_allgatherv(cfg.dt, cfg.mt,
cfg.inplace, comm); + coll = new ucc_pt_coll_allgatherv(cfg.dt, cfg.mt, cfg.inplace, + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_ALLREDUCE: coll = new ucc_pt_coll_allreduce(cfg.dt, cfg.mt, cfg.op, cfg.inplace, - comm); + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_ALLTOALL: - coll = new ucc_pt_coll_alltoall(cfg.dt, cfg.mt, cfg.inplace, comm); + coll = new ucc_pt_coll_alltoall(cfg.dt, cfg.mt, cfg.inplace, + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_ALLTOALLV: - coll = new ucc_pt_coll_alltoallv(cfg.dt, cfg.mt, cfg.inplace, comm); + coll = new ucc_pt_coll_alltoallv(cfg.dt, cfg.mt, cfg.inplace, + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_BARRIER: coll = new ucc_pt_coll_barrier(comm); break; case UCC_PT_OP_TYPE_BCAST: - coll = new ucc_pt_coll_bcast(cfg.dt, cfg.mt, cfg.root_shift, comm); + coll = new ucc_pt_coll_bcast(cfg.dt, cfg.mt, cfg.root_shift, + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_GATHER: coll = new ucc_pt_coll_gather(cfg.dt, cfg.mt, cfg.inplace, - cfg.root_shift, comm); + cfg.persistent, cfg.root_shift, comm); break; case UCC_PT_OP_TYPE_GATHERV: coll = new ucc_pt_coll_gatherv(cfg.dt, cfg.mt, cfg.inplace, - cfg.root_shift, comm); + cfg.persistent, cfg.root_shift, comm); break; case UCC_PT_OP_TYPE_REDUCE: coll = new ucc_pt_coll_reduce(cfg.dt, cfg.mt, cfg.op, cfg.inplace, - cfg.root_shift, comm); + cfg.persistent, cfg.root_shift, comm); break; case UCC_PT_OP_TYPE_REDUCE_SCATTER: coll = new ucc_pt_coll_reduce_scatter(cfg.dt, cfg.mt, cfg.op, - cfg.inplace, comm); + cfg.inplace, + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_REDUCE_SCATTERV: coll = new ucc_pt_coll_reduce_scatterv(cfg.dt, cfg.mt, cfg.op, - cfg.inplace, comm); + cfg.inplace, cfg.persistent, + comm); break; case UCC_PT_OP_TYPE_SCATTER: coll = new ucc_pt_coll_scatter(cfg.dt, cfg.mt, cfg.inplace, - cfg.root_shift, comm); + cfg.persistent, cfg.root_shift, comm); break; case UCC_PT_OP_TYPE_SCATTERV: coll = new ucc_pt_coll_scatterv(cfg.dt, cfg.mt, cfg.inplace, - cfg.root_shift, comm); + cfg.persistent, cfg.root_shift, comm); break; case UCC_PT_OP_TYPE_MEMCPY: coll = new ucc_pt_op_memcpy(cfg.dt, cfg.mt, cfg.n_bufs, comm); @@ -137,10 +144,11 @@ ucc_status_t ucc_pt_benchmark::run_single_coll_test(ucc_coll_args_t args, double &time) noexcept { - const bool triggered = config.triggered; - ucc_team_h team = comm->get_team(); - ucc_context_h ctx = comm->get_context(); - ucc_status_t st = UCC_OK; + const bool triggered = config.triggered; + const bool persistent = config.persistent; + ucc_team_h team = comm->get_team(); + ucc_context_h ctx = comm->get_context(); + ucc_status_t st = UCC_OK; ucc_coll_req_h req; ucc_ee_h ee; ucc_ev_t comp_ev, *post_ev; @@ -161,10 +169,18 @@ ucc_status_t ucc_pt_benchmark::run_single_coll_test(ucc_coll_args_t args, comp_ev.ev_context_size = 0; } + if (persistent) { + UCCCHECK_GOTO(ucc_collective_init(&args, &req, team), exit_err, st); + } + args.root = config.root % comm->get_size(); for (int i = 0; i < nwarmup + niter; i++) { double s = get_time_us(); - UCCCHECK_GOTO(ucc_collective_init(&args, &req, team), exit_err, st); + + if (!persistent) { + UCCCHECK_GOTO(ucc_collective_init(&args, &req, team), exit_err, st); + } + if (triggered) { comp_ev.req = req; UCCCHECK_GOTO(ucc_collective_triggered_post(ee, &comp_ev), @@ -175,12 +191,16 @@ ucc_status_t ucc_pt_benchmark::run_single_coll_test(ucc_coll_args_t args, } else { UCCCHECK_GOTO(ucc_collective_post(req), free_req, st); } + st = ucc_collective_test(req); while (st > 0) { UCCCHECK_GOTO(ucc_context_progress(ctx), 
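/*
 * The effect of -p on the measured region in run_single_coll_test(): with a
 * persistent request, ucc_collective_init()/finalize() happen once outside
 * the loop, so each iteration times only post + completion. Skeleton of one
 * timed iteration (error handling elided; get_time_us() as used above):
 */
static double time_persistent_iter(ucc_coll_req_h req, ucc_context_h ctx)
{
    double start = get_time_us();
    ucc_collective_post(req);            /* re-post the same request */
    while (ucc_collective_test(req) > 0) {
        ucc_context_progress(ctx);       /* drive UCC until completion */
    }
    return get_time_us() - start;
}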
free_req, st); st = ucc_collective_test(req); } - ucc_collective_finalize(req); + + if (!persistent) { + ucc_collective_finalize(req); + } double f = get_time_us(); if (st != UCC_OK) { goto exit_err; @@ -191,6 +211,11 @@ ucc_status_t ucc_pt_benchmark::run_single_coll_test(ucc_coll_args_t args, args.root = (args.root + config.root_shift) % comm->get_size(); UCCCHECK_GOTO(comm->barrier(), exit_err, st); } + + if (persistent) { + ucc_collective_finalize(req); + } + if (niter != 0) { time /= niter; } diff --git a/tools/perf/ucc_pt_coll.cc b/tools/perf/ucc_pt_coll.cc index a561ea73b4..e013615dd8 100644 --- a/tools/perf/ucc_pt_coll.cc +++ b/tools/perf/ucc_pt_coll.cc @@ -5,11 +5,56 @@ */ #include "ucc_pt_coll.h" +#include "ucc_pt_cuda.h" +#include "utils/ucc_malloc.h" ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_memory_type_t mem_type) { ucc_status_t status; + int cuda_st; + + switch (mem_type) { + case UCC_MEMORY_TYPE_CUDA: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA; + cuda_st = ucc_pt_cudaMalloc(&((*h_ptr)->addr), len); + if (cuda_st != 0) { + return UCC_ERR_NO_MEMORY; + } + cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); + if (cuda_st != 0) { + ucc_pt_cudaFree((*h_ptr)->addr); + delete *h_ptr; + return UCC_ERR_NO_MEMORY; + } + return UCC_OK; + case UCC_MEMORY_TYPE_CUDA_MANAGED: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA_MANAGED; + cuda_st = ucc_pt_cudaMallocManaged(&((*h_ptr)->addr), len); + if (cuda_st != 0) { + return UCC_ERR_NO_MEMORY; + } + cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); + if (cuda_st != 0) { + ucc_pt_cudaFree((*h_ptr)->addr); + delete *h_ptr; + return UCC_ERR_NO_MEMORY; + } + return UCC_OK; + case UCC_MEMORY_TYPE_HOST: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_HOST; + (*h_ptr)->addr = ucc_malloc(len, "perftest data"); + if (!((*h_ptr)->addr)) { + return UCC_ERR_NO_MEMORY; + } + memset((*h_ptr)->addr, 0, len); + return UCC_OK; + default: + break; + } status = ucc_mc_alloc(h_ptr, len, mem_type); if (status != UCC_OK) { @@ -26,6 +71,20 @@ ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_status_t ucc_pt_free(ucc_mc_buffer_header_t *h_ptr) { + switch (h_ptr->mt) { + case UCC_MEMORY_TYPE_CUDA: + case UCC_MEMORY_TYPE_CUDA_MANAGED: + ucc_pt_cudaFree(h_ptr->addr); + delete h_ptr; + return UCC_OK; + case UCC_MEMORY_TYPE_HOST: + ucc_free(h_ptr->addr); + delete h_ptr; + return UCC_OK; + default: + break; + } + return ucc_mc_free(h_ptr); } diff --git a/tools/perf/ucc_pt_coll.h b/tools/perf/ucc_pt_coll.h index 63afc9bd9e..0b92039fab 100644 --- a/tools/perf/ucc_pt_coll.h +++ b/tools/perf/ucc_pt_coll.h @@ -58,7 +58,8 @@ class ucc_pt_coll { class ucc_pt_coll_allgather: public ucc_pt_coll { public: ucc_pt_coll_allgather(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, ucc_pt_comm *communicator); + bool is_inplace, bool is_persistent, + ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -67,7 +68,8 @@ class ucc_pt_coll_allgather: public ucc_pt_coll { class ucc_pt_coll_allgatherv: public ucc_pt_coll { public: ucc_pt_coll_allgatherv(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, ucc_pt_comm *communicator); + bool is_inplace, bool is_persistent, + ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) 
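/*
 * ucc_pt_alloc()/ucc_pt_free() above now handle CUDA, CUDA-managed, and host
 * memory directly and fall back to ucc_mc_alloc()/ucc_mc_free() only for
 * other memory types. Every branch follows one shape; condensed sketch of
 * the CUDA case (this sketch also deletes the header when the allocation
 * itself fails, which the patch leaves to the caller):
 */
static ucc_status_t pt_alloc_cuda(ucc_mc_buffer_header_t **h, size_t len)
{
    *h = new ucc_mc_buffer_header_t;
    (*h)->mt = UCC_MEMORY_TYPE_CUDA;
    if (ucc_pt_cudaMalloc(&(*h)->addr, len) != 0) {
        delete *h;
        return UCC_ERR_NO_MEMORY;
    }
    if (ucc_pt_cudaMemset((*h)->addr, 0, len) != 0) {
        ucc_pt_cudaFree((*h)->addr); /* roll back the device allocation */
        delete *h;
        return UCC_ERR_NO_MEMORY;
    }
    return UCC_OK;
}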
override; void free_args(ucc_pt_test_args_t &args) override; }; @@ -76,7 +78,7 @@ class ucc_pt_coll_allreduce: public ucc_pt_coll { public: ucc_pt_coll_allreduce(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, bool is_inplace, - ucc_pt_comm *communicator); + bool is_persistent, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -85,7 +87,8 @@ class ucc_pt_coll_allreduce: public ucc_pt_coll { class ucc_pt_coll_alltoall: public ucc_pt_coll { public: ucc_pt_coll_alltoall(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, ucc_pt_comm *communicator); + bool is_inplace, bool is_persistent, + ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -94,7 +97,8 @@ class ucc_pt_coll_alltoall: public ucc_pt_coll { class ucc_pt_coll_alltoallv: public ucc_pt_coll { public: ucc_pt_coll_alltoallv(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, ucc_pt_comm *communicator); + bool is_inplace, bool is_persistent, + ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; }; @@ -109,7 +113,7 @@ class ucc_pt_coll_barrier: public ucc_pt_coll { class ucc_pt_coll_bcast: public ucc_pt_coll { public: ucc_pt_coll_bcast(ucc_datatype_t dt, ucc_memory_type mt, int root_shift, - ucc_pt_comm *communicator); + bool is_persistent, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -118,7 +122,7 @@ class ucc_pt_coll_bcast: public ucc_pt_coll { class ucc_pt_coll_gather: public ucc_pt_coll { public: ucc_pt_coll_gather(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, int root_shift, + bool is_inplace, bool is_persistent, int root_shift, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; @@ -128,7 +132,7 @@ class ucc_pt_coll_gather: public ucc_pt_coll { class ucc_pt_coll_gatherv: public ucc_pt_coll { public: ucc_pt_coll_gatherv(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, int root_shift, + bool is_inplace, bool is_persistent, int root_shift, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; @@ -137,8 +141,8 @@ class ucc_pt_coll_gatherv: public ucc_pt_coll { class ucc_pt_coll_reduce: public ucc_pt_coll { public: ucc_pt_coll_reduce(ucc_datatype_t dt, ucc_memory_type mt, - ucc_reduction_op_t op, bool is_inplace, int root_shift, - ucc_pt_comm *communicator); + ucc_reduction_op_t op, bool is_inplace, bool is_persistent, + int root_shift, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -148,7 +152,7 @@ class ucc_pt_coll_reduce_scatter: public ucc_pt_coll { public: ucc_pt_coll_reduce_scatter(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, bool is_inplace, - ucc_pt_comm 
*communicator); + bool is_persistent, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -158,7 +162,7 @@ class ucc_pt_coll_reduce_scatterv: public ucc_pt_coll { public: ucc_pt_coll_reduce_scatterv(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, bool is_inplace, - ucc_pt_comm *communicator); + bool is_persistent, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; }; @@ -166,7 +170,7 @@ class ucc_pt_coll_reduce_scatterv: public ucc_pt_coll { class ucc_pt_coll_scatter: public ucc_pt_coll { public: ucc_pt_coll_scatter(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, int root_shift, + bool is_inplace, bool is_persistent, int root_shift, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; @@ -176,7 +180,7 @@ class ucc_pt_coll_scatter: public ucc_pt_coll { class ucc_pt_coll_scatterv: public ucc_pt_coll { public: ucc_pt_coll_scatterv(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, int root_shift, + bool is_inplace, bool is_persistent, int root_shift, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; diff --git a/tools/perf/ucc_pt_coll_allgather.cc b/tools/perf/ucc_pt_coll_allgather.cc index 76e6084032..b8185dd9e8 100644 --- a/tools/perf/ucc_pt_coll_allgather.cc +++ b/tools/perf/ucc_pt_coll_allgather.cc @@ -12,6 +12,7 @@ ucc_pt_coll_allgather::ucc_pt_coll_allgather(ucc_datatype_t dt, ucc_memory_type mt, bool is_inplace, + bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { @@ -21,16 +22,23 @@ ucc_pt_coll_allgather::ucc_pt_coll_allgather(ucc_datatype_t dt, has_bw_ = true; root_shift_ = 0; - coll_args.mask = 0; - coll_args.coll_type = UCC_COLL_TYPE_ALLGATHER; + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_ALLGATHER; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_allgather::init_args(size_t single_rank_count, diff --git a/tools/perf/ucc_pt_coll_allgatherv.cc b/tools/perf/ucc_pt_coll_allgatherv.cc index 8642322c64..c6c18a7c5a 100644 --- a/tools/perf/ucc_pt_coll_allgatherv.cc +++ b/tools/perf/ucc_pt_coll_allgatherv.cc @@ -12,6 +12,7 @@ ucc_pt_coll_allgatherv::ucc_pt_coll_allgatherv(ucc_datatype_t dt, ucc_memory_type mt, bool is_inplace, + bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -20,16 +21,23 @@ ucc_pt_coll_allgatherv::ucc_pt_coll_allgatherv(ucc_datatype_t dt, has_bw_ = false; root_shift_ = 0; - coll_args.mask = 0; - coll_args.coll_type = UCC_COLL_TYPE_ALLGATHERV; - coll_args.src.info.datatype = dt; - coll_args.src.info.mem_type = mt; + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_ALLGATHERV; + coll_args.src.info.datatype = dt; + 
coll_args.src.info.mem_type = mt; coll_args.dst.info_v.datatype = dt; coll_args.dst.info_v.mem_type = mt; + if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_allgatherv::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_allreduce.cc b/tools/perf/ucc_pt_coll_allreduce.cc index 8234f26dd7..3159dc3a9f 100644 --- a/tools/perf/ucc_pt_coll_allreduce.cc +++ b/tools/perf/ucc_pt_coll_allreduce.cc @@ -12,7 +12,7 @@ ucc_pt_coll_allreduce::ucc_pt_coll_allreduce(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, - bool is_inplace, + bool is_inplace, bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,17 +21,25 @@ ucc_pt_coll_allreduce::ucc_pt_coll_allreduce(ucc_datatype_t dt, has_bw_ = true; root_shift_ = 0; - coll_args.coll_type = UCC_COLL_TYPE_ALLREDUCE; - coll_args.mask = 0; - if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; - coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; - } + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_ALLREDUCE; coll_args.op = op; coll_args.src.info.datatype = dt; coll_args.dst.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.mem_type = mt; + + if (is_inplace) { + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; + } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + + } } ucc_status_t ucc_pt_coll_allreduce::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_alltoall.cc b/tools/perf/ucc_pt_coll_alltoall.cc index f4e9cf57b5..77a2608f7f 100644 --- a/tools/perf/ucc_pt_coll_alltoall.cc +++ b/tools/perf/ucc_pt_coll_alltoall.cc @@ -12,6 +12,7 @@ ucc_pt_coll_alltoall::ucc_pt_coll_alltoall(ucc_datatype_t dt, ucc_memory_type mt, bool is_inplace, + bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -20,16 +21,23 @@ ucc_pt_coll_alltoall::ucc_pt_coll_alltoall(ucc_datatype_t dt, has_bw_ = true; root_shift_ = 0; - coll_args.mask = 0; - coll_args.coll_type = UCC_COLL_TYPE_ALLTOALL; + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_ALLTOALL; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_alltoall::init_args(size_t single_rank_count, diff --git a/tools/perf/ucc_pt_coll_alltoallv.cc b/tools/perf/ucc_pt_coll_alltoallv.cc index 4ba88ec123..6ce68ed032 100644 --- a/tools/perf/ucc_pt_coll_alltoallv.cc +++ b/tools/perf/ucc_pt_coll_alltoallv.cc @@ -12,6 +12,7 @@ ucc_pt_coll_alltoallv::ucc_pt_coll_alltoallv(ucc_datatype_t dt, ucc_memory_type mt, bool is_inplace, + bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -31,6 +32,11 @@ ucc_pt_coll_alltoallv::ucc_pt_coll_alltoallv(ucc_datatype_t dt, if (is_inplace) { coll_args.flags |= 
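/*
 * All of these constructors rely on the same accumulation idiom: zero
 * mask/flags first, then only OR bits in, so UCC_COLL_ARGS_FLAG_IN_PLACE and
 * UCC_COLL_ARGS_FLAG_PERSISTENT compose instead of overwriting each other.
 * Condensed into one helper (illustrative, not part of the patch):
 */
static void set_common_flags(ucc_coll_args_t *args, bool inplace,
                             bool persistent)
{
    args->mask  = 0;
    args->flags = 0;
    if (inplace) {
        args->mask  |= UCC_COLL_ARGS_FIELD_FLAGS;
        args->flags |= UCC_COLL_ARGS_FLAG_IN_PLACE;
    }
    if (persistent) {
        args->mask  |= UCC_COLL_ARGS_FIELD_FLAGS; /* |=, never =, after the first bit */
        args->flags |= UCC_COLL_ARGS_FLAG_PERSISTENT;
    }
}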
UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } + } ucc_status_t ucc_pt_coll_alltoallv::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_bcast.cc b/tools/perf/ucc_pt_coll_bcast.cc index b389228c38..b869c902c1 100644 --- a/tools/perf/ucc_pt_coll_bcast.cc +++ b/tools/perf/ucc_pt_coll_bcast.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_bcast::ucc_pt_coll_bcast(ucc_datatype_t dt, ucc_memory_type mt, - int root_shift, ucc_pt_comm *communicator) + int root_shift, bool is_persistent, + ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = false; @@ -20,10 +21,16 @@ ucc_pt_coll_bcast::ucc_pt_coll_bcast(ucc_datatype_t dt, ucc_memory_type mt, has_bw_ = true; root_shift_ = root_shift; - coll_args.mask = 0; - coll_args.coll_type = UCC_COLL_TYPE_BCAST; + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_BCAST; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_bcast::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_gather.cc b/tools/perf/ucc_pt_coll_gather.cc index e189164484..660356bee8 100644 --- a/tools/perf/ucc_pt_coll_gather.cc +++ b/tools/perf/ucc_pt_coll_gather.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_gather::ucc_pt_coll_gather(ucc_datatype_t dt, - ucc_memory_type mt, bool is_inplace, int root_shift, + ucc_memory_type mt, bool is_inplace, + bool is_persistent, int root_shift, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,15 +22,22 @@ ucc_pt_coll_gather::ucc_pt_coll_gather(ucc_datatype_t dt, root_shift_ = root_shift; coll_args.mask = 0; + coll_args.flags = 0; coll_args.coll_type = UCC_COLL_TYPE_GATHER; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + if (is_inplace) { coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_gather::init_args(size_t single_rank_count, diff --git a/tools/perf/ucc_pt_coll_gatherv.cc b/tools/perf/ucc_pt_coll_gatherv.cc index 6739f241d6..ab8715b3cc 100644 --- a/tools/perf/ucc_pt_coll_gatherv.cc +++ b/tools/perf/ucc_pt_coll_gatherv.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_gatherv::ucc_pt_coll_gatherv(ucc_datatype_t dt, - ucc_memory_type mt, bool is_inplace, int root_shift, + ucc_memory_type mt, bool is_inplace, + bool is_persistent, int root_shift, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,15 +22,22 @@ ucc_pt_coll_gatherv::ucc_pt_coll_gatherv(ucc_datatype_t dt, root_shift_ = root_shift; coll_args.mask = 0; + coll_args.flags = 0; coll_args.coll_type = UCC_COLL_TYPE_GATHERV; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info_v.datatype = dt; coll_args.dst.info_v.mem_type = mt; + if (is_inplace) { coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_gatherv::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_reduce.cc b/tools/perf/ucc_pt_coll_reduce.cc index ad013bab67..47610bb68c 
100644 --- a/tools/perf/ucc_pt_coll_reduce.cc +++ b/tools/perf/ucc_pt_coll_reduce.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_reduce::ucc_pt_coll_reduce(ucc_datatype_t dt, ucc_memory_type mt, - ucc_reduction_op_t op, bool is_inplace, int root_shift, + ucc_reduction_op_t op, bool is_inplace, + bool is_persistent, int root_shift, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -20,18 +21,24 @@ ucc_pt_coll_reduce::ucc_pt_coll_reduce(ucc_datatype_t dt, ucc_memory_type mt, has_bw_ = true; root_shift_ = root_shift; - coll_args.coll_type = UCC_COLL_TYPE_REDUCE; - coll_args.mask = 0; + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_REDUCE; + coll_args.op = op; + coll_args.src.info.datatype = dt; + coll_args.src.info.mem_type = mt; + coll_args.dst.info.datatype = dt; + coll_args.dst.info.mem_type = mt; + if (is_inplace) { coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } - coll_args.op = op; - coll_args.src.info.datatype = dt; - coll_args.src.info.mem_type = mt; - coll_args.dst.info.datatype = dt; - coll_args.dst.info.mem_type = mt; + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_reduce::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_reduce_scatter.cc b/tools/perf/ucc_pt_coll_reduce_scatter.cc index 8c51a5ffbd..e15bf80bcb 100644 --- a/tools/perf/ucc_pt_coll_reduce_scatter.cc +++ b/tools/perf/ucc_pt_coll_reduce_scatter.cc @@ -12,7 +12,7 @@ ucc_pt_coll_reduce_scatter::ucc_pt_coll_reduce_scatter(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, - bool is_inplace, + bool is_inplace, bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,18 +21,24 @@ ucc_pt_coll_reduce_scatter::ucc_pt_coll_reduce_scatter(ucc_datatype_t dt, has_bw_ = true; root_shift_ = 0; - coll_args.coll_type = UCC_COLL_TYPE_REDUCE_SCATTER; - coll_args.mask = 0; - if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; - coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; - } - + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_REDUCE_SCATTER; coll_args.op = op; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + + if (is_inplace) { + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; + } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_reduce_scatter::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_reduce_scatterv.cc b/tools/perf/ucc_pt_coll_reduce_scatterv.cc index 84f55a2132..932ad600d9 100644 --- a/tools/perf/ucc_pt_coll_reduce_scatterv.cc +++ b/tools/perf/ucc_pt_coll_reduce_scatterv.cc @@ -12,7 +12,7 @@ ucc_pt_coll_reduce_scatterv::ucc_pt_coll_reduce_scatterv(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, - bool is_inplace, + bool is_inplace, bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,18 +21,24 @@ ucc_pt_coll_reduce_scatterv::ucc_pt_coll_reduce_scatterv(ucc_datatype_t dt, has_bw_ = false; root_shift_ = 0; - coll_args.coll_type = UCC_COLL_TYPE_REDUCE_SCATTERV; - coll_args.mask = 0; - if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; - coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; - } 
- + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_REDUCE_SCATTERV; coll_args.op = op; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info_v.datatype = dt; coll_args.dst.info_v.mem_type = mt; + + if (is_inplace) { + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; + } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_reduce_scatterv::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_scatter.cc b/tools/perf/ucc_pt_coll_scatter.cc index 4d66f51d99..ac414dd2ed 100644 --- a/tools/perf/ucc_pt_coll_scatter.cc +++ b/tools/perf/ucc_pt_coll_scatter.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_scatter::ucc_pt_coll_scatter(ucc_datatype_t dt, - ucc_memory_type mt, bool is_inplace, int root_shift, + ucc_memory_type mt, bool is_inplace, + bool is_persistent, int root_shift, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,15 +22,22 @@ ucc_pt_coll_scatter::ucc_pt_coll_scatter(ucc_datatype_t dt, root_shift_ = root_shift; coll_args.mask = 0; + coll_args.flags = 0; coll_args.coll_type = UCC_COLL_TYPE_SCATTER; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + if (is_inplace) { coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_scatter::init_args(size_t single_rank_count, diff --git a/tools/perf/ucc_pt_coll_scatterv.cc b/tools/perf/ucc_pt_coll_scatterv.cc index 328752022c..1dc9bf7db9 100644 --- a/tools/perf/ucc_pt_coll_scatterv.cc +++ b/tools/perf/ucc_pt_coll_scatterv.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_scatterv::ucc_pt_coll_scatterv(ucc_datatype_t dt, - ucc_memory_type mt, bool is_inplace, int root_shift, + ucc_memory_type mt, bool is_inplace, + bool is_persistent, int root_shift, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,15 +22,22 @@ ucc_pt_coll_scatterv::ucc_pt_coll_scatterv(ucc_datatype_t dt, root_shift_ = root_shift; coll_args.mask = 0; + coll_args.flags = 0; coll_args.coll_type = UCC_COLL_TYPE_SCATTERV; coll_args.src.info_v.datatype = dt; coll_args.src.info_v.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + if (is_inplace) { coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_scatterv::init_args(size_t count, diff --git a/tools/perf/ucc_pt_config.cc b/tools/perf/ucc_pt_config.cc index 3fcb2b01c9..e59b62ce26 100644 --- a/tools/perf/ucc_pt_config.cc +++ b/tools/perf/ucc_pt_config.cc @@ -18,6 +18,7 @@ ucc_pt_config::ucc_pt_config() { bench.mt = UCC_MEMORY_TYPE_HOST; bench.op = UCC_OP_SUM; bench.inplace = false; + bench.persistent = false; bench.triggered = false; bench.n_iter_small = 1000; bench.n_warmup_small = 100; @@ -89,7 +90,7 @@ ucc_status_t ucc_pt_config::process_args(int argc, char *argv[]) int c; ucc_status_t st; - while ((c = getopt(argc, argv, "c:b:e:d:m:n:w:o:N:r:S:ihFT")) != -1) { + while ((c = getopt(argc, argv, "c:b:e:d:m:n:w:o:N:r:S:iphFT")) != -1) { switch (c) { case 'c': if 
(ucc_pt_op_map.count(optarg) == 0) { @@ -158,6 +159,9 @@ ucc_status_t ucc_pt_config::process_args(int argc, char *argv[]) case 'i': bench.inplace = true; break; + case 'p': + bench.persistent = true; + break; case 'T': bench.triggered = true; break; @@ -180,6 +184,7 @@ void ucc_pt_config::print_help() std::cout << " -b : Min number of elements"<: Max number of elements"<: datatype"<: reduction operation type"<: root for rooted collectives"<
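Taken together, the perftest changes make persistent collectives directly measurable: the new 'p' entry in the getopt string sets bench.persistent, every ucc_pt_coll_* constructor ORs UCC_COLL_ARGS_FLAG_PERSISTENT into coll_args, and run_single_coll_test() hoists init/finalize out of the timed loop. As a usage sketch (the -p flag comes from this patch; the binary name and the other options are assumptions based on the surrounding help text), an invocation such as `ucc_perftest -c allreduce -b 8 -e 1048576 -p` would time repeated posts of a single persistent allreduce request rather than a fresh init/finalize per iteration.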