diff --git a/.ci/Dockerfile.ngc_pytorch b/.ci/Dockerfile.ngc_pytorch new file mode 100644 index 0000000000..91111aa488 --- /dev/null +++ b/.ci/Dockerfile.ngc_pytorch @@ -0,0 +1,20 @@ +ARG CUDA_VER='12.1.1' +FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base +RUN rm -rf ${SRC_DIR}/ucc +COPY . ${SRC_DIR}/ucc + +RUN apt update && apt install -y sudo && \ + echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers +RUN pip install 'protobuf<=3.19.0' +#============================================================================== +# Build UCC +RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh +#============================================================================== +# Give the non-root swx-jenkins user ownership of /opt/nvidia +RUN chown -R 6213:11429 /opt/nvidia +#============================================================================== +RUN groupadd -g 11429 swx-jenkins +RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins +#============================================================================== +USER swx-jenkins + diff --git a/.ci/build_base_docker/Dockerfile.ngc_pytorch.base b/.ci/build_base_docker/Dockerfile.ngc_pytorch.base new file mode 100644 index 0000000000..891e6bc833 --- /dev/null +++ b/.ci/build_base_docker/Dockerfile.ngc_pytorch.base @@ -0,0 +1,69 @@ +ARG CUDA_VER='12.1.1' +FROM nvcr.io/nvidia/pytorch:23.11-py3 +#============================================================================== +ARG NVIDIA_ROOT_DIR=/opt/nvidia +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +ENV SRC_DIR=${NVIDIA_ROOT_DIR}/src +ENV PKG_DIR=${NVIDIA_ROOT_DIR}/pkg +ENV BIN_DIR=${NVIDIA_ROOT_DIR}/bin +ENV WORKLOADS_DIR=${NVIDIA_ROOT_DIR}/workloads +ENV CUDA_HOME=/usr/local/cuda +ENV UCX_GITHUB_URL=https://github.com/openucx/ucx.git +ENV UCX_BRANCH=master +ENV UCX_BUILD_TYPE=release-mt +ENV UCX_INSTALL_DIR=${BIN_DIR}/ucx/build-${UCX_BUILD_TYPE} +ENV UCC_INSTALL_DIR=${BIN_DIR}/ucc/build +ENV OFED_PKG='lsof kmod udev swig libelf1 libfuse2 pciutils tk gfortran libpci3 libusb-1.0-0 libltdl-dev libmnl0 bison tcl flex chrpath debhelper ethtool graphviz' +ENV PACKAGES='numactl openssh-server protobuf-compiler rdma-core vim libevent-dev build-essential git make autoconf libtool' +ENV OS_VERSION=ubuntu22.04 +ENV PLATFORM=x86_64 +ENV MOFED_VERSION=23.10-0.5.5.0 +ENV MOFED_URL="https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${OS_VERSION}-${PLATFORM}.tgz" +ENV OMPI_PATH="/opt/hpcx/ompi" +#============================================================================== +RUN apt update && apt install -y ${OFED_PKG} && \ + mkdir -p /tmp/ofed && wget --quiet -O /tmp/ofed/ofed.tgz ${MOFED_URL} && \ + tar -xvf /tmp/ofed/ofed.tgz --strip-components=2 -C /tmp/ofed && \ + /tmp/ofed/mlnxofedinstall --user-space-only --without-fw-update -q --distro ${OS_VERSION} --basic && \ + rm -rf /tmp/ofed + +RUN apt install -y ${PACKAGES} + +# Remove old UCX +RUN rm -rf /opt/hpcx/uc?
+ENV PATH=${OMPI_PATH}/bin:$PATH +RUN echo "export PATH=\"\$OMPI_PATH/bin:\$PATH\"" >> /etc/bashrc && \ + echo "export LD_LIBRARY_PATH=\"\$OMPI_PATH/lib64:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc +#============================================================================== +# Configure SSH +RUN mkdir -p /var/run/sshd && \ + cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ + mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \ + ssh-keygen -A && \ + rm -f /run/nologin +#============================================================================== + +#============================================================================== +RUN mkdir -p ${SRC_DIR} ${PKG_DIR} ${BIN_DIR} ${WORKLOADS_DIR} && \ + cd ${SRC_DIR} && \ + mkdir -p ${SRC_DIR}/ucx && \ + git clone --recursive ${UCX_GITHUB_URL} ${SRC_DIR}/ucx && \ + cd ${SRC_DIR}/ucx && \ + git checkout ${UCX_BRANCH} + +COPY . ${SRC_DIR}/ucc +#============================================================================== +# Build UCX +RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucx.sh +ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH} +#============================================================================== +# Install workloads +WORKDIR ${WORKLOADS_DIR} +RUN git clone https://github.com/facebookresearch/dlrm.git && \ + cd ${WORKLOADS_DIR}/dlrm && \ + pip3 install -r ${WORKLOADS_DIR}/dlrm/requirements.txt && \ + pip3 install tensorboard +RUN git clone https://github.com/facebookresearch/param.git && \ + pip3 install -r ${WORKLOADS_DIR}/param/requirements.txt diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml index af23b10578..da1f4f65c5 100644 --- a/.ci/job_matrix.yaml +++ b/.ci/job_matrix.yaml @@ -20,7 +20,7 @@ volumes: } env: - CUDA_VER: '11.4.2' + CUDA_VER: '12.1.1' UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}" UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}" NVIDIA_ROOT_DIR: "/opt/nvidia" @@ -42,8 +42,8 @@ credentials: runs_on_dockers: - { - file: ".ci/Dockerfile.centos8", - name: "centos8", + file: ".ci/Dockerfile.ngc_pytorch", + name: "ngc_pytorch", tag: "${BUILD_NUMBER}", arch: "x86_64", uri: "${UCC_URI_SUFFIX}", @@ -69,7 +69,6 @@ steps: docker pull ${DOCKER_IMAGE_NAME} docker create -ti --rm $DOCKER_OPT ${DOCKER_IMAGE_NAME} /bin/bash > ${WORKSPACE}/ucc_docker.id docker start $(cat ${WORKSPACE}/ucc_docker.id) - #============================================================================ - name: Run Coverity credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0" @@ -80,7 +79,6 @@ steps: echo "Running coverity" ${WORKSPACE}/.ci/scripts/coverity.sh archiveArtifacts: .ci/scripts/cov-build/* - #============================================================================ - name: Run UCC / Torch-UCC tests agentSelector: "{nodeLabel: 'swx-clx01'}" @@ -88,9 +86,6 @@ steps: echo "INFO: Run UCC tests" hostname docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_ucc.sh" - - echo "INFO: Run Torch-UCC tests (UCC)" - docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_torch_ucc.sh" always: | docker rm --force $(cat ${WORKSPACE}/ucc_docker.id) #============================================================================ diff --git a/.ci/scripts/env.sh b/.ci/scripts/env.sh index 649acaa53d..b5fc5da29a 100755 --- a/.ci/scripts/env.sh +++ b/.ci/scripts/env.sh @@ -1,5 +1,9 @@ #!/bin/bash -eEx +export PATH="/opt/hpcx/ompi/bin:$PATH" +export
LD_LIBRARY_PATH="/opt/hpcx/ompi/lib:${LD_LIBRARY_PATH}" +export OPAL_PREFIX=/opt/hpcx/ompi + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)" # shellcheck disable=SC2034 diff --git a/.ci/scripts/run_docker.sh b/.ci/scripts/run_docker.sh index 7f141d65c9..9535298bb2 100755 --- a/.ci/scripts/run_docker.sh +++ b/.ci/scripts/run_docker.sh @@ -45,7 +45,7 @@ DOCKER_RUN_ARGS="\ -d \ --rm \ --name=${DOCKER_CONTAINER_NAME} \ --v /labhome:/labhome \ +-v /labhome/swx-jenkins:/labhome/swx-jenkins \ " # shellcheck disable=SC2013 diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 4701a7c04e..73a4eaca6a 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -15,9 +15,6 @@ if [ -z "$HOSTFILE" ]; then exit 1 fi -export PATH="/usr/lib64/openmpi/bin:$PATH" -export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}" - HEAD_NODE=$(head -1 "$HOSTFILE") export HEAD_NODE export MASTER_ADDR=${HEAD_NODE} diff --git a/.github/workflows/clang-tidy-nvidia.yaml b/.github/workflows/clang-tidy-nvidia.yaml index 408f145f83..ae2cde7580 100644 --- a/.github/workflows/clang-tidy-nvidia.yaml +++ b/.github/workflows/clang-tidy-nvidia.yaml @@ -5,7 +5,7 @@ on: [push, pull_request] env: OPEN_UCX_LINK: https://github.com/openucx/ucx OPEN_UCX_BRANCH: master - HPCX_LINK: http://content.mellanox.com/hpc/hpc-x/v2.13/hpcx-v2.13-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.12-x86_64.tbz + HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.17.1rc2/hpcx-v2.17.1-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64.tbz CLANG_VER: 12 MLNX_OFED_VER: 5.9-0.5.6.0 CUDA_VER: 11-4 @@ -33,7 +33,7 @@ jobs: wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb sudo dpkg -i cuda-keyring_1.0-1_all.deb sudo apt-get update - sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER} + sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER} - name: Get UCX run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx - name: Build UCX @@ -45,8 +45,8 @@ jobs: run: | cd /tmp wget ${HPCX_LINK} - tar xjf hpcx-v2.13-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.12-x86_64.tbz - mv hpcx-v2.13-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.12-x86_64 hpcx + tar xjf hpcx-v2.17.1-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64.tbz + mv hpcx-v2.17.1-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64 hpcx - uses: actions/checkout@v1 - name: Build UCC run: | diff --git a/.github/workflows/hpcsdk.yaml b/.github/workflows/hpcsdk.yaml new file mode 100644 index 0000000000..77188cd96a --- /dev/null +++ b/.github/workflows/hpcsdk.yaml @@ -0,0 +1,25 @@ +name: HPC_SDK + +on: [push, pull_request] + +env: + HPCXDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/hpcx/latest/ + NCCLDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/nccl/ + CUDADIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/cuda/12.2/ + +jobs: + build: + runs-on: ubuntu-20.04 + container: + image: nvcr.io/nvidia/nvhpc:23.9-devel-cuda12.2-ubuntu22.04 + steps: + - name: Install dependencies + run: | + apt-get update + apt-get install -y --no-install-recommends libiberty-dev + - uses: actions/checkout@v1 + - name: Build UCC + run: | + ./autogen.sh + CC=nvc CXX=nvc++ ./configure --with-tls=ucp,mlx5,cuda,self,nccl,sharp --with-mpi=${HPCXDIR}/ompi --with-sharp=${HPCXDIR}/sharp --with-ucx=${HPCXDIR}/ucx --with-cuda=${CUDADIR} --with-nccl=${NCCLDIR} 
--with-nvcc-gencode="-gencode=arch=compute_80,code=sm_80" + make -j`nproc` install diff --git a/config/m4/sharp.m4 b/config/m4/sharp.m4 index bedc550476..45bcfd04e3 100644 --- a/config/m4/sharp.m4 +++ b/config/m4/sharp.m4 @@ -44,6 +44,7 @@ AS_IF([test "x$with_sharp" != "xno"], AC_SUBST(SHARP_LDFLAGS, "-lsharp_coll -L$check_sharp_dir/lib") AC_CHECK_DECLS([SHARP_COLL_HIDE_ERRORS], [], [], [[#include <sharp/api/sharp_coll.h>]]) AC_CHECK_DECLS([SHARP_COLL_DISABLE_LAZY_GROUP_RESOURCE_ALLOC], [], [], [[#include <sharp/api/sharp_coll.h>]]) + AC_CHECK_DECLS([sharp_coll_do_reduce_scatter], [], [], [[#include <sharp/api/sharp_coll.h>]]) ], [ AS_IF([test "x$with_sharp" != "xguess"], diff --git a/config/m4/ucx.m4 b/config/m4/ucx.m4 index b3a3b871c3..ba57dae303 100644 --- a/config/m4/ucx.m4 +++ b/config/m4/ucx.m4 @@ -1,5 +1,5 @@ # -# Copyright (c) 2001-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2001-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # See file LICENSE for terms. # @@ -113,6 +113,25 @@ AS_IF([test "x$ucx_checked" != "xyes"],[ [AC_DEFINE([UCS_HAVE_CONFIG_GLOBAL_LIST_ENTRY_FLAGS], [1], [flags for config table])], [], [#include <ucs/config/parser.h>]) + + AC_CHECK_MEMBER(ucs_rcache_region_t.alignment, + [AC_DEFINE([UCS_HAVE_RCACHE_REGION_ALIGNMENT], [1], [flags for ucs_rcache_get])], + [], + [#include <ucs/memory/rcache.h>]) + + + AC_COMPILE_IFELSE([AC_LANG_SOURCE([[#include <ucs/config/parser.h> + int main(int argc, char** argv) { + ucs_config_parser_set_value(NULL, NULL, NULL, NULL, NULL); + return 0; + } ]])], + [AC_DEFINE([UCS_HAVE_PARSER_SET_VALUE_TABLE_PREFIX], [1], [flags for ucs_rcache_get])], + []) + + AC_CHECK_MEMBER(ucs_config_parser_t.doc, + [AC_DEFINE([UCS_HAVE_PARSER_CONFIG_DOC], [1], [flags for ucs_rcache_get])], + [], + [#include <ucs/config/parser.h>]) ], [ AS_IF([test "x$with_ucx" != "xguess"], diff --git a/src/Makefile.am b/src/Makefile.am index 85496f83dd..c505c31344 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -36,62 +36,63 @@ nobase_dist_libucc_la_HEADERS = \ ucc/api/ucc_version.h \ ucc/api/ucc_status.h -noinst_HEADERS = \ - core/ucc_global_opts.h \ - core/ucc_lib.h \ - core/ucc_context.h \ - core/ucc_team.h \ - core/ucc_ee.h \ - core/ucc_progress_queue.h \ - core/ucc_service_coll.h \ - core/ucc_dt.h \ - schedule/ucc_schedule.h \ - schedule/ucc_schedule_pipelined.h \ - coll_score/ucc_coll_score.h \ - utils/arch/aarch64/cpu.h \ - utils/arch/ppc64/cpu.h \ - utils/arch/riscv64/cpu.h \ - utils/arch/x86_64/cpu.h \ - utils/arch/cpu.h \ - utils/arch/cuda_def.h \ - utils/ucc_compiler_def.h \ - utils/ucc_log.h \ - utils/ucc_parser.h \ - utils/ucc_component.h \ - utils/ucc_datastruct.h \ - utils/ucc_math.h \ - utils/ucc_coll_utils.h \ - utils/ucc_list.h \ - utils/ucc_string.h \ - utils/ucc_queue.h \ - utils/ucc_proc_info.h \ - utils/khash.h \ - utils/ini.h \ - utils/ucc_spinlock.h \ - utils/ucc_mpool.h \ - utils/ucc_rcache.h \ - utils/profile/ucc_profile.h \ - utils/profile/ucc_profile_on.h \ - utils/profile/ucc_profile_off.h \ - utils/ucc_time.h \ - utils/ucc_sys.h \ - utils/ucc_assert.h \ - components/base/ucc_base_iface.h \ - components/cl/ucc_cl.h \ - components/cl/ucc_cl_log.h \ - components/cl/ucc_cl_type.h \ - components/tl/ucc_tl.h \ - components/tl/ucc_tl_log.h \ - components/mc/ucc_mc.h \ - components/mc/base/ucc_mc_base.h \ - components/mc/ucc_mc_log.h \ - components/ec/ucc_ec.h \ - components/ec/base/ucc_ec_base.h \ - components/ec/ucc_ec_log.h \ - coll_patterns/recursive_knomial.h \ - coll_patterns/sra_knomial.h \ - coll_patterns/bruck_alltoall.h \ - components/topo/ucc_topo.h \ +noinst_HEADERS = \ + core/ucc_global_opts.h \ + core/ucc_lib.h \ + core/ucc_context.h \ +
core/ucc_team.h \ + core/ucc_ee.h \ + core/ucc_progress_queue.h \ + core/ucc_service_coll.h \ + core/ucc_dt.h \ + schedule/ucc_schedule.h \ + schedule/ucc_schedule_pipelined.h \ + coll_score/ucc_coll_score.h \ + utils/arch/aarch64/cpu.h \ + utils/arch/ppc64/cpu.h \ + utils/arch/riscv64/cpu.h \ + utils/arch/x86_64/cpu.h \ + utils/arch/cpu.h \ + utils/arch/cuda_def.h \ + utils/ucc_compiler_def.h \ + utils/ucc_log.h \ + utils/ucc_parser.h \ + utils/ucc_component.h \ + utils/ucc_datastruct.h \ + utils/ucc_math.h \ + utils/ucc_coll_utils.h \ + utils/ucc_list.h \ + utils/ucc_string.h \ + utils/ucc_queue.h \ + utils/ucc_proc_info.h \ + utils/khash.h \ + utils/ini.h \ + utils/ucc_spinlock.h \ + utils/ucc_mpool.h \ + utils/ucc_rcache.h \ + utils/profile/ucc_profile.h \ + utils/profile/ucc_profile_on.h \ + utils/profile/ucc_profile_off.h \ + utils/ucc_time.h \ + utils/ucc_sys.h \ + utils/ucc_assert.h \ + components/base/ucc_base_iface.h \ + components/cl/ucc_cl.h \ + components/cl/ucc_cl_log.h \ + components/cl/ucc_cl_type.h \ + components/tl/ucc_tl.h \ + components/tl/ucc_tl_log.h \ + components/mc/ucc_mc.h \ + components/mc/base/ucc_mc_base.h \ + components/mc/ucc_mc_log.h \ + components/ec/ucc_ec.h \ + components/ec/base/ucc_ec_base.h \ + components/ec/ucc_ec_log.h \ + coll_patterns/recursive_knomial.h \ + coll_patterns/sra_knomial.h \ + coll_patterns/bruck_alltoall.h \ + coll_patterns/double_binary_tree.h \ + components/topo/ucc_topo.h \ components/topo/ucc_sbgp.h libucc_la_SOURCES = \ diff --git a/src/coll_patterns/double_binary_tree.h b/src/coll_patterns/double_binary_tree.h new file mode 100644 index 0000000000..baab72936a --- /dev/null +++ b/src/coll_patterns/double_binary_tree.h @@ -0,0 +1,238 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. + */ + +#ifndef DOUBLE_BINARY_TREE_H_ +#define DOUBLE_BINARY_TREE_H_ + +enum { + LEFT_CHILD, + RIGHT_CHILD +}; + +typedef struct ucc_dbt_single_tree { + ucc_rank_t rank; + ucc_rank_t size; + ucc_rank_t root; + ucc_rank_t parent; + ucc_rank_t children[2]; + int n_children; + int height; + int recv; +} ucc_dbt_single_tree_t; + +static inline ucc_rank_t get_root(ucc_rank_t size) +{ + ucc_rank_t r = 1; + + while (r <= size) { + r *= 2; + } + return r/2 - 1; +} + +static inline int get_height(ucc_rank_t rank) +{ + int h = 1; + + if (rank % 2 == 0) { + return 0; + } + + rank++; + while ((rank & (1 << h)) == 0) { + h++; + } + return h; +} + +static inline ucc_rank_t get_left_child(ucc_rank_t rank, int height) +{ + ucc_rank_t sub_height; + + if (height == 0) { + return UCC_RANK_INVALID; + } + + sub_height = 1 << (height - 1); + return rank - sub_height; +} + +static inline ucc_rank_t get_right_child(ucc_rank_t size, ucc_rank_t rank, + int height, ucc_rank_t root) +{ + ucc_rank_t sub_right_root, sub_height; + + if (rank == size - 1 || height == 0) { + return UCC_RANK_INVALID; + } + + sub_right_root = get_root(size - rank - 1) + 1; + sub_height = 1 << (height - 1); + + if (rank == root) { + return rank + sub_right_root; + } + return (rank + sub_height < size) ? 
rank + sub_height + : rank + sub_right_root; +} + +static inline void get_children(ucc_rank_t size, ucc_rank_t rank, int height, + ucc_rank_t root, ucc_rank_t *l_c, + ucc_rank_t *r_c) +{ + *l_c = get_left_child(rank, height); + *r_c = get_right_child(size, rank, height, root); +} + +static inline int get_n_children(ucc_rank_t l_c, ucc_rank_t r_c) +{ + int n_children = 0; + + if (l_c != UCC_RANK_INVALID) { + n_children++; + } + + if (r_c != UCC_RANK_INVALID) { + n_children++; + } + + return n_children; +} + +static inline ucc_rank_t get_parent(int vsize, int vrank, int height, int troot) +{ + if (vrank == troot) { + return UCC_RANK_INVALID; + } else if (height == 0) { + return ((((vrank/2) % 2 == 0) && (vrank + 1 != vsize))) ? vrank + 1 + : vrank - 1; + } else { + vrank++; + if ((((1<<(height+1)) & vrank) > 0) || (vrank + (1<<height) > vsize)) { + return vrank - (1<<height) - 1; + } + return vrank + (1<<height) - 1; + } +} + +static inline void ucc_dbt_build_t2_mirror(ucc_dbt_single_tree_t t1, + ucc_dbt_single_tree_t *t2) +{ + ucc_rank_t size = t1.size; + ucc_dbt_single_tree_t t; + + t.size = size; + t.height = t1.height; + t.rank = size - 1 - t1.rank; + t.root = size - 1 - t1.root; + t.parent = (t1.parent == UCC_RANK_INVALID) ? UCC_RANK_INVALID + : size - 1 - t1.parent; + t.children[LEFT_CHILD] = (t1.children[RIGHT_CHILD] == UCC_RANK_INVALID) + ? UCC_RANK_INVALID + : size - 1 - t1.children[RIGHT_CHILD]; + t.children[RIGHT_CHILD] = (t1.children[LEFT_CHILD] == UCC_RANK_INVALID) + ? UCC_RANK_INVALID + : size - 1 - t1.children[LEFT_CHILD]; + t.n_children = t1.n_children; + t.recv = 0; + + *t2 = t; +} + +static inline void ucc_dbt_build_t2_shift(ucc_dbt_single_tree_t t1, + ucc_dbt_single_tree_t *t2) +{ + ucc_rank_t size = t1.size; + ucc_dbt_single_tree_t t; + + t.size = size; + t.height = t1.height; + t.rank = (t1.rank + 1) % size; + t.root = (t1.root + 1) % size; + t.parent = (t1.parent == UCC_RANK_INVALID) ? UCC_RANK_INVALID + : (t1.parent + 1) % size; + t.children[LEFT_CHILD] = (t1.children[LEFT_CHILD] == UCC_RANK_INVALID) + ? UCC_RANK_INVALID + : (t1.children[LEFT_CHILD] + 1) % size; + t.children[RIGHT_CHILD] = (t1.children[RIGHT_CHILD] == UCC_RANK_INVALID) + ? UCC_RANK_INVALID + : (t1.children[RIGHT_CHILD] + 1) % size; + t.n_children = t1.n_children; + t.recv = 0; + + *t2 = t; +} + +static inline void ucc_dbt_build_t1(ucc_rank_t rank, ucc_rank_t size, + ucc_dbt_single_tree_t *t1) +{ + int height = get_height(rank); + ucc_rank_t root = get_root(size); + ucc_rank_t parent = get_parent(size, rank, height, root); + + get_children(size, rank, height, root, &t1->children[LEFT_CHILD], + &t1->children[RIGHT_CHILD]); + t1->n_children = get_n_children(t1->children[LEFT_CHILD], + t1->children[RIGHT_CHILD]); + t1->height = height; + t1->parent = parent; + t1->size = size; + t1->rank = rank; + t1->root = root; + t1->recv = 0; +} + +static inline ucc_rank_t ucc_dbt_convert_rank_for_shift(ucc_rank_t rank, + ucc_rank_t size) +{ + ucc_rank_t i; + for (i = 0; i < size; i++) { + if (rank == (i + 1) % size) { + break; + } + } + return i; +} + +static inline ucc_rank_t ucc_dbt_convert_rank_for_mirror(ucc_rank_t rank, + ucc_rank_t size) +{ + ucc_rank_t i; + for (i = 0; i < size; i++) { + if (rank == size - 1 - i) { + break; + } + } + return i; +} + +static inline void ucc_dbt_build_t2(ucc_rank_t rank, ucc_rank_t size, + ucc_dbt_single_tree_t *t2) { + ucc_rank_t temp_rank = (size % 2) ? + ucc_dbt_convert_rank_for_shift(rank, size) : + ucc_dbt_convert_rank_for_mirror(rank, size); + ucc_dbt_single_tree_t t1_temp; + + ucc_dbt_build_t1(temp_rank, size, &t1_temp); + if (size % 2) { + ucc_dbt_build_t2_shift(t1_temp, t2); + } else { + ucc_dbt_build_t2_mirror(t1_temp, t2); + } +} + +static inline void ucc_dbt_build_trees(ucc_rank_t rank, ucc_rank_t size, + ucc_dbt_single_tree_t *t1, + ucc_dbt_single_tree_t *t2) +{ + ucc_dbt_build_t1(rank, size, t1); + ucc_dbt_build_t2(rank, size, t2); +} + +#endif diff --git a/src/coll_patterns/recursive_knomial.h b/src/coll_patterns/recursive_knomial.h index 4f8981957c..ebf9a0981b 100644 --- a/src/coll_patterns/recursive_knomial.h +++ b/src/coll_patterns/recursive_knomial.h @@ -50,7 +50,7 @@ typedef struct ucc_knomial_pattern { size_t block_size_counts; size_t count; /* collective buffer size */ ucc_rank_t block_size; - size_t block_offset; + ptrdiff_t block_offset; } ucc_knomial_pattern_t; /** diff --git a/src/coll_patterns/sra_knomial.h b/src/coll_patterns/sra_knomial.h index 1574389632..2f63a243f2 100644 --- a/src/coll_patterns/sra_knomial.h +++ b/src/coll_patterns/sra_knomial.h @@ -159,7 +159,7 @@ ucc_kn_seg_desc_compute(ucc_knomial_pattern_t *p, ucc_kn_seg_desc_t *seg, static inline void ucc_knx_block(ucc_rank_t rank, ucc_rank_t size, ucc_kn_radix_t radix, - size_t count, int iter, size_t *b_count, size_t *b_offset) + size_t count, int iter, size_t *b_count, ptrdiff_t *b_offset) { ucc_rank_t offset = 0; ucc_rank_t block_count; @@ -213,7 +213,7 @@ ucc_kn_agx_pattern_init(ucc_rank_t size, ucc_rank_t rank, ucc_kn_radix_t radix, static inline void ucc_kn_ag_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p, - size_t *seg_count, size_t *seg_offset) + size_t *seg_count, ptrdiff_t *seg_offset) { ucc_rank_t step_radix, seg_index; ucc_kn_seg_desc_t s; @@ -278,7 +278,7 @@ static
inline void ucc_kn_rsx_pattern_init(ucc_rank_t size, ucc_rank_t rank, static inline void ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p, - size_t *peer_seg_count, size_t *peer_seg_offset) + size_t *peer_seg_count, ptrdiff_t *peer_seg_offset) { ucc_rank_t step_radix, seg_index; @@ -305,7 +305,8 @@ ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p, static inline void ucc_kn_rs_pattern_next_iter(ucc_knomial_pattern_t *p) { - size_t offset, bs; + size_t bs; + ptrdiff_t offset; ucc_kn_rs_pattern_peer_seg(p->rank, p, &bs, &offset); p->block_size_counts = bs; diff --git a/src/components/cl/hier/allreduce/allreduce.c b/src/components/cl/hier/allreduce/allreduce.c index c69cc4db36..ba93f0789d 100644 --- a/src/components/cl/hier/allreduce/allreduce.c +++ b/src/components/cl/hier/allreduce/allreduce.c @@ -13,7 +13,7 @@ ucc_base_coll_alg_info_t {.id = UCC_CL_HIER_ALLREDUCE_ALG_RAB, .name = "rab", .desc = "intra-node reduce, followed by inter-node allreduce," - " followed by innode broadcast"}, + " followed by intra-node broadcast"}, [UCC_CL_HIER_ALLREDUCE_ALG_SPLIT_RAIL] = {.id = UCC_CL_HIER_ALLREDUCE_ALG_SPLIT_RAIL, .name = "split_rail", diff --git a/src/components/cl/hier/alltoallv/alltoallv.c b/src/components/cl/hier/alltoallv/alltoallv.c index c60bdf84fe..b73af2c82b 100644 --- a/src/components/cl/hier/alltoallv/alltoallv.c +++ b/src/components/cl/hier/alltoallv/alltoallv.c @@ -144,6 +144,11 @@ UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_alltoallv_init, return UCC_ERR_NOT_SUPPORTED; } + if (coll_args->args.mask & UCC_COLL_ARGS_FIELD_GLOBAL_WORK_BUFFER) { + cl_debug(team->context->lib, "onesided alltoallv is not supported"); + return UCC_ERR_NOT_SUPPORTED; + } + if (!SBGP_ENABLED(cl_team, FULL)) { cl_debug(team->context->lib, "alltoallv requires FULL sbgp"); return UCC_ERR_NOT_SUPPORTED; diff --git a/src/components/ec/cuda/Makefile.am b/src/components/ec/cuda/Makefile.am index 3d7a862ef4..83f478d797 100644 --- a/src/components/ec/cuda/Makefile.am +++ b/src/components/ec/cuda/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# if HAVE_CUDA @@ -12,7 +12,9 @@ sources = \ ec_cuda_executor.c \ ec_cuda_executor_interruptible.c \ ec_cuda_executor_persistent.c \ - ec_cuda_executor_persistent_wait.c + ec_cuda_executor_persistent_wait.c \ + ec_cuda_resources.c \ + ec_cuda_resources.h module_LTLIBRARIES = libucc_ec_cuda.la libucc_ec_cuda_la_SOURCES = $(sources) diff --git a/src/components/ec/cuda/ec_cuda.c b/src/components/ec/cuda/ec_cuda.c index 2357023fed..dd721e1f50 100644 --- a/src/components/ec/cuda/ec_cuda.c +++ b/src/components/ec/cuda/ec_cuda.c @@ -75,116 +75,6 @@ static ucc_config_field_t ucc_ec_cuda_config_table[] = { }; -static ucc_status_t ucc_ec_cuda_ee_executor_mpool_chunk_malloc(ucc_mpool_t *mp, //NOLINT: mp is unused - size_t *size_p, - void ** chunk_p) -{ - return CUDA_FUNC(cudaHostAlloc((void**)chunk_p, *size_p, - cudaHostAllocMapped)); -} - -static void ucc_ec_cuda_ee_executor_mpool_chunk_free(ucc_mpool_t *mp, //NOLINT: mp is unused - void *chunk) -{ - CUDA_FUNC(cudaFreeHost(chunk)); -} - -static void ucc_ec_cuda_executor_chunk_init(ucc_mpool_t *mp, void *obj, //NOLINT: mp is unused - void *chunk) //NOLINT: chunk is unused -{ - ucc_ec_cuda_executor_t *eee = (ucc_ec_cuda_executor_t*) obj; - int max_tasks = EC_CUDA_CONFIG->exec_max_tasks; - - CUDA_FUNC(cudaHostGetDevicePointer( - (void**)(&eee->dev_state), (void *)&eee->state, 0)); - CUDA_FUNC(cudaHostGetDevicePointer( - (void**)(&eee->dev_pidx), (void *)&eee->pidx, 0)); - CUDA_FUNC(cudaMalloc((void**)&eee->dev_cidx, sizeof(*eee->dev_cidx))); - CUDA_FUNC(cudaHostAlloc((void**)&eee->tasks, - max_tasks * MAX_SUBTASKS * - sizeof(ucc_ee_executor_task_args_t), - cudaHostAllocMapped)); - CUDA_FUNC(cudaHostGetDevicePointer( - (void**)(&eee->dev_tasks), (void *)eee->tasks, 0)); - if (ucc_ec_cuda.thread_mode == UCC_THREAD_MULTIPLE) { - ucc_spinlock_init(&eee->tasks_lock, 0); - } -} - -static void ucc_ec_cuda_executor_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused -{ - ucc_ec_cuda_executor_t *eee = (ucc_ec_cuda_executor_t*) obj; - - CUDA_FUNC(cudaFree((void*)eee->dev_cidx)); - CUDA_FUNC(cudaFreeHost((void*)eee->tasks)); - if (ucc_ec_cuda.thread_mode == UCC_THREAD_MULTIPLE) { - ucc_spinlock_destroy(&eee->tasks_lock); - } -} - - -static ucc_mpool_ops_t ucc_ec_cuda_ee_executor_mpool_ops = { - .chunk_alloc = ucc_ec_cuda_ee_executor_mpool_chunk_malloc, - .chunk_release = ucc_ec_cuda_ee_executor_mpool_chunk_free, - .obj_init = ucc_ec_cuda_executor_chunk_init, - .obj_cleanup = ucc_ec_cuda_executor_chunk_cleanup, -}; - -static void ucc_ec_cuda_event_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused -{ - ucc_ec_cuda_event_t *base = (ucc_ec_cuda_event_t *) obj; - - CUDA_FUNC(cudaEventCreateWithFlags(&base->event, cudaEventDisableTiming)); -} - -static void ucc_ec_cuda_event_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused -{ - ucc_ec_cuda_event_t *base = (ucc_ec_cuda_event_t *) obj; - - CUDA_FUNC(cudaEventDestroy(base->event)); -} - -static ucc_mpool_ops_t ucc_ec_cuda_event_mpool_ops = { - .chunk_alloc = ucc_mpool_hugetlb_malloc, - .chunk_release = ucc_mpool_hugetlb_free, - .obj_init = ucc_ec_cuda_event_init, - .obj_cleanup = ucc_ec_cuda_event_cleanup, -}; - -static void ucc_ec_cuda_graph_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused -{ - ucc_ec_cuda_executor_interruptible_task_t *task = - (ucc_ec_cuda_executor_interruptible_task_t *) obj; - cudaGraphNode_t memcpy_node; - int i; - - CUDA_FUNC(cudaGraphCreate(&task->graph, 0)); - for (i = 0; i < UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; i++) { - CUDA_FUNC( - 
cudaGraphAddMemcpyNode1D(&memcpy_node, task->graph, NULL, 0, - (void*)1, (void*)1, 1, cudaMemcpyDefault)); - } - - CUDA_FUNC( - cudaGraphInstantiateWithFlags(&task->graph_exec, task->graph, 0)); -} - -static void ucc_ec_cuda_graph_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused -{ - ucc_ec_cuda_executor_interruptible_task_t *task = - (ucc_ec_cuda_executor_interruptible_task_t *) obj; - - CUDA_FUNC(cudaGraphExecDestroy(task->graph_exec)); - CUDA_FUNC(cudaGraphDestroy(task->graph)); -} - -static ucc_mpool_ops_t ucc_ec_cuda_interruptible_task_mpool_ops = { - .chunk_alloc = ucc_mpool_hugetlb_malloc, - .chunk_release = ucc_mpool_hugetlb_free, - .obj_init = ucc_ec_cuda_graph_init, - .obj_cleanup = ucc_ec_cuda_graph_cleanup, -}; - static inline void ucc_ec_cuda_set_threads_nbr(int *nt, int maxThreadsPerBlock) { if (*nt != UCC_ULUNITS_AUTO) { @@ -208,15 +98,14 @@ static inline void ucc_ec_cuda_set_threads_nbr(int *nt, int maxThreadsPerBlock) static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) { - ucc_ec_cuda_config_t *cfg = EC_CUDA_CONFIG; - ucc_status_t status; + ucc_ec_cuda_config_t *cfg = EC_CUDA_CONFIG; + int supports_coop_launch = 0; int device, num_devices; cudaError_t cuda_st; struct cudaDeviceProp prop; - int supportsCoopLaunch = 0; - ucc_ec_cuda.stream = NULL; - ucc_ec_cuda.stream_initialized = 0; + ucc_ec_cuda_config = ucc_derived_of(ucc_ec_cuda.super.config, + ucc_ec_cuda_config_t); ucc_ec_cuda.exec_streams_initialized = 0; ucc_strncpy_safe(ucc_ec_cuda.super.config->log_component.name, ucc_ec_cuda.super.super.name, @@ -228,9 +117,7 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) return UCC_ERR_NO_RESOURCE; } CUDA_CHECK(cudaGetDevice(&device)); - CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - ucc_ec_cuda_set_threads_nbr((int *)&cfg->exec_num_threads, prop.maxThreadsPerBlock); ucc_ec_cuda_set_threads_nbr(&cfg->reduce_num_threads, @@ -253,52 +140,6 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) cfg->exec_num_streams = 1; } - /*create event pool */ - ucc_ec_cuda.exec_streams = ucc_calloc(cfg->exec_num_streams, - sizeof(cudaStream_t), - "ec cuda streams"); - if (!ucc_ec_cuda.exec_streams) { - ec_error(&ucc_ec_cuda.super, "failed to allocate streams array"); - return UCC_ERR_NO_MEMORY; - } - status = ucc_mpool_init(&ucc_ec_cuda.events, 0, sizeof(ucc_ec_cuda_event_t), - 0, UCC_CACHE_LINE_SIZE, 16, UINT_MAX, - &ucc_ec_cuda_event_mpool_ops, UCC_THREAD_MULTIPLE, - "CUDA Event Objects"); - if (status != UCC_OK) { - ec_error(&ucc_ec_cuda.super, "failed to create event pool"); - return status; - } - - status = ucc_mpool_init( - &ucc_ec_cuda.executors, 0, sizeof(ucc_ec_cuda_executor_t), 0, - UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_cuda_ee_executor_mpool_ops, - UCC_THREAD_MULTIPLE, "EE executor Objects"); - if (status != UCC_OK) { - ec_error(&ucc_ec_cuda.super, "failed to create executors pool"); - return status; - } - - status = ucc_mpool_init( - &ucc_ec_cuda.executor_interruptible_tasks, 0, - sizeof(ucc_ec_cuda_executor_interruptible_task_t), 0, UCC_CACHE_LINE_SIZE, - 16, UINT_MAX, &ucc_ec_cuda_interruptible_task_mpool_ops, - UCC_THREAD_MULTIPLE, "interruptible executor tasks"); - if (status != UCC_OK) { - ec_error(&ucc_ec_cuda.super, "failed to create interruptible tasks pool"); - return status; - } - - status = ucc_mpool_init( - &ucc_ec_cuda.executor_persistent_tasks, 0, - sizeof(ucc_ec_cuda_executor_persistent_task_t), 0, UCC_CACHE_LINE_SIZE, - 16, UINT_MAX, NULL, UCC_THREAD_MULTIPLE, - "persistent 
executor tasks"); - if (status != UCC_OK) { - ec_error(&ucc_ec_cuda.super, "failed to create persistent tasks pool"); - return status; - } - if (cfg->strm_task_mode == UCC_EC_CUDA_TASK_KERNEL) { ucc_ec_cuda.strm_task_mode = UCC_EC_CUDA_TASK_KERNEL; } else { @@ -335,16 +176,17 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) } if (cfg->use_cooperative_launch == 1) { - cudaDeviceGetAttribute(&supportsCoopLaunch, + cudaDeviceGetAttribute(&supports_coop_launch, cudaDevAttrCooperativeLaunch, device); - if (!supportsCoopLaunch) { + if (!supports_coop_launch) { cfg->use_cooperative_launch = 0; ec_warn(&ucc_ec_cuda.super, - "CUDA cooperative groups are not supported. " - "Fall back to non cooperative launch."); + "CUDA cooperative groups are not supported. " + "Fall back to non cooperative launch."); } } + ucc_ec_cuda.resources_hash = kh_init(ucc_ec_cuda_resources_hash); ucc_spinlock_init(&ucc_ec_cuda.init_spinlock, 0); return UCC_OK; } @@ -359,9 +201,15 @@ static ucc_status_t ucc_ec_cuda_get_attr(ucc_ec_attr_t *ec_attr) ucc_status_t ucc_ec_cuda_event_create(void **event) { - ucc_ec_cuda_event_t *cuda_event; + ucc_ec_cuda_event_t *cuda_event; + ucc_ec_cuda_resources_t *resources; + ucc_status_t status; - cuda_event = ucc_mpool_get(&ucc_ec_cuda.events); + status = ucc_ec_cuda_get_resources(&resources); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + cuda_event = ucc_mpool_get(&resources->events); if (ucc_unlikely(!cuda_event)) { ec_error(&ucc_ec_cuda.super, "failed to get event from mpool"); return UCC_ERR_NO_MEMORY; @@ -390,8 +238,8 @@ ucc_status_t ucc_ec_cuda_event_post(void *ee_context, void *event) ucc_status_t ucc_ec_cuda_event_test(void *event) { - cudaError_t cu_err; ucc_ec_cuda_event_t *cuda_event = event; + cudaError_t cu_err; cu_err = cudaEventQuery(cuda_event->event); @@ -404,26 +252,68 @@ ucc_status_t ucc_ec_cuda_event_test(void *event) static ucc_status_t ucc_ec_cuda_finalize() { - int i; + ucc_ec_cuda_resources_t *resources; - if (ucc_ec_cuda.stream_initialized) { - CUDA_FUNC(cudaStreamDestroy(ucc_ec_cuda.stream)); - ucc_ec_cuda.stream_initialized = 0; + resources = ec_cuda_resources_hash_pop(ucc_ec_cuda.resources_hash); + while (resources) { + ucc_ec_cuda_resources_cleanup(resources); + resources = ec_cuda_resources_hash_pop(ucc_ec_cuda.resources_hash); } - if (ucc_ec_cuda.exec_streams_initialized) { - for (i = 0; i < EC_CUDA_CONFIG->exec_num_streams; i++) { - CUDA_FUNC(cudaStreamDestroy(ucc_ec_cuda.exec_streams[i])); - } - ucc_ec_cuda.exec_streams_initialized = 0; + ucc_spinlock_destroy(&ucc_ec_cuda.init_spinlock); + + return UCC_OK; +} + +ucc_status_t ucc_ec_cuda_get_resources(ucc_ec_cuda_resources_t **resources) +{ + CUcontext cu_ctx; + unsigned long long int cu_ctx_id; + ucc_status_t status; + + status = CUDADRV_FUNC(cuCtxGetCurrent(&cu_ctx)); + if (ucc_unlikely(status != UCC_OK)) { + ec_error(&ucc_ec_cuda.super, "failed to get current CUDA context"); + return status; } - ucc_mpool_cleanup(&ucc_ec_cuda.events, 1); - ucc_mpool_cleanup(&ucc_ec_cuda.executors, 1); - ucc_mpool_cleanup(&ucc_ec_cuda.executor_interruptible_tasks, 1); - ucc_mpool_cleanup(&ucc_ec_cuda.executor_persistent_tasks, 1); - ucc_free(ucc_ec_cuda.exec_streams); +#if CUDA_VERSION < 12000 + cu_ctx_id = 1; +#else + status = CUDADRV_FUNC(cuCtxGetId(cu_ctx, &cu_ctx_id)); + if (ucc_unlikely(status != UCC_OK)) { + ec_error(&ucc_ec_cuda.super, "failed to get currect CUDA context ID"); + } +#endif + *resources = ec_cuda_resources_hash_get(ucc_ec_cuda.resources_hash, + cu_ctx_id); + 
if (ucc_unlikely(*resources == NULL)) { + ucc_spin_lock(&ucc_ec_cuda.init_spinlock); + *resources = ec_cuda_resources_hash_get(ucc_ec_cuda.resources_hash, + cu_ctx_id); + if (*resources == NULL) { + *resources = ucc_malloc(sizeof(ucc_ec_cuda_resources_t), + "ec cuda resources"); + if (*resources == NULL) { + ec_error(&ucc_ec_cuda.super, + "failed to allocate %zd bytes for resources", + sizeof(ucc_ec_cuda_resources_t)); + ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); + return UCC_ERR_NO_MEMORY; + } + status = ucc_ec_cuda_resources_init(&ucc_ec_cuda.super, + *resources); + if (status != UCC_OK) { + ucc_free(*resources); + ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); + return status; + } + ec_cuda_resources_hash_put(ucc_ec_cuda.resources_hash, cu_ctx_id, + *resources); + } + ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); + } return UCC_OK; } @@ -455,5 +345,7 @@ ucc_ec_cuda_t ucc_ec_cuda = { .super.executor_ops.finalize = ucc_cuda_executor_finalize, }; +ucc_ec_cuda_config_t *ucc_ec_cuda_config; + UCC_CONFIG_REGISTER_TABLE_ENTRY(&ucc_ec_cuda.super.config_table, &ucc_config_global_list); diff --git a/src/components/ec/cuda/ec_cuda.h b/src/components/ec/cuda/ec_cuda.h index d732669f12..84b8588605 100644 --- a/src/components/ec/cuda/ec_cuda.h +++ b/src/components/ec/cuda/ec_cuda.h @@ -11,109 +11,30 @@ #include "components/ec/ucc_ec_log.h" #include "utils/arch/cuda_def.h" #include "utils/ucc_mpool.h" +#include "ec_cuda_resources.h" #include <cuda_runtime.h> #define WARP_SIZE 32 -#define MAX_SUBTASKS 12 - -typedef enum ucc_ec_cuda_strm_task_mode { - UCC_EC_CUDA_TASK_KERNEL, - UCC_EC_CUDA_TASK_MEM_OPS, - UCC_EC_CUDA_TASK_AUTO, - UCC_EC_CUDA_TASK_LAST, -} ucc_ec_cuda_strm_task_mode_t; - -typedef enum ucc_ec_cuda_executor_state { - UCC_EC_CUDA_EXECUTOR_INITIALIZED, - UCC_EC_CUDA_EXECUTOR_POSTED, - UCC_EC_CUDA_EXECUTOR_STARTED, - UCC_EC_CUDA_EXECUTOR_SHUTDOWN, - UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK -} ucc_ec_cuda_executor_state_t; - -typedef enum ucc_ec_cuda_executor_mode { - UCC_EC_CUDA_EXECUTOR_MODE_PERSISTENT, - UCC_EC_CUDA_EXECUTOR_MODE_INTERRUPTIBLE -} ucc_ec_cuda_executor_mode_t; typedef ucc_status_t (*ucc_ec_cuda_task_post_fn) (uint32_t *dev_status, int blocking_wait, cudaStream_t stream); -typedef struct ucc_ec_cuda_config { - ucc_ec_config_t super; - ucc_ec_cuda_strm_task_mode_t strm_task_mode; - unsigned long exec_num_workers; - unsigned long exec_num_threads; - unsigned long exec_max_tasks; - unsigned long exec_num_streams; - unsigned long reduce_num_blocks; - int reduce_num_threads; - int use_cooperative_launch; - unsigned long exec_copy_thresh; -} ucc_ec_cuda_config_t; - typedef struct ucc_ec_cuda { ucc_ec_base_t super; - int stream_initialized; - cudaStream_t stream; int exec_streams_initialized; - cudaStream_t *exec_streams; - ucc_mpool_t events; - ucc_mpool_t executors; - ucc_mpool_t executor_interruptible_tasks; - ucc_mpool_t executor_persistent_tasks; + ucc_ec_cuda_resources_hash_t *resources_hash; ucc_thread_mode_t thread_mode; ucc_ec_cuda_strm_task_mode_t strm_task_mode; ucc_spinlock_t init_spinlock; } ucc_ec_cuda_t; -typedef struct ucc_ec_cuda_event { - cudaEvent_t event; -} ucc_ec_cuda_event_t; - typedef struct ucc_ec_cuda_stream_request { uint32_t status; uint32_t *dev_status; cudaStream_t stream; } ucc_ec_cuda_stream_request_t; -typedef struct ucc_ec_cuda_executor_interruptible_task { - ucc_ee_executor_task_t super; - void *event; - cudaGraph_t graph; - cudaGraphExec_t graph_exec; -} ucc_ec_cuda_executor_interruptible_task_t; - -typedef struct ucc_ec_cuda_executor_persistent_task { -
ucc_ee_executor_task_t super; - int num_subtasks; - ucc_ee_executor_task_args_t *subtasks[MAX_SUBTASKS]; -} ucc_ec_cuda_executor_persistent_task_t; - -typedef struct ucc_ec_cuda_executor_task_ops { - ucc_status_t (*task_post)(ucc_ee_executor_t *executor, - const ucc_ee_executor_task_args_t *task_args, - ucc_ee_executor_task_t **task); - ucc_status_t (*task_test)(const ucc_ee_executor_task_t *task); - ucc_status_t (*task_finalize)(ucc_ee_executor_task_t *task); -} ucc_ec_cuda_executor_task_ops_t; - -typedef struct ucc_ec_cuda_executor { - ucc_ee_executor_t super; - ucc_ec_cuda_executor_mode_t mode; - uint64_t requested_ops; - ucc_ec_cuda_executor_task_ops_t ops; - ucc_spinlock_t tasks_lock; - ucc_ec_cuda_executor_state_t state; - int pidx; - ucc_ee_executor_task_args_t *tasks; - ucc_ec_cuda_executor_state_t *dev_state; - ucc_ee_executor_task_args_t *dev_tasks; - int *dev_pidx; - int *dev_cidx; -} ucc_ec_cuda_executor_t; - ucc_status_t ucc_ec_cuda_event_create(void **event); ucc_status_t ucc_ec_cuda_event_destroy(void *event); @@ -122,6 +43,8 @@ ucc_status_t ucc_ec_cuda_event_post(void *ee_context, void *event); ucc_status_t ucc_ec_cuda_event_test(void *event); +ucc_status_t ucc_ec_cuda_get_resources(ucc_ec_cuda_resources_t **resources); + extern ucc_ec_cuda_t ucc_ec_cuda; #define EC_CUDA_CONFIG \ diff --git a/src/components/ec/cuda/ec_cuda_executor.c b/src/components/ec/cuda/ec_cuda_executor.c index 49ae469140..1349187b71 100644 --- a/src/components/ec/cuda/ec_cuda_executor.c +++ b/src/components/ec/cuda/ec_cuda_executor.c @@ -23,8 +23,16 @@ ucc_status_t ucc_cuda_executor_persistent_wait_stop(ucc_ee_executor_t *executor) ucc_status_t ucc_cuda_executor_init(const ucc_ee_executor_params_t *params, ucc_ee_executor_t **executor) { - ucc_ec_cuda_executor_t *eee = ucc_mpool_get(&ucc_ec_cuda.executors); + ucc_ec_cuda_executor_t *eee; + ucc_ec_cuda_resources_t *resources; + ucc_status_t status; + status = ucc_ec_cuda_get_resources(&resources); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + + eee = ucc_mpool_get(&resources->executors); if (ucc_unlikely(!eee)) { ec_error(&ucc_ec_cuda.super, "failed to allocate executor"); return UCC_ERR_NO_MEMORY; diff --git a/src/components/ec/cuda/ec_cuda_executor_interruptible.c b/src/components/ec/cuda/ec_cuda_executor_interruptible.c index 74cc80b96e..7272b2439f 100644 --- a/src/components/ec/cuda/ec_cuda_executor_interruptible.c +++ b/src/components/ec/cuda/ec_cuda_executor_interruptible.c @@ -9,37 +9,43 @@ ucc_status_t ucc_cuda_executor_interruptible_get_stream(cudaStream_t *stream) { - static uint32_t last_used = 0; - int num_streams = EC_CUDA_CONFIG->exec_num_streams; - ucc_status_t st; - int i, j; - uint32_t id; + static uint32_t last_used = 0; + int num_streams = EC_CUDA_CONFIG->exec_num_streams; + ucc_ec_cuda_resources_t *resources; + ucc_status_t st; + int i, j; + uint32_t id; ucc_assert(num_streams > 0); - if (ucc_unlikely(!ucc_ec_cuda.exec_streams_initialized)) { + st = ucc_ec_cuda_get_resources(&resources); + if (ucc_unlikely(st != UCC_OK)) { + return st; + } + + if (ucc_unlikely(!resources->streams_initialized)) { ucc_spin_lock(&ucc_ec_cuda.init_spinlock); - if (ucc_ec_cuda.exec_streams_initialized) { + if (resources->streams_initialized) { goto unlock; } for(i = 0; i < num_streams; i++) { - st = CUDA_FUNC(cudaStreamCreateWithFlags(&ucc_ec_cuda.exec_streams[i], + st = CUDA_FUNC(cudaStreamCreateWithFlags(&resources->exec_streams[i], cudaStreamNonBlocking)); if (st != UCC_OK) { for (j = 0; j < i; j++) { - 
CUDA_FUNC(cudaStreamDestroy(ucc_ec_cuda.exec_streams[j])); + CUDA_FUNC(cudaStreamDestroy(resources->exec_streams[j])); } ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); return st; } } - ucc_ec_cuda.exec_streams_initialized = 1; + resources->streams_initialized = 1; unlock: ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); } id = ucc_atomic_fadd32(&last_used, 1); - *stream = ucc_ec_cuda.exec_streams[id % num_streams]; + *stream = resources->exec_streams[id % num_streams]; return UCC_OK; } @@ -52,20 +58,25 @@ ucc_cuda_executor_interruptible_task_post(ucc_ee_executor_t *executor, const ucc_ee_executor_task_args_t *task_args, ucc_ee_executor_task_t **task) { - cudaStream_t stream = NULL; + cudaStream_t stream = NULL; + size_t num_nodes = UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; ucc_ec_cuda_executor_interruptible_task_t *ee_task; ucc_status_t status; cudaGraphNode_t nodes[UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS]; - size_t num_nodes = UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; + ucc_ec_cuda_resources_t *resources; int i; + status = ucc_ec_cuda_get_resources(&resources); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } status = ucc_cuda_executor_interruptible_get_stream(&stream); if (ucc_unlikely(status != UCC_OK)) { return status; } - ee_task = ucc_mpool_get(&ucc_ec_cuda.executor_interruptible_tasks); + ee_task = ucc_mpool_get(&resources->executor_interruptible_tasks); if (ucc_unlikely(!ee_task)) { return UCC_ERR_NO_MEMORY; } diff --git a/src/components/ec/cuda/ec_cuda_executor_persistent.c b/src/components/ec/cuda/ec_cuda_executor_persistent.c index b937a89680..c43b132e12 100644 --- a/src/components/ec/cuda/ec_cuda_executor_persistent.c +++ b/src/components/ec/cuda/ec_cuda_executor_persistent.c @@ -18,12 +18,19 @@ ucc_cuda_executor_persistent_task_post(ucc_ee_executor_t *executor, ucc_ee_executor_task_args_t *subtask_args; ucc_ec_cuda_executor_persistent_task_t *ee_task; int i; + ucc_ec_cuda_resources_t *resources; + ucc_status_t status; + + status = ucc_ec_cuda_get_resources(&resources); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } if (ucc_ec_cuda.thread_mode == UCC_THREAD_MULTIPLE) { ucc_spin_lock(&eee->tasks_lock); } - ee_task = ucc_mpool_get(&ucc_ec_cuda.executor_persistent_tasks); + ee_task = ucc_mpool_get(&resources->executor_persistent_tasks); if (ucc_unlikely(!ee_task)) { return UCC_ERR_NO_MEMORY; } diff --git a/src/components/ec/cuda/ec_cuda_resources.c b/src/components/ec/cuda/ec_cuda_resources.c new file mode 100644 index 0000000000..5bc0043f1f --- /dev/null +++ b/src/components/ec/cuda/ec_cuda_resources.c @@ -0,0 +1,197 @@ +#include "ec_cuda_resources.h" +#include "components/ec/ucc_ec_log.h" +#include "utils/ucc_malloc.h" + +static void ucc_ec_cuda_event_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused +{ + ucc_ec_cuda_event_t *base = (ucc_ec_cuda_event_t *) obj; + + CUDA_FUNC(cudaEventCreateWithFlags(&base->event, cudaEventDisableTiming)); +} + +static void ucc_ec_cuda_event_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused +{ + ucc_ec_cuda_event_t *base = (ucc_ec_cuda_event_t *) obj; + + CUDA_FUNC(cudaEventDestroy(base->event)); +} + +static ucc_mpool_ops_t ucc_ec_cuda_event_mpool_ops = { + .chunk_alloc = ucc_mpool_hugetlb_malloc, + .chunk_release = ucc_mpool_hugetlb_free, + .obj_init = ucc_ec_cuda_event_init, + .obj_cleanup = ucc_ec_cuda_event_cleanup, +}; + +static ucc_status_t ucc_ec_cuda_ee_executor_mpool_chunk_malloc(ucc_mpool_t *mp, //NOLINT: mp is unused + size_t *size_p, + void ** chunk_p) +{ + return CUDA_FUNC(cudaHostAlloc((void**)chunk_p, 
*size_p, + cudaHostAllocMapped)); +} + +static void ucc_ec_cuda_ee_executor_mpool_chunk_free(ucc_mpool_t *mp, //NOLINT: mp is unused + void *chunk) +{ + CUDA_FUNC(cudaFreeHost(chunk)); +} + +static void ucc_ec_cuda_executor_chunk_init(ucc_mpool_t *mp, void *obj, //NOLINT: mp is unused + void *chunk) //NOLINT: chunk is unused +{ + ucc_ec_cuda_executor_t *eee = (ucc_ec_cuda_executor_t*) obj; + int max_tasks = ucc_ec_cuda_config->exec_max_tasks; + + CUDA_FUNC(cudaHostGetDevicePointer( + (void**)(&eee->dev_state), (void *)&eee->state, 0)); + CUDA_FUNC(cudaHostGetDevicePointer( + (void**)(&eee->dev_pidx), (void *)&eee->pidx, 0)); + CUDA_FUNC(cudaMalloc((void**)&eee->dev_cidx, sizeof(*eee->dev_cidx))); + CUDA_FUNC(cudaHostAlloc((void**)&eee->tasks, + max_tasks * MAX_SUBTASKS * + sizeof(ucc_ee_executor_task_args_t), + cudaHostAllocMapped)); + CUDA_FUNC(cudaHostGetDevicePointer( + (void**)(&eee->dev_tasks), (void *)eee->tasks, 0)); + ucc_spinlock_init(&eee->tasks_lock, 0); +} + +static void ucc_ec_cuda_executor_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused +{ + ucc_ec_cuda_executor_t *eee = (ucc_ec_cuda_executor_t*) obj; + + CUDA_FUNC(cudaFree((void*)eee->dev_cidx)); + CUDA_FUNC(cudaFreeHost((void*)eee->tasks)); + ucc_spinlock_destroy(&eee->tasks_lock); +} + +static ucc_mpool_ops_t ucc_ec_cuda_ee_executor_mpool_ops = { + .chunk_alloc = ucc_ec_cuda_ee_executor_mpool_chunk_malloc, + .chunk_release = ucc_ec_cuda_ee_executor_mpool_chunk_free, + .obj_init = ucc_ec_cuda_executor_chunk_init, + .obj_cleanup = ucc_ec_cuda_executor_chunk_cleanup, +}; + +static void ucc_ec_cuda_graph_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused +{ + ucc_ec_cuda_executor_interruptible_task_t *task = + (ucc_ec_cuda_executor_interruptible_task_t *) obj; + cudaGraphNode_t memcpy_node; + int i; + + CUDA_FUNC(cudaGraphCreate(&task->graph, 0)); + for (i = 0; i < UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; i++) { + CUDA_FUNC( + cudaGraphAddMemcpyNode1D(&memcpy_node, task->graph, NULL, 0, + (void*)1, (void*)1, 1, cudaMemcpyDefault)); + } + + CUDA_FUNC( + cudaGraphInstantiateWithFlags(&task->graph_exec, task->graph, 0)); +} + +static void ucc_ec_cuda_graph_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused +{ + ucc_ec_cuda_executor_interruptible_task_t *task = + (ucc_ec_cuda_executor_interruptible_task_t *) obj; + + CUDA_FUNC(cudaGraphExecDestroy(task->graph_exec)); + CUDA_FUNC(cudaGraphDestroy(task->graph)); +} + +static ucc_mpool_ops_t ucc_ec_cuda_interruptible_task_mpool_ops = { + .chunk_alloc = ucc_mpool_hugetlb_malloc, + .chunk_release = ucc_mpool_hugetlb_free, + .obj_init = ucc_ec_cuda_graph_init, + .obj_cleanup = ucc_ec_cuda_graph_cleanup, +}; + +ucc_status_t ucc_ec_cuda_resources_init(ucc_ec_base_t *ec, + ucc_ec_cuda_resources_t *resources) +{ + ucc_status_t status; + int num_streams; + + CUDADRV_CHECK(cuCtxGetCurrent(&resources->cu_ctx)); + status = ucc_mpool_init(&resources->events, 0, sizeof(ucc_ec_cuda_event_t), + 0, UCC_CACHE_LINE_SIZE, 16, UINT_MAX, + &ucc_ec_cuda_event_mpool_ops, UCC_THREAD_MULTIPLE, + "CUDA Event Objects"); + if (status != UCC_OK) { + ec_error(ec, "failed to create CUDA events pool"); + goto exit_err; + } + + status = ucc_mpool_init(&resources->executors, 0, + sizeof(ucc_ec_cuda_executor_t), 0, + UCC_CACHE_LINE_SIZE, 16, UINT_MAX, + &ucc_ec_cuda_ee_executor_mpool_ops, + UCC_THREAD_MULTIPLE, "CUDA EE executor objects"); + if (status != UCC_OK) { + ec_error(ec, "failed to create executors pool"); + goto free_events_mpool; + } + + status = 
ucc_mpool_init(&resources->executor_interruptible_tasks, 0, + sizeof(ucc_ec_cuda_executor_interruptible_task_t), + 0, UCC_CACHE_LINE_SIZE, 16, UINT_MAX, + &ucc_ec_cuda_interruptible_task_mpool_ops, + UCC_THREAD_MULTIPLE, "interruptible executor tasks"); + if (status != UCC_OK) { + ec_error(ec, "failed to create interruptible tasks pool"); + goto free_executors_mpool; + } + + status = ucc_mpool_init(&resources->executor_persistent_tasks, 0, + sizeof(ucc_ec_cuda_executor_persistent_task_t), 0, + UCC_CACHE_LINE_SIZE, 16, UINT_MAX, NULL, + UCC_THREAD_MULTIPLE, "persistent executor tasks"); + if (status != UCC_OK) { + ec_error(ec, "failed to create persistent tasks pool"); + goto free_interruptible_tasks_mpool; + } + + num_streams = ucc_ec_cuda_config->exec_num_streams; + resources->exec_streams = ucc_calloc(num_streams, sizeof(cudaStream_t), + "ec cuda streams"); + if (!resources->exec_streams) { + ec_error(ec, "failed to allocate %zd bytes for executor streams", + sizeof(cudaStream_t) * num_streams); + status = UCC_ERR_NO_MEMORY; + goto free_persistent_tasks_mpool; + } + + return UCC_OK; + +free_persistent_tasks_mpool: + ucc_mpool_cleanup(&resources->executor_persistent_tasks, 0); +free_interruptible_tasks_mpool: + ucc_mpool_cleanup(&resources->executor_interruptible_tasks, 0); +free_executors_mpool: + ucc_mpool_cleanup(&resources->executors, 0); +free_events_mpool: + ucc_mpool_cleanup(&resources->events, 0); +exit_err: + return status; +} + +void ucc_ec_cuda_resources_cleanup(ucc_ec_cuda_resources_t *resources) +{ + int i; + CUcontext tmp_context; + + cuCtxPushCurrent(resources->cu_ctx); + for (i = 0; i < ucc_ec_cuda_config->exec_num_streams; i++) { + if (resources->exec_streams[i] != NULL) { + CUDA_FUNC(cudaStreamDestroy(resources->exec_streams[i])); + } + } + ucc_mpool_cleanup(&resources->events, 1); + ucc_mpool_cleanup(&resources->executors, 1); + ucc_mpool_cleanup(&resources->executor_interruptible_tasks, 1); + ucc_mpool_cleanup(&resources->executor_persistent_tasks, 1); + + ucc_free(resources->exec_streams); + cuCtxPopCurrent(&tmp_context); +} diff --git a/src/components/ec/cuda/ec_cuda_resources.h b/src/components/ec/cuda/ec_cuda_resources.h new file mode 100644 index 0000000000..1390f76cdd --- /dev/null +++ b/src/components/ec/cuda/ec_cuda_resources.h @@ -0,0 +1,158 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms.
+ */ + +#ifndef UCC_EC_CUDA_RESOURCES_H_ +#define UCC_EC_CUDA_RESOURCES_H_ + +#include "components/ec/base/ucc_ec_base.h" +#include "utils/arch/cuda_def.h" +#include "utils/ucc_mpool.h" +#include "utils/khash.h" + +#define MAX_SUBTASKS 12 + +typedef enum ucc_ec_cuda_executor_state { + UCC_EC_CUDA_EXECUTOR_INITIALIZED, + UCC_EC_CUDA_EXECUTOR_POSTED, + UCC_EC_CUDA_EXECUTOR_STARTED, + UCC_EC_CUDA_EXECUTOR_SHUTDOWN, + UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK +} ucc_ec_cuda_executor_state_t; + +typedef enum ucc_ec_cuda_executor_mode { + UCC_EC_CUDA_EXECUTOR_MODE_PERSISTENT, + UCC_EC_CUDA_EXECUTOR_MODE_INTERRUPTIBLE +} ucc_ec_cuda_executor_mode_t; + +typedef struct ucc_ec_cuda_event { + cudaEvent_t event; +} ucc_ec_cuda_event_t; + +typedef struct ucc_ec_cuda_executor_task_ops { + ucc_status_t (*task_post)(ucc_ee_executor_t *executor, + const ucc_ee_executor_task_args_t *task_args, + ucc_ee_executor_task_t **task); + ucc_status_t (*task_test)(const ucc_ee_executor_task_t *task); + ucc_status_t (*task_finalize)(ucc_ee_executor_task_t *task); +} ucc_ec_cuda_executor_task_ops_t; + +typedef struct ucc_ec_cuda_executor { + ucc_ee_executor_t super; + ucc_ec_cuda_executor_mode_t mode; + uint64_t requested_ops; + ucc_ec_cuda_executor_task_ops_t ops; + ucc_spinlock_t tasks_lock; + ucc_ec_cuda_executor_state_t state; + int pidx; + ucc_ee_executor_task_args_t *tasks; + ucc_ec_cuda_executor_state_t *dev_state; + ucc_ee_executor_task_args_t *dev_tasks; + int *dev_pidx; + int *dev_cidx; +} ucc_ec_cuda_executor_t; + +typedef struct ucc_ec_cuda_executor_interruptible_task { + ucc_ee_executor_task_t super; + void *event; + cudaGraph_t graph; + cudaGraphExec_t graph_exec; +} ucc_ec_cuda_executor_interruptible_task_t; + +typedef struct ucc_ec_cuda_executor_persistent_task { + ucc_ee_executor_task_t super; + int num_subtasks; + ucc_ee_executor_task_args_t *subtasks[MAX_SUBTASKS]; +} ucc_ec_cuda_executor_persistent_task_t; + +typedef struct ucc_ec_cuda_resources { + CUcontext cu_ctx; + ucc_mpool_t events; + ucc_mpool_t executors; + ucc_mpool_t executor_interruptible_tasks; + ucc_mpool_t executor_persistent_tasks; + int streams_initialized; + int num_streams; + cudaStream_t *exec_streams; +} ucc_ec_cuda_resources_t; + +typedef enum ucc_ec_cuda_strm_task_mode { + UCC_EC_CUDA_TASK_KERNEL, + UCC_EC_CUDA_TASK_MEM_OPS, + UCC_EC_CUDA_TASK_AUTO, + UCC_EC_CUDA_TASK_LAST, +} ucc_ec_cuda_strm_task_mode_t; + +typedef struct ucc_ec_cuda_config { + ucc_ec_config_t super; + ucc_ec_cuda_strm_task_mode_t strm_task_mode; + unsigned long exec_num_workers; + unsigned long exec_num_threads; + unsigned long exec_max_tasks; + unsigned long exec_num_streams; + unsigned long reduce_num_blocks; + int reduce_num_threads; + int use_cooperative_launch; + unsigned long exec_copy_thresh; +} ucc_ec_cuda_config_t; + +extern ucc_ec_cuda_config_t *ucc_ec_cuda_config; + +ucc_status_t ucc_ec_cuda_resources_init(ucc_ec_base_t *ec, + ucc_ec_cuda_resources_t *resources); + +void ucc_ec_cuda_resources_cleanup(ucc_ec_cuda_resources_t *resources); + +KHASH_INIT(ucc_ec_cuda_resources_hash, unsigned long long, void*, 1, \ + kh_int64_hash_func, kh_int64_hash_equal); +#define ucc_ec_cuda_resources_hash_t khash_t(ucc_ec_cuda_resources_hash) + +static inline +void* ec_cuda_resources_hash_get(ucc_ec_cuda_resources_hash_t *h, + unsigned long long key) +{ + khiter_t k; + void *value; + + k = kh_get(ucc_ec_cuda_resources_hash, h , key); + if (k == kh_end(h)) { + return NULL; + } + value = kh_value(h, k); + return value; +} + +static inline +void ec_cuda_resources_hash_put(ucc_ec_cuda_resources_hash_t
*h, + unsigned long long key, + void *value) +{ + int ret; + khiter_t k; + k = kh_put(ucc_ec_cuda_resources_hash, h, key, &ret); + kh_value(h, k) = value; +} + +static inline +void* ec_cuda_resources_hash_pop(ucc_ec_cuda_resources_hash_t *h) +{ + void *resources = NULL; + khiter_t k; + + k = kh_begin(h); + while (k != kh_end(h)) { + if (kh_exist(h, k)) { + resources = kh_value(h, k); + break; + } + k++; + } + + if (resources) { + kh_del(ucc_ec_cuda_resources_hash, h, k); + } + return resources; +} + +#endif diff --git a/src/components/ec/ucc_ec.c b/src/components/ec/ucc_ec.c index af83e301b4..42cc096a0c 100644 --- a/src/components/ec/ucc_ec.c +++ b/src/components/ec/ucc_ec.c @@ -4,6 +4,7 @@ * See file LICENSE for terms. */ +#include <pthread.h> #include "config.h" #include "base/ucc_ec_base.h" #include "ucc_ec.h" @@ -13,6 +14,7 @@ static const ucc_ec_ops_t *ec_ops[UCC_EE_LAST]; static const ucc_ee_executor_ops_t *executor_ops[UCC_EE_LAST]; +static pthread_mutex_t ucc_ec_mutex = PTHREAD_MUTEX_INITIALIZER; #define UCC_CHECK_EC_AVAILABLE(ee) \ do { \ @@ -28,6 +30,7 @@ ucc_status_t ucc_ec_init(const ucc_ec_params_t *ec_params) ucc_status_t status; ucc_ec_attr_t attr; + pthread_mutex_lock(&ucc_ec_mutex); memset(ec_ops, 0, UCC_EE_LAST * sizeof(ucc_ec_ops_t *)); n_ecs = ucc_global_config.ec_framework.n_components; for (i = 0; i < n_ecs; i++) { @@ -62,6 +65,7 @@ ucc_status_t ucc_ec_init(const ucc_ec_params_t *ec_params) attr.field_mask = UCC_EC_ATTR_FIELD_THREAD_MODE; status = ec->get_attr(&attr); if (status != UCC_OK) { + pthread_mutex_unlock(&ucc_ec_mutex); return status; } if (attr.thread_mode < ec_params->thread_mode) { @@ -75,6 +79,7 @@ ucc_status_t ucc_ec_init(const ucc_ec_params_t *ec_params) ec_ops[ec->type] = &ec->ops; executor_ops[ec->type] = &ec->executor_ops; } + pthread_mutex_unlock(&ucc_ec_mutex); return UCC_OK; } @@ -102,6 +107,7 @@ ucc_status_t ucc_ec_finalize() ucc_ee_type_t et; ucc_ec_base_t *ec; + pthread_mutex_lock(&ucc_ec_mutex); for (et = UCC_EE_FIRST; et < UCC_EE_LAST; et++) { if (NULL != ec_ops[et]) { ec = ucc_container_of(ec_ops[et], ucc_ec_base_t, ops); @@ -115,6 +121,7 @@ ucc_status_t ucc_ec_finalize() } } } + pthread_mutex_unlock(&ucc_ec_mutex); return UCC_OK; } diff --git a/src/components/mc/base/ucc_mc_base.h b/src/components/mc/base/ucc_mc_base.h index d6c67a734a..442088a09d 100644 --- a/src/components/mc/base/ucc_mc_base.h +++ b/src/components/mc/base/ucc_mc_base.h @@ -71,7 +71,9 @@ typedef struct ucc_mem_attr { * UCC memory component attributes field mask */ typedef enum ucc_mc_attr_field { - UCC_MC_ATTR_FIELD_THREAD_MODE = UCC_BIT(0) + UCC_MC_ATTR_FIELD_THREAD_MODE = UCC_BIT(0), + /* size of memory pool chunk element */ + UCC_MC_ATTR_FIELD_FAST_ALLOC_SIZE = UCC_BIT(1), } ucc_mc_attr_field_t; typedef struct ucc_mc_attr { @@ -81,6 +83,7 @@ */ uint64_t field_mask; ucc_thread_mode_t thread_mode; + size_t fast_alloc_size; } ucc_mc_attr_t; /** diff --git a/src/components/mc/cuda/Makefile.am b/src/components/mc/cuda/Makefile.am index 1e25e2109a..d8e1dbe55e 100644 --- a/src/components/mc/cuda/Makefile.am +++ b/src/components/mc/cuda/Makefile.am @@ -5,8 +5,10 @@ if HAVE_CUDA sources = \ - mc_cuda.h \ - mc_cuda.c + mc_cuda.h \ + mc_cuda.c \ + mc_cuda_resources.c \ + mc_cuda_resources.h module_LTLIBRARIES = libucc_mc_cuda.la libucc_mc_cuda_la_SOURCES = $(sources) diff --git a/src/components/mc/cuda/mc_cuda.c b/src/components/mc/cuda/mc_cuda.c index 5c820bd768..aa2638b9da 100644 --- a/src/components/mc/cuda/mc_cuda.c +++ b/src/components/mc/cuda/mc_cuda.c @@
-50,8 +50,8 @@ static ucc_status_t ucc_mc_cuda_init(const ucc_mc_params_t *mc_params) int num_devices, driver_ver; cudaError_t cuda_st; - ucc_mc_cuda.stream = NULL; - ucc_mc_cuda.stream_initialized = 0; + ucc_mc_cuda_config = ucc_derived_of(ucc_mc_cuda.super.config, + ucc_mc_cuda_config_t); ucc_strncpy_safe(ucc_mc_cuda.super.config->log_component.name, ucc_mc_cuda.super.super.name, sizeof(ucc_mc_cuda.super.config->log_component.name)); @@ -100,6 +100,7 @@ static ucc_status_t ucc_mc_cuda_init(const ucc_mc_params_t *mc_params) "with driver version %d", driver_ver); } #endif + ucc_mc_cuda.resources_hash = kh_init(ucc_mc_cuda_resources_hash); // lock assures single mpool initiation when multiple threads concurrently execute // different collective operations thus concurrently entering init function. ucc_spinlock_init(&ucc_mc_cuda.init_spinlock, 0); @@ -112,6 +113,13 @@ static ucc_status_t ucc_mc_cuda_get_attr(ucc_mc_attr_t *mc_attr) if (mc_attr->field_mask & UCC_MC_ATTR_FIELD_THREAD_MODE) { mc_attr->thread_mode = ucc_mc_cuda.thread_mode; } + if (mc_attr->field_mask & UCC_MC_ATTR_FIELD_FAST_ALLOC_SIZE) { + if (MC_CUDA_CONFIG->mpool_max_elems > 0) { + mc_attr->fast_alloc_size = MC_CUDA_CONFIG->mpool_elem_size; + } else { + mc_attr->fast_alloc_size = 0; + } + } return UCC_OK; } @@ -120,8 +128,9 @@ static ucc_status_t ucc_mc_cuda_mem_alloc(ucc_mc_buffer_header_t **h_ptr, ucc_memory_type_t mt) { cudaError_t st; - ucc_mc_buffer_header_t *h = - ucc_malloc(sizeof(ucc_mc_buffer_header_t), "mc cuda"); + ucc_mc_buffer_header_t *h; + + h = ucc_malloc(sizeof(ucc_mc_buffer_header_t), "mc cuda"); if (ucc_unlikely(!h)) { mc_error(&ucc_mc_cuda.super, "failed to allocate %zd bytes", sizeof(ucc_mc_buffer_header_t)); @@ -132,13 +141,13 @@ static ucc_status_t ucc_mc_cuda_mem_alloc(ucc_mc_buffer_header_t **h_ptr, cudaMemAttachGlobal); if (ucc_unlikely(st != cudaSuccess)) { cudaGetLastError(); - mc_error(&ucc_mc_cuda.super, - "failed to allocate %zd bytes, " + mc_error(&ucc_mc_cuda.super, "failed to allocate %zd bytes, " "cuda error %d(%s)", size, st, cudaGetErrorString(st)); ucc_free(h); return UCC_ERR_NO_MEMORY; } + h->from_pool = 0; h->mt = UCC_MEMORY_TYPE_CUDA; *h_ptr = h; @@ -151,15 +160,25 @@ static ucc_status_t ucc_mc_cuda_mem_pool_alloc(ucc_mc_buffer_header_t **h_ptr, size_t size, ucc_memory_type_t mt) { - ucc_mc_buffer_header_t *h = NULL; - if (size <= MC_CUDA_CONFIG->mpool_elem_size && - mt != UCC_MEMORY_TYPE_CUDA_MANAGED) { - h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&ucc_mc_cuda.mpool); + ucc_mc_buffer_header_t *h = NULL; + ucc_mc_cuda_resources_t *resources; + ucc_status_t status; + + if ((size <= MC_CUDA_CONFIG->mpool_elem_size) && + (mt != UCC_MEMORY_TYPE_CUDA_MANAGED)) { + status = ucc_mc_cuda_get_resources(&resources); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + + h = (ucc_mc_buffer_header_t *)ucc_mpool_get(&resources->scratch_mpool); } + if (!h) { // Slow path return ucc_mc_cuda_mem_alloc(h_ptr, size, mt); } + if (ucc_unlikely(!h->addr)){ return UCC_ERR_NO_MEMORY; } @@ -168,61 +187,6 @@ static ucc_status_t ucc_mc_cuda_mem_pool_alloc(ucc_mc_buffer_header_t **h_ptr, return UCC_OK; } -static ucc_status_t ucc_mc_cuda_chunk_alloc(ucc_mpool_t *mp, //NOLINT - size_t *size_p, - void **chunk_p) -{ - *chunk_p = ucc_malloc(*size_p, "mc cuda"); - if (!*chunk_p) { - mc_error(&ucc_mc_cuda.super, "failed to allocate %zd bytes", *size_p); - return UCC_ERR_NO_MEMORY; - } - - return UCC_OK; -} - -static void ucc_mc_cuda_chunk_init(ucc_mpool_t *mp, //NOLINT - void *obj, void *chunk) //NOLINT -{ - 
ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj; - cudaError_t st = cudaMalloc(&h->addr, MC_CUDA_CONFIG->mpool_elem_size); - if (st != cudaSuccess) { - // h->addr will be 0 so ucc_mc_cuda_mem_alloc_pool function will - // return UCC_ERR_NO_MEMORY. As such mc_error message is suffice. - cudaGetLastError(); - mc_error(&ucc_mc_cuda.super, - "failed to allocate %zd bytes, " - "cuda error %d(%s)", - MC_CUDA_CONFIG->mpool_elem_size, st, cudaGetErrorString(st)); - } - h->from_pool = 1; - h->mt = UCC_MEMORY_TYPE_CUDA; -} - -static void ucc_mc_cuda_chunk_release(ucc_mpool_t *mp, void *chunk) //NOLINT: mp is unused -{ - ucc_free(chunk); -} - -static void ucc_mc_cuda_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused -{ - ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj; - cudaError_t st; - st = cudaFree(h->addr); - if (st != cudaSuccess) { - cudaGetLastError(); - mc_error(&ucc_mc_cuda.super, - "failed to free mem at %p, " - "cuda error %d(%s)", - obj, st, cudaGetErrorString(st)); - } -} - -static ucc_mpool_ops_t ucc_mc_ops = {.chunk_alloc = ucc_mc_cuda_chunk_alloc, - .chunk_release = ucc_mc_cuda_chunk_release, - .obj_init = ucc_mc_cuda_chunk_init, - .obj_cleanup = ucc_mc_cuda_chunk_cleanup}; - static ucc_status_t ucc_mc_cuda_mem_free(ucc_mc_buffer_header_t *h_ptr) { cudaError_t st; @@ -250,92 +214,72 @@ static ucc_status_t ucc_mc_cuda_mem_pool_free(ucc_mc_buffer_header_t *h_ptr) static ucc_status_t ucc_mc_cuda_mem_pool_alloc_with_init(ucc_mc_buffer_header_t **h_ptr, - size_t size, - ucc_memory_type_t mt) + size_t size, + ucc_memory_type_t mt) { - // lock assures single mpool initiation when multiple threads concurrently execute - // different collective operations thus concurrently entering init function. - ucc_spin_lock(&ucc_mc_cuda.init_spinlock); - if (MC_CUDA_CONFIG->mpool_max_elems == 0) { ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_alloc; ucc_mc_cuda.super.ops.mem_free = ucc_mc_cuda_mem_free; - ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); return ucc_mc_cuda_mem_alloc(h_ptr, size, mt); - } - - if (!ucc_mc_cuda.mpool_init_flag) { - ucc_status_t status = ucc_mpool_init( - &ucc_mc_cuda.mpool, 0, sizeof(ucc_mc_buffer_header_t), 0, - UCC_CACHE_LINE_SIZE, 1, MC_CUDA_CONFIG->mpool_max_elems, - &ucc_mc_ops, ucc_mc_cuda.thread_mode, "mc cuda mpool buffers"); - if (status != UCC_OK) { - ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); - return status; - } + } else { ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc; - ucc_mc_cuda.mpool_init_flag = 1; + ucc_mc_cuda.super.ops.mem_free = ucc_mc_cuda_mem_pool_free; + return ucc_mc_cuda_mem_pool_alloc(h_ptr, size, mt); } - ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); - return ucc_mc_cuda_mem_pool_alloc(h_ptr, size, mt); } static ucc_status_t ucc_mc_cuda_memcpy(void *dst, const void *src, size_t len, ucc_memory_type_t dst_mem, ucc_memory_type_t src_mem) { - cudaError_t st; + ucc_status_t status; + ucc_mc_cuda_resources_t *resources; + ucc_assert(dst_mem == UCC_MEMORY_TYPE_CUDA || src_mem == UCC_MEMORY_TYPE_CUDA || dst_mem == UCC_MEMORY_TYPE_CUDA_MANAGED || src_mem == UCC_MEMORY_TYPE_CUDA_MANAGED); - UCC_MC_CUDA_INIT_STREAM(); - st = cudaMemcpyAsync(dst, src, len, cudaMemcpyDefault, ucc_mc_cuda.stream); - if (ucc_unlikely(st != cudaSuccess)) { - cudaGetLastError(); - mc_error(&ucc_mc_cuda.super, - "failed to launch cudaMemcpyAsync, dst %p, src %p, len %zd " - "cuda error %d(%s)", - dst, src, len, st, cudaGetErrorString(st)); - return UCC_ERR_NO_MESSAGE; + status = ucc_mc_cuda_get_resources(&resources); + if 
(ucc_unlikely(status != UCC_OK)) {
+        return status;
     }
-    st = cudaStreamSynchronize(ucc_mc_cuda.stream);
-    if (ucc_unlikely(st != cudaSuccess)) {
-        cudaGetLastError();
+
+    status = CUDA_FUNC(cudaMemcpyAsync(dst, src, len, cudaMemcpyDefault,
+                                       resources->stream));
+    if (ucc_unlikely(status != UCC_OK)) {
         mc_error(&ucc_mc_cuda.super,
-                 "failed to synchronize mc_cuda.stream "
-                 "cuda error %d(%s)",
-                 st, cudaGetErrorString(st));
-        return UCC_ERR_NO_MESSAGE;
+                 "failed to launch cudaMemcpyAsync, dst %p, src %p, len %zd",
+                 dst, src, len);
+        return status;
     }
-    return UCC_OK;
+
+    status = CUDA_FUNC(cudaStreamSynchronize(resources->stream));
+
+    return status;
 }
 
 ucc_status_t ucc_mc_cuda_memset(void *ptr, int val, size_t len)
 {
-    cudaError_t st;
+    ucc_status_t             status;
+    ucc_mc_cuda_resources_t *resources;
 
-    UCC_MC_CUDA_INIT_STREAM();
-    st = cudaMemsetAsync(ptr, val, len, ucc_mc_cuda.stream);
-    if (ucc_unlikely(st != cudaSuccess)) {
-        cudaGetLastError();
-        mc_error(&ucc_mc_cuda.super,
-                 "failed to launch cudaMemsetAsync, dst %p, len %zd "
-                 "cuda error %d(%s)",
-                 ptr, len, st, cudaGetErrorString(st));
-        return UCC_ERR_NO_MESSAGE;
+    status = ucc_mc_cuda_get_resources(&resources);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
     }
-    st = cudaStreamSynchronize(ucc_mc_cuda.stream);
-    if (ucc_unlikely(st != cudaSuccess)) {
-        cudaGetLastError();
+
+    status = CUDA_FUNC(cudaMemsetAsync(ptr, val, len, resources->stream));
+    if (ucc_unlikely(status != UCC_OK)) {
         mc_error(&ucc_mc_cuda.super,
-                 "failed to synchronize mc_cuda.stream "
-                 "cuda error %d(%s)",
-                 st, cudaGetErrorString(st));
-        return UCC_ERR_NO_MESSAGE;
+                 "failed to launch cudaMemsetAsync, dst %p, len %zd",
+                 ptr, len);
+        return status;
     }
-    return UCC_OK;
+
+    status = CUDA_FUNC(cudaStreamSynchronize(resources->stream));
+
+    return status;
 }
 
 static ucc_status_t ucc_mc_cuda_mem_query(const void *ptr,
@@ -407,17 +351,69 @@ static ucc_status_t ucc_mc_cuda_mem_query(const void *ptr,
     return UCC_OK;
 }
 
-static ucc_status_t ucc_mc_cuda_finalize()
+ucc_status_t ucc_mc_cuda_get_resources(ucc_mc_cuda_resources_t **resources)
 {
-    if (ucc_mc_cuda.stream != NULL) {
-        CUDA_CHECK(cudaStreamDestroy(ucc_mc_cuda.stream));
-        ucc_mc_cuda.stream = NULL;
+    CUcontext              cu_ctx;
+    unsigned long long int cu_ctx_id;
+    ucc_status_t           status;
+
+    status = CUDADRV_FUNC(cuCtxGetCurrent(&cu_ctx));
+    if (ucc_unlikely(status != UCC_OK)) {
+        mc_error(&ucc_mc_cuda.super, "failed to get current CUDA context");
+        return status;
+    }
+
+#if CUDA_VERSION < 12000
+    cu_ctx_id = 1;
+#else
+    status = CUDADRV_FUNC(cuCtxGetId(cu_ctx, &cu_ctx_id));
+    if (ucc_unlikely(status != UCC_OK)) {
+        mc_error(&ucc_mc_cuda.super, "failed to get current CUDA context ID");
+        return status;
+    }
+#endif
+
+    *resources = mc_cuda_resources_hash_get(ucc_mc_cuda.resources_hash,
+                                            cu_ctx_id);
+    if (ucc_unlikely(*resources == NULL)) {
+        ucc_spin_lock(&ucc_mc_cuda.init_spinlock);
+        *resources = mc_cuda_resources_hash_get(ucc_mc_cuda.resources_hash,
+                                                cu_ctx_id);
+        if (*resources == NULL) {
+            *resources = ucc_malloc(sizeof(ucc_mc_cuda_resources_t),
+                                    "mc cuda resources");
+            if (*resources == NULL) {
+                mc_error(&ucc_mc_cuda.super,
+                         "failed to allocate %zd bytes for resources",
+                         sizeof(ucc_mc_cuda_resources_t));
+                ucc_spin_unlock(&ucc_mc_cuda.init_spinlock);
+                return UCC_ERR_NO_MEMORY;
+            }
+            status = ucc_mc_cuda_resources_init(&ucc_mc_cuda.super,
+                                                *resources);
+            if (status != UCC_OK) {
+                ucc_free(*resources);
+                ucc_spin_unlock(&ucc_mc_cuda.init_spinlock);
+                return status;
+            }
+            mc_cuda_resources_hash_put(ucc_mc_cuda.resources_hash, 
cu_ctx_id, + *resources); + } + ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); } - if (ucc_mc_cuda.mpool_init_flag) { - ucc_mpool_cleanup(&ucc_mc_cuda.mpool, 1); - ucc_mc_cuda.mpool_init_flag = 0; - ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc_with_init; + return UCC_OK; +} + +static ucc_status_t ucc_mc_cuda_finalize() +{ + ucc_mc_cuda_resources_t *resources; + + resources = mc_cuda_resources_hash_pop(ucc_mc_cuda.resources_hash); + while (resources) { + ucc_mc_cuda_resources_cleanup(resources); + resources = mc_cuda_resources_hash_pop(ucc_mc_cuda.resources_hash); } + + ucc_mc_cuda.super.ops.mem_alloc = ucc_mc_cuda_mem_pool_alloc_with_init; ucc_spinlock_destroy(&ucc_mc_cuda.init_spinlock); return UCC_OK; } @@ -443,8 +439,9 @@ ucc_mc_cuda_t ucc_mc_cuda = { .table = ucc_mc_cuda_config_table, .size = sizeof(ucc_mc_cuda_config_t), }, - .mpool_init_flag = 0, }; +ucc_mc_cuda_config_t *ucc_mc_cuda_config; + UCC_CONFIG_REGISTER_TABLE_ENTRY(&ucc_mc_cuda.super.config_table, &ucc_config_global_list); diff --git a/src/components/mc/cuda/mc_cuda.h b/src/components/mc/cuda/mc_cuda.h index abc82312c2..10779c27cb 100644 --- a/src/components/mc/cuda/mc_cuda.h +++ b/src/components/mc/cuda/mc_cuda.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -7,29 +7,18 @@ #ifndef UCC_MC_CUDA_H_ #define UCC_MC_CUDA_H_ -#include +#include #include "components/mc/base/ucc_mc_base.h" #include "components/mc/ucc_mc_log.h" #include "utils/ucc_mpool.h" #include "utils/arch/cuda_def.h" -#include - -typedef struct ucc_mc_cuda_config { - ucc_mc_config_t super; - size_t mpool_elem_size; - int mpool_max_elems; -} ucc_mc_cuda_config_t; +#include "mc_cuda_resources.h" typedef struct ucc_mc_cuda { ucc_mc_base_t super; - int stream_initialized; - cudaStream_t stream; - ucc_mpool_t events; - ucc_mpool_t strm_reqs; - ucc_mpool_t mpool; - int mpool_init_flag; ucc_spinlock_t init_spinlock; ucc_thread_mode_t thread_mode; + ucc_mc_cuda_resources_hash_t *resources_hash; } ucc_mc_cuda_t; extern ucc_mc_cuda_t ucc_mc_cuda; @@ -37,21 +26,7 @@ extern ucc_mc_cuda_t ucc_mc_cuda; #define MC_CUDA_CONFIG \ (ucc_derived_of(ucc_mc_cuda.super.config, ucc_mc_cuda_config_t)) -#define UCC_MC_CUDA_INIT_STREAM() do { \ - if (!ucc_mc_cuda.stream_initialized) { \ - cudaError_t cuda_st = cudaSuccess; \ - ucc_spin_lock(&ucc_mc_cuda.init_spinlock); \ - if (!ucc_mc_cuda.stream_initialized) { \ - cuda_st = cudaStreamCreateWithFlags(&ucc_mc_cuda.stream, \ - cudaStreamNonBlocking); \ - ucc_mc_cuda.stream_initialized = 1; \ - } \ - ucc_spin_unlock(&ucc_mc_cuda.init_spinlock); \ - if (ucc_unlikely(cudaSuccess != cuda_st)) { \ - return cuda_error_to_ucc_status(cuda_st); \ - } \ - } \ -} while(0) +ucc_status_t ucc_mc_cuda_get_resources(ucc_mc_cuda_resources_t **resources); ucc_status_t ucc_mc_cuda_memset(void *ptr, int val, size_t len); diff --git a/src/components/mc/cuda/mc_cuda_resources.c b/src/components/mc/cuda/mc_cuda_resources.c new file mode 100644 index 0000000000..398b83784e --- /dev/null +++ b/src/components/mc/cuda/mc_cuda_resources.c @@ -0,0 +1,92 @@ +#include "mc_cuda_resources.h" +#include "components/mc/ucc_mc_log.h" +#include "utils/ucc_malloc.h" + +static ucc_status_t ucc_mc_cuda_chunk_alloc(ucc_mpool_t *mp, //NOLINT + size_t *size_p, + void **chunk_p) +{ + *chunk_p = ucc_malloc(*size_p, "mc cuda"); + if (!*chunk_p) { + return UCC_ERR_NO_MEMORY; + } + + return UCC_OK; +} + 
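
For readers unfamiliar with the pattern used by ucc_mc_cuda_get_resources in mc_cuda.c above: resources are created once per CUDA context ID and cached in a hash, with an unlocked fast-path lookup and a locked re-check before creation (double-checked locking). Below is a minimal standalone sketch of that lookup-or-create idiom; the names (rsrc_t, rsrc_get) are hypothetical, and a plain pthread mutex plus a linked list stand in for ucc_spinlock and khash:

    #include <pthread.h>
    #include <stdlib.h>

    /* Hypothetical stand-in for ucc_mc_cuda_resources_t. */
    typedef struct rsrc {
        unsigned long long ctx_id; /* key: CUDA context ID */
        struct rsrc       *next;
    } rsrc_t;

    static rsrc_t         *rsrc_list;                             /* the "hash" */
    static pthread_mutex_t rsrc_lock = PTHREAD_MUTEX_INITIALIZER; /* the "init_spinlock" */

    static rsrc_t *rsrc_find(unsigned long long id)
    {
        rsrc_t *r;

        for (r = rsrc_list; r != NULL; r = r->next) {
            if (r->ctx_id == id) {
                return r;
            }
        }
        return NULL;
    }

    /* Lookup-or-create with double-checked locking. */
    rsrc_t *rsrc_get(unsigned long long id)
    {
        rsrc_t *r = rsrc_find(id);   /* fast path: no lock taken */

        if (r == NULL) {
            pthread_mutex_lock(&rsrc_lock);
            /* Re-check: another thread may have created the entry
             * between our miss and acquiring the lock. */
            r = rsrc_find(id);
            if (r == NULL) {
                r = calloc(1, sizeof(*r));
                if (r != NULL) {
                    r->ctx_id = id;        /* initialize fully first ...      */
                    r->next   = rsrc_list;
                    rsrc_list = r;         /* ... then publish with one store */
                }
            }
            pthread_mutex_unlock(&rsrc_lock);
        }
        return r;
    }

The re-check under the lock is what prevents two threads that both missed on the fast path from each creating and inserting an entry. Note that the sketch keeps the unlocked fast-path read tolerable by publishing a fully initialized node with a single pointer store; an unlocked kh_get racing with a rehashing kh_put, as in the patch, is only safe under additional assumptions about how the component is used.
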
+static void ucc_mc_cuda_chunk_init(ucc_mpool_t *mp, //NOLINT
+                                   void *obj, void *chunk) //NOLINT
+{
+    ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj;
+    cudaError_t st;
+
+    st = cudaMalloc(&h->addr, ucc_mc_cuda_config->mpool_elem_size);
+    if (st != cudaSuccess) {
+        // h->addr will be 0, so ucc_mc_cuda_mem_pool_alloc will return
+        // UCC_ERR_NO_MEMORY; just reset the CUDA error state here.
+        cudaGetLastError();
+    }
+    h->from_pool = 1;
+    h->mt        = UCC_MEMORY_TYPE_CUDA;
+}
+
+static void ucc_mc_cuda_chunk_release(ucc_mpool_t *mp, void *chunk) //NOLINT: mp is unused
+{
+    ucc_free(chunk);
+}
+
+static void ucc_mc_cuda_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused
+{
+    ucc_mc_buffer_header_t *h = (ucc_mc_buffer_header_t *)obj;
+    cudaError_t st;
+
+    st = cudaFree(h->addr);
+    if (st != cudaSuccess) {
+        cudaGetLastError();
+    }
+}
+
+static ucc_mpool_ops_t ucc_mc_ops = {.chunk_alloc   = ucc_mc_cuda_chunk_alloc,
+                                     .chunk_release = ucc_mc_cuda_chunk_release,
+                                     .obj_init      = ucc_mc_cuda_chunk_init,
+                                     .obj_cleanup   = ucc_mc_cuda_chunk_cleanup};
+
+ucc_status_t ucc_mc_cuda_resources_init(ucc_mc_base_t *mc,
+                                        ucc_mc_cuda_resources_t *resources)
+{
+    ucc_status_t status;
+
+    CUDADRV_CHECK(cuCtxGetCurrent(&resources->cu_ctx));
+    status = ucc_mpool_init(&resources->scratch_mpool, 0,
+                            sizeof(ucc_mc_buffer_header_t), 0,
+                            UCC_CACHE_LINE_SIZE, 1,
+                            ucc_mc_cuda_config->mpool_max_elems, &ucc_mc_ops,
+                            UCC_THREAD_MULTIPLE, "mc cuda mpool buffers");
+    if (status != UCC_OK) {
+        mc_error(mc, "failed to create scratch buffers mpool");
+        return status;
+    }
+
+    status = CUDA_FUNC(cudaStreamCreateWithFlags(&resources->stream,
+                                                 cudaStreamNonBlocking));
+    if (status != UCC_OK) {
+        mc_error(mc, "failed to create CUDA stream");
+        goto free_scratch_mpool;
+    }
+
+    return UCC_OK;
+
+free_scratch_mpool:
+    ucc_mpool_cleanup(&resources->scratch_mpool, 0);
+    return status;
+}
+
+void ucc_mc_cuda_resources_cleanup(ucc_mc_cuda_resources_t *resources)
+{
+    CUcontext tmp_context;
+
+    cuCtxPushCurrent(resources->cu_ctx);
+    ucc_mpool_cleanup(&resources->scratch_mpool, 1);
+    CUDA_FUNC(cudaStreamDestroy(resources->stream));
+    cuCtxPopCurrent(&tmp_context);
+}
diff --git a/src/components/mc/cuda/mc_cuda_resources.h b/src/components/mc/cuda/mc_cuda_resources.h
new file mode 100644
index 0000000000..557effe3c0
--- /dev/null
+++ b/src/components/mc/cuda/mc_cuda_resources.h
@@ -0,0 +1,84 @@
+/**
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */ + +#ifndef UCC_MC_CUDA_RESOURCES_H_ +#define UCC_MC_CUDA_RESOURCES_H_ + +#include "components/mc/base/ucc_mc_base.h" +#include "utils/arch/cuda_def.h" +#include "utils/ucc_mpool.h" + +typedef struct ucc_mc_cuda_config { + ucc_mc_config_t super; + size_t mpool_elem_size; + int mpool_max_elems; +} ucc_mc_cuda_config_t; + +typedef struct ucc_mc_cuda_resources { + CUcontext cu_ctx; + cudaStream_t stream; + ucc_mpool_t scratch_mpool; +} ucc_mc_cuda_resources_t; + +extern ucc_mc_cuda_config_t *ucc_mc_cuda_config; + +ucc_status_t ucc_mc_cuda_resources_init(ucc_mc_base_t *mc, + ucc_mc_cuda_resources_t *resources); + +void ucc_mc_cuda_resources_cleanup(ucc_mc_cuda_resources_t *resources); + +KHASH_INIT(ucc_mc_cuda_resources_hash, unsigned long long, void*, 1, \ + kh_int64_hash_func, kh_int64_hash_equal); +#define ucc_mc_cuda_resources_hash_t khash_t(ucc_mc_cuda_resources_hash) + +static inline +void* mc_cuda_resources_hash_get(ucc_mc_cuda_resources_hash_t *h, + unsigned long long key) +{ + khiter_t k; + void *value; + + k = kh_get(ucc_mc_cuda_resources_hash, h , key); + if (k == kh_end(h)) { + return NULL; + } + value = kh_value(h, k); + return value; +} + +static inline +void mc_cuda_resources_hash_put(ucc_mc_cuda_resources_hash_t *h, + unsigned long long key, + void *value) +{ + int ret; + khiter_t k; + k = kh_put(ucc_mc_cuda_resources_hash, h, key, &ret); + kh_value(h, k) = value; +} + +static inline +void* mc_cuda_resources_hash_pop(ucc_mc_cuda_resources_hash_t *h) +{ + void *resources = NULL; + khiter_t k; + + k = kh_begin(h); + while (k != kh_end(h)) { + if (kh_exist(h, k)) { + resources = kh_value(h, k); + break; + } + k++; + } + + if (resources) { + kh_del(ucc_mc_cuda_resources_hash, h, k); + } + return resources; +} + +#endif diff --git a/src/components/mc/ucc_mc.c b/src/components/mc/ucc_mc.c index 2e8208d0f0..ad3de3a94b 100644 --- a/src/components/mc/ucc_mc.c +++ b/src/components/mc/ucc_mc.c @@ -121,6 +121,17 @@ ucc_status_t ucc_mc_get_mem_attr(const void *ptr, ucc_mem_attr_t *mem_attr) return UCC_OK; } +ucc_status_t ucc_mc_get_attr(ucc_mc_attr_t *attr, ucc_memory_type_t mem_type) +{ + ucc_memory_type_t mt = (mem_type == UCC_MEMORY_TYPE_CUDA_MANAGED) ? + UCC_MEMORY_TYPE_CUDA : mem_type; + ucc_mc_base_t *mc; + + UCC_CHECK_MC_AVAILABLE(mt); + mc = ucc_container_of(mc_ops[mt], ucc_mc_base_t, ops); + return mc->get_attr(attr); +} + UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_alloc, (h_ptr, size, mem_type), ucc_mc_buffer_header_t **h_ptr, size_t size, ucc_memory_type_t mem_type) @@ -134,8 +145,11 @@ UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_alloc, (h_ptr, size, mem_type), ucc_status_t ucc_mc_free(ucc_mc_buffer_header_t *h_ptr) { - UCC_CHECK_MC_AVAILABLE(h_ptr->mt); - return mc_ops[h_ptr->mt]->mem_free(h_ptr); + ucc_memory_type_t mt = (h_ptr->mt == UCC_MEMORY_TYPE_CUDA_MANAGED) ? 
+ UCC_MEMORY_TYPE_CUDA : h_ptr->mt; + + UCC_CHECK_MC_AVAILABLE(mt); + return mc_ops[mt]->mem_free(h_ptr); } UCC_MC_PROFILE_FUNC(ucc_status_t, ucc_mc_memcpy, diff --git a/src/components/mc/ucc_mc.h b/src/components/mc/ucc_mc.h index e0ce1030c8..e98396b2f7 100644 --- a/src/components/mc/ucc_mc.h +++ b/src/components/mc/ucc_mc.h @@ -24,6 +24,8 @@ ucc_status_t ucc_mc_available(ucc_memory_type_t mem_type); */ ucc_status_t ucc_mc_get_mem_attr(const void *ptr, ucc_mem_attr_t *mem_attr); +ucc_status_t ucc_mc_get_attr(ucc_mc_attr_t *attr, ucc_memory_type_t mem_type); + ucc_status_t ucc_mc_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_memory_type_t mem_type); diff --git a/src/components/tl/cuda/tl_cuda_topo.c b/src/components/tl/cuda/tl_cuda_topo.c index 96862e921e..a0f54d57e6 100644 --- a/src/components/tl/cuda/tl_cuda_topo.c +++ b/src/components/tl/cuda/tl_cuda_topo.c @@ -220,7 +220,8 @@ static ucc_status_t ucc_tl_cuda_topo_graph_create(ucc_tl_cuda_topo_t *topo) ucc_tl_cuda_topo_dev_type_t dev_type; ucc_tl_cuda_device_pci_id_t pci_id; ucc_tl_cuda_topo_node_t *node, *peer_node; - int num_gpus, num_nvlinks, link, i; + int num_nvlinks, link, i; + unsigned int num_gpus; nvmlReturn_t nvml_st; nvml_st = nvmlInit_v2(); diff --git a/src/components/tl/mlx5/Makefile.am b/src/components/tl/mlx5/Makefile.am index 11aec4e5b6..2ac9dc91c7 100644 --- a/src/components/tl/mlx5/Makefile.am +++ b/src/components/tl/mlx5/Makefile.am @@ -23,6 +23,7 @@ mcast = \ mcast/p2p/ucc_tl_mlx5_mcast_p2p.c \ mcast/tl_mlx5_mcast_progress.h \ mcast/tl_mlx5_mcast_helper.h \ + mcast/tl_mlx5_mcast_helper.c \ mcast/tl_mlx5_mcast_team.c sources = \ diff --git a/src/components/tl/mlx5/alltoall/alltoall_mkeys.c b/src/components/tl/mlx5/alltoall/alltoall_mkeys.c index 7dd90d49b8..0fa197e6c7 100644 --- a/src/components/tl/mlx5/alltoall/alltoall_mkeys.c +++ b/src/components/tl/mlx5/alltoall/alltoall_mkeys.c @@ -217,7 +217,6 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team, if (!node->ops[i].send_mkeys) { tl_error(lib, "failed to malloc"); goto err_malloc; - return UCC_ERR_NO_MEMORY; } node->ops[i].recv_mkeys = (struct mlx5dv_mkey **)ucc_malloc( sizeof(struct mlx5dv_mkey *) * a2a->max_num_of_columns); @@ -230,7 +229,7 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team, status = create_master_key(node->sbgp->group_size + 1, a2a->pd, &node->ops[i].send_mkeys[j], lib); if (status != UCC_OK) { - tl_error(lib, " failed to create send masterkey [%d,%d]", i, j); + tl_error(lib, "failed to create send masterkey [%d,%d]", i, j); goto err_create_mkey; } status = create_master_key(node->sbgp->group_size + 1, a2a->pd, diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_context.c b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_context.c index 90014d1400..ad32c459b0 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_context.c +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_context.c @@ -10,9 +10,242 @@ #include #include "core/ucc_service_coll.h" #include "tl_mlx5.h" +#include "tl_mlx5_mcast_helper.h" +#include "tl_mlx5_mcast_rcache.h" -ucc_status_t ucc_tl_mlx5_mcast_context_init(ucc_tl_mlx5_mcast_context_t *context, /* NOLINT */ - ucc_tl_mlx5_mcast_ctx_params_t *mcast_ctx_conf /* NOLINT */) +#define UCC_TL_MLX5_MCAST_MAX_MTU_COUNT 5 +int mtu_lookup[UCC_TL_MLX5_MCAST_MAX_MTU_COUNT][2] = { + {256, IBV_MTU_256}, + {512, IBV_MTU_512}, + {1024, IBV_MTU_1024}, + {2048, IBV_MTU_2048}, + {4096, IBV_MTU_4096} +}; + +ucc_status_t ucc_tl_mlx5_mcast_context_init(ucc_tl_mlx5_mcast_context_t *context, + 
ucc_tl_mlx5_mcast_ctx_params_t *mcast_ctx_conf) { + ucc_status_t status = UCC_OK; + struct ibv_device **device_list = NULL; + struct ibv_device *dev = NULL; + char *devname = NULL; + int is_ipv4 = 0; + struct sockaddr_in *in_src_addr = NULL; + struct rdma_cm_event *revent = NULL; + char *ib = NULL; + char *ib_name = NULL; + char *port = NULL; + int active_mtu = 4096; + int max_mtu = 4096; + ucc_tl_mlx5_mcast_coll_context_t *ctx = NULL; + struct ibv_port_attr port_attr; + struct ibv_device_attr device_attr; + struct sockaddr_storage ip_oib_addr; + struct sockaddr_storage dst_addr; + int num_devices; + char addrstr[128]; + ucc_tl_mlx5_context_t *mlx5_ctx; + ucc_base_lib_t *lib; + int i; + int user_provided_ib; + int ib_valid; + const char *dst; + + ctx = &(context->mcast_context); + memset(ctx, 0, sizeof(ucc_tl_mlx5_mcast_coll_context_t)); + memcpy(&ctx->params, mcast_ctx_conf, sizeof(ucc_tl_mlx5_mcast_ctx_params_t)); + + mlx5_ctx = ucc_container_of(context, ucc_tl_mlx5_context_t, mcast); + lib = mlx5_ctx->super.super.lib; + ctx->lib = lib; + + /* TODO unify all the contexts under TL mlx5 */ + device_list = ibv_get_device_list(&num_devices); + if (!device_list || !num_devices) { + tl_debug(lib, "no ib devices available"); + status = UCC_ERR_NOT_SUPPORTED; + goto error; + } + + if (!strcmp(mcast_ctx_conf->ib_dev_name, "")) { + dev = device_list[0]; + devname = (char *)ibv_get_device_name(dev); + ctx->devname = ucc_malloc(strlen(devname)+3, "devname"); + if (!ctx->devname) { + status = UCC_ERR_NO_MEMORY; + goto error; + } + memset(ctx->devname, 0, strlen(devname)+3); + memcpy(ctx->devname, devname, strlen(devname)); + strncat(ctx->devname, ":1", 3); + user_provided_ib = 0; + } else { + ib_valid = 0; + /* user has provided the devname now make sure it is valid */ + for (i = 0; device_list[i]; ++i) { + if (!strcmp(ibv_get_device_name(device_list[i]), mcast_ctx_conf->ib_dev_name)) { + ib_valid = 1; + break; + } + } + if (!ib_valid) { + tl_warn(lib, "ib device %s not found", mcast_ctx_conf->ib_dev_name); + status = UCC_ERR_NOT_FOUND; + ibv_free_device_list(device_list); + goto error; + } + ctx->devname = mcast_ctx_conf->ib_dev_name; + user_provided_ib = 1; + } + + ibv_free_device_list(device_list); + + status = ucc_tl_mlx5_probe_ip_over_ib(ctx->devname, &ip_oib_addr); + if (UCC_OK != status) { + tl_debug(lib, "failed to get ipoib interface for devname %s", ctx->devname); + if (!user_provided_ib) { + ucc_free(ctx->devname); + } + goto error; + } + + is_ipv4 = (ip_oib_addr.ss_family == AF_INET) ? 1 : 0; + in_src_addr = (struct sockaddr_in*)&ip_oib_addr; + + dst = inet_ntop((is_ipv4) ? AF_INET : AF_INET6, + &in_src_addr->sin_addr, addrstr, sizeof(addrstr) - 1); + if (NULL == dst) { + tl_error(lib, "inet_ntop failed"); + status = UCC_ERR_NO_RESOURCE; + goto error; + } + + tl_debug(ctx->lib, "devname %s, ipoib %s", ctx->devname, addrstr); + + ctx->channel = rdma_create_event_channel(); + if (!ctx->channel) { + tl_debug(lib, "rdma_create_event_channel failed, errno %d", errno); + status = UCC_ERR_NO_RESOURCE; + goto error; + } + + memset(&dst_addr, 0, sizeof(struct sockaddr_storage)); + dst_addr.ss_family = is_ipv4 ? 
AF_INET : AF_INET6;
+    if (rdma_create_id(ctx->channel, &ctx->id, NULL, RDMA_PS_UDP)) {
+        tl_debug(lib, "failed to create rdma id, errno %d", errno);
+        status = UCC_ERR_NOT_SUPPORTED;
+        goto error;
+    }
+
+    if (0 != rdma_resolve_addr(ctx->id, (struct sockaddr *)&ip_oib_addr,
+                               (struct sockaddr *)&dst_addr, 1000)) {
+        tl_debug(lib, "failed to resolve rdma addr, errno %d", errno);
+        status = UCC_ERR_NOT_SUPPORTED;
+        goto error;
+    }
+
+    if (rdma_get_cm_event(ctx->channel, &revent) < 0) {
+        tl_error(lib, "failed to get cm event, errno %d", errno);
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    } else if (revent->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
+        tl_error(lib, "cm event is not resolved");
+        if (rdma_ack_cm_event(revent) < 0) {
+            tl_error(lib, "rdma_ack_cm_event failed");
+        }
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    }
+
+    if (rdma_ack_cm_event(revent) < 0) {
+        tl_error(lib, "rdma_ack_cm_event failed");
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    }
+
+    ctx->ctx = ctx->id->verbs;
+    ctx->pd  = ibv_alloc_pd(ctx->ctx);
+    if (!ctx->pd) {
+        tl_error(lib, "failed to allocate pd");
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    }
+
+    ib = strdup(ctx->devname);
+    ucc_string_split(ib, ":", 2, &ib_name, &port);
+    ctx->ib_port = atoi(port);
+    ucc_free(ib);
+
+    /* Determine MTU */
+    if (ibv_query_port(ctx->ctx, ctx->ib_port, &port_attr)) {
+        tl_error(lib, "couldn't query port in ctx create, errno %d", errno);
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    }
+
+    for (i = 0; i < UCC_TL_MLX5_MCAST_MAX_MTU_COUNT; i++) {
+        if (mtu_lookup[i][1] == port_attr.max_mtu) {
+            max_mtu = mtu_lookup[i][0];
+        }
+        if (mtu_lookup[i][1] == port_attr.active_mtu) {
+            active_mtu = mtu_lookup[i][0];
+        }
+    }
+
+    ctx->mtu = active_mtu;
+
+    tl_debug(ctx->lib, "port active MTU is %d and port max MTU is %d",
+             active_mtu, max_mtu);
+
+    if (port_attr.max_mtu < port_attr.active_mtu) {
+        tl_debug(ctx->lib, "port max MTU (%d) is smaller than port active MTU (%d)",
+                 max_mtu, active_mtu);
+    }
+
+    if (ibv_query_device(ctx->ctx, &device_attr)) {
+        tl_error(lib, "failed to query device in ctx create, errno %d", errno);
+        status = UCC_ERR_NO_RESOURCE;
+        goto error;
+    }
+
+    tl_debug(ctx->lib, "MTU %d, MAX QP WR: %d, max srq_wr: %d, max cq: %d, max cqe: %d",
+             ctx->mtu, device_attr.max_qp_wr, device_attr.max_srq_wr,
+             device_attr.max_cq, device_attr.max_cqe);
+
+    ctx->max_qp_wr = device_attr.max_qp_wr;
+    status = ucc_mpool_init(&ctx->compl_objects_mp, 0, sizeof(ucc_tl_mlx5_mcast_p2p_completion_obj_t), 0,
+                            UCC_CACHE_LINE_SIZE, 8, UINT_MAX,
+                            &ucc_coll_task_mpool_ops,
+                            UCC_THREAD_SINGLE,
+                            "ucc_tl_mlx5_mcast_p2p_completion_obj_t");
+    if (ucc_unlikely(UCC_OK != status)) {
+        tl_error(lib, "failed to initialize compl_objects_mp mpool");
+        status = UCC_ERR_NO_MEMORY;
+        goto error;
+    }
+
+    ctx->rcache = NULL;
+    status      = ucc_tl_mlx5_mcast_setup_rcache(ctx);
+    if (UCC_OK != status) {
+        tl_error(lib, "failed to setup rcache");
+        goto error;
+    }
+
+    tl_debug(ctx->lib, "multicast context setup complete: ctx %p", ctx);
+    return UCC_OK;
+
+error:
+    if (ctx->pd) {
+        ibv_dealloc_pd(ctx->pd);
+    }
+    if (ctx->id) {
+        rdma_destroy_id(ctx->id);
+    }
+    if (ctx->channel) {
+        rdma_destroy_event_channel(ctx->channel);
+    }
+
+    return status;
+}
diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c
new file mode 100644
index 0000000000..8c52a63c73
--- /dev/null
+++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c
@@ -0,0 +1,561 @@
+/**
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION 
& AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "tl_mlx5_mcast_helper.h"
+#include <glob.h>
+#include <net/if.h>
+#include <ifaddrs.h>
+
+#define PREF        "/sys/class/net/"
+#define SUFF        "/device/resource"
+#define MAX_STR_LEN 128
+
+static ucc_status_t ucc_tl_mlx5_get_ipoib_ip(char *ifname, struct sockaddr_storage *addr)
+{
+    ucc_status_t    status  = UCC_ERR_NO_RESOURCE;
+    struct ifaddrs *ifaddr  = NULL;
+    struct ifaddrs *ifa     = NULL;
+    int             is_ipv4 = 0;
+    int             family;
+    int             n;
+    int             is_up;
+
+    if (getifaddrs(&ifaddr) == -1) {
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    for (ifa = ifaddr, n = 0; ifa != NULL; ifa = ifa->ifa_next, n++) {
+        if (ifa->ifa_addr == NULL) {
+            continue;
+        }
+
+        family = ifa->ifa_addr->sa_family;
+        if (family != AF_INET && family != AF_INET6) {
+            continue;
+        }
+
+        is_up   = (ifa->ifa_flags & IFF_UP) == IFF_UP;
+        is_ipv4 = (family == AF_INET) ? 1 : 0;
+
+        if (is_up && !strncmp(ifa->ifa_name, ifname, strlen(ifname))) {
+            if (is_ipv4) {
+                memcpy((struct sockaddr_in *)addr,
+                       (struct sockaddr_in *)ifa->ifa_addr,
+                       sizeof(struct sockaddr_in));
+            } else {
+                memcpy((struct sockaddr_in6 *)addr,
+                       (struct sockaddr_in6 *)ifa->ifa_addr,
+                       sizeof(struct sockaddr_in6));
+            }
+
+            status = UCC_OK;
+            break;
+        }
+    }
+
+    freeifaddrs(ifaddr);
+    return status;
+}
+
+static int cmp_files(char *f1, char *f2)
+{
+    int   answer = 0;
+    FILE *fp1;
+    FILE *fp2;
+    int   ch1;
+    int   ch2;
+
+    if ((fp1 = fopen(f1, "r")) == NULL) {
+        goto out;
+    } else if ((fp2 = fopen(f2, "r")) == NULL) {
+        goto close;
+    }
+
+    do {
+        ch1 = getc(fp1);
+        ch2 = getc(fp2);
+    } while ((ch1 != EOF) && (ch2 != EOF) && (ch1 == ch2));
+
+    if (ch1 == ch2) {
+        answer = 1;
+    }
+
+    if (fclose(fp2) != 0) {
+        return 0;
+    }
+close:
+    if (fclose(fp1) != 0) {
+        return 0;
+    }
+out:
+    return answer;
+}
+
+static int port_from_file(char *port_file)
+{
+    int   res = -1;
+    char  buf1[MAX_STR_LEN];
+    char  buf2[MAX_STR_LEN];
+    FILE *fp;
+    int   len;
+
+    if ((fp = fopen(port_file, "r")) == NULL) {
+        return -1;
+    }
+
+    if (fgets(buf1, MAX_STR_LEN - 1, fp) == NULL) {
+        goto out;
+    }
+
+    len = strlen(buf1) - 2;
+    strncpy(buf2, buf1 + 2, len);
+    buf2[len] = 0;
+    res       = atoi(buf2);
+
+out:
+    if (fclose(fp) != 0) {
+        return -1;
+    }
+    return res;
+}
+
+static ucc_status_t dev2if(char *dev_name, char *port, struct sockaddr_storage
+                           *rdma_src_addr)
+{
+    ucc_status_t status  = UCC_OK;
+    glob_t       glob_el = {0,};
+    char         dev_file [MAX_STR_LEN];
+    char         port_file[MAX_STR_LEN];
+    char         net_file [MAX_STR_LEN];
+    char         if_name  [MAX_STR_LEN];
+    char         glob_path[MAX_STR_LEN];
+    int          i;
+    char       **p;
+    int          len;
+
+    sprintf(glob_path, PREF"*");
+
+    sprintf(dev_file, "/sys/class/infiniband/%s"SUFF, dev_name);
+    if (glob(glob_path, 0, 0, &glob_el)) {
+        return UCC_ERR_NO_RESOURCE;
+    }
+    p = glob_el.gl_pathv;
+
+    if (glob_el.gl_pathc >= 1) {
+        for (i = 0; i < glob_el.gl_pathc; i++, p++) {
+            sprintf(port_file, "%s/dev_id", *p);
+            sprintf(net_file, "%s"SUFF, *p);
+            if (cmp_files(net_file, dev_file) && port != NULL &&
+                port_from_file(port_file) == atoi(port) - 1) {
+                len = strlen(net_file) - strlen(PREF) - strlen(SUFF);
+                strncpy(if_name, net_file + strlen(PREF), len);
+                if_name[len] = 0;
+
+                status = ucc_tl_mlx5_get_ipoib_ip(if_name, rdma_src_addr);
+                if (UCC_OK == status) {
+                    break;
+                }
+            }
+        }
+    }
+
+    globfree(&glob_el);
+    return status;
+}
+
+ucc_status_t ucc_tl_mlx5_probe_ip_over_ib(char* ib_dev, struct
+                                          sockaddr_storage *addr)
+{
+    char                   *ib_name = NULL;
+    char                   *port    = NULL;
+    char                   *ib      = NULL;
+    ucc_status_t            status;
+    struct sockaddr_storage rdma_src_addr;
+
+    if (ib_dev == NULL) {
+        
return UCC_ERR_NO_RESOURCE; + } + + ib = strdup(ib_dev); + if (!ib) { + return UCC_ERR_NO_MEMORY; + } + + ucc_string_split(ib, ":", 2, &ib_name, &port); + status = dev2if(ib_name, port, &rdma_src_addr); + + if (UCC_OK == status) { + *addr = rdma_src_addr; + } + ucc_free(ib); + + return status; +} + +ucc_status_t ucc_tl_mlx5_mcast_join_mcast_post(ucc_tl_mlx5_mcast_coll_context_t *ctx, + struct sockaddr_in6 *net_addr, + int is_root) +{ + char buf[40]; + const char *dst; + + dst = inet_ntop(AF_INET6, net_addr, buf, 40); + if (NULL == dst) { + tl_error(ctx->lib, "inet_ntop failed"); + return UCC_ERR_NO_RESOURCE; + } + + tl_debug(ctx->lib, "joining addr: %s is_root %d", buf, is_root); + + if (rdma_join_multicast(ctx->id, (struct sockaddr*)net_addr, NULL)) { + tl_error(ctx->lib, "rdma_join_multicast failed errno %d", errno); + return UCC_ERR_NO_RESOURCE; + } + + return UCC_OK; +} + +ucc_status_t ucc_tl_mlx5_mcast_join_mcast_test(ucc_tl_mlx5_mcast_coll_context_t *ctx, + struct rdma_cm_event **event, + int is_root) +{ + char buf[40]; + const char *dst; + + if (rdma_get_cm_event(ctx->channel, event) < 0) { + if (EINTR != errno) { + tl_error(ctx->lib, "rdma_get_cm_event failed, errno %d %s", + errno, strerror(errno)); + return UCC_ERR_NO_RESOURCE; + } else { + return UCC_INPROGRESS; + } + } + + if (RDMA_CM_EVENT_MULTICAST_JOIN != (*event)->event) { + tl_error(ctx->lib, "failed to join multicast, is_root %d. unexpected event was" + " received: event=%d, str=%s, status=%d", + is_root, (*event)->event, rdma_event_str((*event)->event), + (*event)->status); + if (rdma_ack_cm_event(*event) < 0) { + tl_error(ctx->lib, "rdma_ack_cm_event failed"); + } + return UCC_ERR_NO_RESOURCE; + } + + dst = inet_ntop(AF_INET6, (*event)->param.ud.ah_attr.grh.dgid.raw, buf, 40); + if (NULL == dst) { + tl_error(ctx->lib, "inet_ntop failed"); + return UCC_ERR_NO_RESOURCE; + } + + tl_debug(ctx->lib, "is_root %d: joined dgid: %s, mlid 0x%x, sl %d", is_root, buf, + (*event)->param.ud.ah_attr.dlid, (*event)->param.ud.ah_attr.sl); + + return UCC_OK; + +} + +ucc_status_t ucc_tl_mlx5_setup_mcast_group_join_post(ucc_tl_mlx5_mcast_coll_comm_t *comm) +{ + ucc_status_t status; + struct sockaddr_in6 net_addr = {0,}; + + if (comm->rank == 0) { + net_addr.sin6_family = AF_INET6; + net_addr.sin6_flowinfo = comm->comm_id; + + status = ucc_tl_mlx5_mcast_join_mcast_post(comm->ctx, &net_addr, 1); + if (status < 0) { + tl_error(comm->lib, "rank 0 is unable to join mcast group"); + return status; + } + } + + return UCC_OK; +} + +ucc_status_t ucc_tl_mlx5_mcast_init_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, + ucc_tl_mlx5_mcast_coll_comm_t *comm) +{ + struct ibv_qp_init_attr qp_init_attr = {0}; + + qp_init_attr.qp_type = IBV_QPT_UD; + qp_init_attr.send_cq = comm->scq; + qp_init_attr.recv_cq = comm->rcq; + qp_init_attr.sq_sig_all = 0; + qp_init_attr.cap.max_send_wr = comm->params.sx_depth; + qp_init_attr.cap.max_recv_wr = comm->params.rx_depth; + qp_init_attr.cap.max_inline_data = comm->params.sx_inline; + qp_init_attr.cap.max_send_sge = comm->params.sx_sge; + qp_init_attr.cap.max_recv_sge = comm->params.rx_sge; + + comm->mcast.qp = ibv_create_qp(ctx->pd, &qp_init_attr); + if (!comm->mcast.qp) { + tl_error(ctx->lib, "failed to create mcast qp, errno %d", errno); + return UCC_ERR_NO_RESOURCE; + } + + comm->max_inline = qp_init_attr.cap.max_inline_data; + + return UCC_OK; +} + +static ucc_status_t ucc_tl_mlx5_mcast_create_ah(ucc_tl_mlx5_mcast_coll_comm_t *comm) +{ + struct ibv_ah_attr ah_attr = { + .is_global = 1, + .grh = {.sgid_index = 0}, + .dlid 
= comm->mcast_lid,
+        .sl            = DEF_SL,
+        .src_path_bits = DEF_SRC_PATH_BITS,
+        .port_num      = comm->ctx->ib_port
+    };
+
+    memcpy(ah_attr.grh.dgid.raw, &comm->mgid, sizeof(ah_attr.grh.dgid.raw));
+
+    comm->mcast.ah = ibv_create_ah(comm->ctx->pd, &ah_attr);
+    if (!comm->mcast.ah) {
+        tl_error(comm->lib, "failed to create AH");
+        return UCC_ERR_NO_RESOURCE;
+    }
+    return UCC_OK;
+}
+
+ucc_status_t ucc_tl_mlx5_mcast_setup_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx,
+                                         ucc_tl_mlx5_mcast_coll_comm_t *comm)
+{
+    struct ibv_port_attr port_attr;
+    struct ibv_qp_attr   attr;
+    uint16_t             pkey;
+
+    ibv_query_port(ctx->ctx, ctx->ib_port, &port_attr);
+
+    for (ctx->pkey_index = 0; ctx->pkey_index < port_attr.pkey_tbl_len;
+         ++ctx->pkey_index) {
+        ibv_query_pkey(ctx->ctx, ctx->ib_port, ctx->pkey_index, &pkey);
+        if (pkey == DEF_PKEY)
+            break;
+    }
+
+    if (ctx->pkey_index >= port_attr.pkey_tbl_len) {
+        ctx->pkey_index = 0;
+        ibv_query_pkey(ctx->ctx, ctx->ib_port, ctx->pkey_index, &pkey);
+        if (!pkey) {
+            tl_error(ctx->lib, "cannot find valid PKEY");
+            return UCC_ERR_NO_RESOURCE;
+        }
+
+        tl_debug(ctx->lib, "cannot find default pkey 0x%04x on port %d, using "
+                 "index 0 pkey:0x%04x", DEF_PKEY, ctx->ib_port, pkey);
+    }
+
+    attr.qp_state   = IBV_QPS_INIT;
+    attr.pkey_index = ctx->pkey_index;
+    attr.port_num   = ctx->ib_port;
+    attr.qkey       = DEF_QKEY;
+
+    if (ibv_modify_qp(comm->mcast.qp, &attr,
+                      IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY)) {
+        tl_error(ctx->lib, "failed to move mcast qp to INIT, errno %d", errno);
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    if (ibv_attach_mcast(comm->mcast.qp, &comm->mgid, comm->mcast_lid)) {
+        tl_error(ctx->lib, "failed to attach QP to the mcast group, errno %d", errno);
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    /* Ok, now cycle to RTR on everyone */
+    attr.qp_state = IBV_QPS_RTR;
+    if (ibv_modify_qp(comm->mcast.qp, &attr, IBV_QP_STATE)) {
+        tl_error(ctx->lib, "failed to modify QP to RTR, errno %d", errno);
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    attr.qp_state = IBV_QPS_RTS;
+    attr.sq_psn   = DEF_PSN;
+    if (ibv_modify_qp(comm->mcast.qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) {
+        tl_error(ctx->lib, "failed to modify QP to RTS, errno %d", errno);
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    /* Create the address handle */
+    if (UCC_OK != ucc_tl_mlx5_mcast_create_ah(comm)) {
+        tl_error(ctx->lib, "failed to create address handle");
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    return UCC_OK;
+}
+
+ucc_status_t ucc_tl_mlx5_fini_mcast_group(ucc_tl_mlx5_mcast_coll_context_t *ctx,
+                                          ucc_tl_mlx5_mcast_coll_comm_t *comm)
+{
+    char        buf[40];
+    const char *dst;
+
+    dst = inet_ntop(AF_INET6, &comm->mcast_addr, buf, 40);
+    if (NULL == dst) {
+        tl_error(comm->lib, "inet_ntop failed");
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    tl_debug(ctx->lib, "mcast leave: ctx %p, comm %p, dgid: %s", ctx, comm, buf);
+
+    if (rdma_leave_multicast(ctx->id, (struct sockaddr*)&comm->mcast_addr)) {
+        tl_error(comm->lib, "mcast rdma_leave_multicast failed");
+        return UCC_ERR_NO_RESOURCE;
+    }
+
+    return UCC_OK;
+}
+
+ucc_status_t ucc_tl_mlx5_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm)
+{
+    int          ret;
+    ucc_status_t status;
+
+    tl_debug(comm->lib, "cleaning mcast comm: %p, id %d, mlid %x",
+             comm, comm->comm_id, comm->mcast_lid);
+
+    if (UCC_OK != (status = ucc_tl_mlx5_mcast_reliable(comm))) {
+        // TODO handle (UCC_INPROGRESS == ret)
+        tl_error(comm->lib, "couldn't clean mcast team: reliability progress status %d",
+                 status);
+        return status;
+    }
+
+    if (comm->mcast.qp) {
+        ret = ibv_detach_mcast(comm->mcast.qp, &comm->mgid, 
comm->mcast_lid); + if (ret) { + tl_error(comm->lib, "couldn't detach QP, ret %d, errno %d", ret, errno); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->mcast.qp) { + ret = ibv_destroy_qp(comm->mcast.qp); + if (ret) { + tl_error(comm->lib, "failed to destroy QP %d", ret); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->rcq) { + ret = ibv_destroy_cq(comm->rcq); + if (ret) { + tl_error(comm->lib, "couldn't destroy rcq"); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->scq) { + ret = ibv_destroy_cq(comm->scq); + if (ret) { + tl_error(comm->lib, "couldn't destroy scq"); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->grh_mr) { + ret = ibv_dereg_mr(comm->grh_mr); + if (ret) { + tl_error(comm->lib, "couldn't destroy grh mr"); + return UCC_ERR_NO_RESOURCE; + } + } + if (comm->grh_buf) { + ucc_free(comm->grh_buf); + } + + if (comm->pp) { + ucc_free(comm->pp); + } + + if (comm->pp_mr) { + ret = ibv_dereg_mr(comm->pp_mr); + if (ret) { + tl_error(comm->lib, "couldn't destroy pp mr"); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->pp_buf) { + ucc_free(comm->pp_buf); + } + + if (comm->call_rwr) { + ucc_free(comm->call_rwr); + } + + if (comm->call_rsgs) { + ucc_free(comm->call_rsgs); + } + + if (comm->mcast.ah) { + ret = ibv_destroy_ah(comm->mcast.ah); + if (ret) { + tl_error(comm->lib, "couldn't destroy ah"); + return UCC_ERR_NO_RESOURCE; + } + } + + if (comm->mcast_lid) { + status = ucc_tl_mlx5_fini_mcast_group(comm->ctx, comm); + if (status) { + tl_error(comm->lib, "couldn't leave mcast group"); + return status; + } + } + + if (comm->ctx->params.print_nack_stats) { + tl_debug(comm->lib, "comm_id %d, comm_size %d, comm->psn %d, rank %d, " + "nacks counter %d, n_mcast_rel %d", + comm->comm_id, comm->commsize, comm->psn, comm->rank, + comm->nacks_counter, comm->n_mcast_reliable); + } + + if (comm->p2p_ctx != NULL) { + ucc_free(comm->p2p_ctx); + } + + ucc_free(comm); + + return UCC_OK; +} + +ucc_status_t ucc_tl_mlx5_clean_mcast_ctx(ucc_tl_mlx5_mcast_coll_context_t *ctx) +{ + tl_debug(ctx->lib, "cleaning mcast ctx: %p", ctx); + + if (ctx->rcache) { + ucc_rcache_destroy(ctx->rcache); + } + + if (ctx->pd) { + if (ibv_dealloc_pd(ctx->pd)) { + tl_error(ctx->lib, "ibv_dealloc_pd failed errno %d", errno); + return UCC_ERR_NO_RESOURCE; + } + } + + if (rdma_destroy_id(ctx->id)) { + tl_error(ctx->lib, "rdma_destroy_id failed errno %d", errno); + return UCC_ERR_NO_RESOURCE; + } + + rdma_destroy_event_channel(ctx->channel); + + if (!strcmp(ctx->params.ib_dev_name, "")) { + ucc_free(ctx->devname); + } + + ucc_free(ctx); + + return UCC_OK; +} diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h index 9ca529f7b9..05037e495f 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h @@ -352,7 +352,10 @@ static inline ucc_status_t ucc_tl_mlx5_mcast_reliable(ucc_tl_mlx5_mcast_coll_com return UCC_INPROGRESS; } -ucc_status_t ucc_tl_setup_mcast(ucc_tl_mlx5_mcast_coll_comm_t *comm); +ucc_status_t ucc_tl_mlx5_probe_ip_over_ib(char* ib_dev_list, + struct sockaddr_storage *addr); + +ucc_status_t ucc_tl_mlx5_setup_mcast(ucc_tl_mlx5_mcast_coll_comm_t *comm); ucc_status_t ucc_tl_mlx5_mcast_init_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, ucc_tl_mlx5_mcast_coll_comm_t *comm); @@ -360,6 +363,6 @@ ucc_status_t ucc_tl_mlx5_mcast_init_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, ucc_status_t ucc_tl_mlx5_mcast_setup_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, ucc_tl_mlx5_mcast_coll_comm_t 
*comm); -ucc_status_t ucc_tl_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm); +ucc_status_t ucc_tl_mlx5_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm); #endif /* TL_MLX5_MCAST_HELPER_H_ */ diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_rcache.c b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_rcache.c index c67a2d3179..75c62ac81f 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_rcache.c +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_rcache.c @@ -19,7 +19,7 @@ static ucs_status_t ucc_tl_mlx5_mcast_coll_reg_mr(ucc_tl_mlx5_mcast_coll_context tl_error(ctx->lib, "failed to register MR"); return UCS_ERR_NO_MEMORY; } - + return UCS_OK; } @@ -33,7 +33,7 @@ static ucc_status_t ucc_tl_mlx5_mcast_coll_dereg_mr(ucc_tl_mlx5_mcast_coll_conte } tl_debug(ctx->lib, "external memory deregister: mr %p", mr); - + if (ibv_dereg_mr(mr)) { tl_error(ctx->lib, "couldn't destroy mr %p", mr); return UCC_ERR_NO_RESOURCE; @@ -140,12 +140,10 @@ ucc_status_t ucc_tl_mlx5_mcast_setup_rcache(ucc_tl_mlx5_mcast_coll_context_t *ct { ucc_rcache_params_t rcache_params; - rcache_params.alignment = 64; rcache_params.ucm_event_priority = 1000; rcache_params.max_regions = ULONG_MAX; rcache_params.max_size = SIZE_MAX; rcache_params.region_struct_size = sizeof(ucc_tl_mlx5_mcast_rcache_region_t); - rcache_params.max_alignment = ucc_get_page_size(); rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED | UCM_EVENT_MEM_TYPE_FREE; rcache_params.context = ctx; diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c index 31044fe8b3..f56bc3c1a1 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_team.c @@ -8,6 +8,7 @@ #include "tl_mlx5.h" #include "tl_mlx5_mcast_coll.h" #include "coll_score/ucc_coll_score.h" +#include "tl_mlx5_mcast_helper.h" ucc_status_t ucc_tl_mlx5_mcast_team_init(ucc_base_context_t *base_context, /* NOLINT */ ucc_tl_mlx5_mcast_team_t **mcast_team, /* NOLINT */ @@ -18,3 +19,112 @@ ucc_status_t ucc_tl_mlx5_mcast_team_init(ucc_base_context_t *base_cont return UCC_OK; } +ucc_status_t ucc_tl_mlx5_mcast_coll_setup_comm_resources(ucc_tl_mlx5_mcast_coll_comm_t *comm) +{ + ucc_status_t status; + size_t page_size; + int buf_size, i, ret; + + status = ucc_tl_mlx5_mcast_init_qps(comm->ctx, comm); + if (UCC_OK != status) { + goto error; + } + + status = ucc_tl_mlx5_mcast_setup_qps(comm->ctx, comm); + if (UCC_OK != status) { + goto error; + } + + page_size = ucc_get_page_size(); + buf_size = comm->ctx->mtu; + + // Comm receiving buffers. 
+ ret = posix_memalign((void**)&comm->call_rwr, page_size, sizeof(struct ibv_recv_wr) * + comm->params.rx_depth); + if (ret) { + tl_error(comm->ctx->lib, "posix_memalign failed"); + return UCC_ERR_NO_MEMORY; + } + + ret = posix_memalign((void**)&comm->call_rsgs, page_size, sizeof(struct ibv_sge) * + comm->params.rx_depth * 2); + if (ret) { + tl_error(comm->ctx->lib, "posix_memalign failed"); + return UCC_ERR_NO_MEMORY; + } + + comm->pending_recv = 0; + comm->buf_n = comm->params.rx_depth * 2; + + ret = posix_memalign((void**) &comm->pp_buf, page_size, buf_size * comm->buf_n); + if (ret) { + tl_error(comm->ctx->lib, "posix_memalign failed"); + return UCC_ERR_NO_MEMORY; + } + + memset(comm->pp_buf, 0, buf_size * comm->buf_n); + + comm->pp_mr = ibv_reg_mr(comm->ctx->pd, comm->pp_buf, buf_size * comm->buf_n, + IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); + if (!comm->pp_mr) { + tl_error(comm->ctx->lib, "could not register pp_buf mr, errno %d", errno); + status = UCC_ERR_NO_MEMORY; + goto error; + } + + ret = posix_memalign((void**) &comm->pp, page_size, sizeof(struct + pp_packet) * comm->buf_n); + if (ret) { + tl_error(comm->ctx->lib, "posix_memalign failed"); + return UCC_ERR_NO_MEMORY; + } + + for (i = 0; i < comm->buf_n; i++) { + ucc_list_head_init(&comm->pp[i].super); + + comm->pp[i].buf = (uintptr_t) comm->pp_buf + i * buf_size; + comm->pp[i].context = 0; + + ucc_list_add_tail(&comm->bpool, &comm->pp[i].super); + } + + comm->mcast.swr.wr.ud.ah = comm->mcast.ah; + comm->mcast.swr.num_sge = 1; + comm->mcast.swr.sg_list = &comm->mcast.ssg; + comm->mcast.swr.opcode = IBV_WR_SEND_WITH_IMM; + comm->mcast.swr.wr.ud.remote_qpn = MULTICAST_QPN; + comm->mcast.swr.wr.ud.remote_qkey = DEF_QKEY; + comm->mcast.swr.next = NULL; + + for (i = 0; i < comm->params.rx_depth; i++) { + comm->call_rwr[i].sg_list = &comm->call_rsgs[2 * i]; + comm->call_rwr[i].num_sge = 2; + comm->call_rwr[i].wr_id = MCAST_BCASTRECV_WR; + comm->call_rsgs[2 * i].length = GRH_LENGTH; + comm->call_rsgs[2 * i].addr = (uintptr_t)comm->grh_buf; + comm->call_rsgs[2 * i].lkey = comm->grh_mr->lkey; + comm->call_rsgs[2 * i + 1].lkey = comm->pp_mr->lkey; + comm->call_rsgs[2 * i + 1].length = comm->max_per_packet; + } + + status = ucc_tl_mlx5_mcast_post_recv_buffers(comm); + if (UCC_OK != status) { + goto error; + } + + memset(comm->parents, 0, sizeof(comm->parents)); + memset(comm->children, 0, sizeof(comm->children)); + + comm->nacks_counter = 0; + comm->tx = 0; + comm->n_prep_reliable = 0; + comm->n_mcast_reliable = 0; + comm->reliable_in_progress = 0; + comm->recv_drop_packet_in_progress = 0; + + return status; + +error: + ucc_tl_mlx5_clean_mcast_comm(comm); + return status; +} diff --git a/src/components/tl/mlx5/tl_mlx5.c b/src/components/tl/mlx5/tl_mlx5.c index bab4808ece..0210f2302c 100644 --- a/src/components/tl/mlx5/tl_mlx5.c +++ b/src/components/tl/mlx5/tl_mlx5.c @@ -67,6 +67,27 @@ static ucc_config_field_t ucc_tl_mlx5_lib_config_table[] = { ucc_offsetof(ucc_tl_mlx5_lib_config_t, qp_conf.qp_max_atomic), UCC_CONFIG_TYPE_UINT}, + {"MCAST_SX_DEPTH", "512", "Send context depth of the Mcast comm", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.sx_depth), + UCC_CONFIG_TYPE_INT}, + + {"MCAST_SX_INLINE", "128", "Minimal size for inline data send in Mcast", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.sx_inline), + UCC_CONFIG_TYPE_MEMUNITS}, + + {"MCAST_RX_DEPTH", "4096", "Recv context depth of the Mcast comm", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.rx_depth), + UCC_CONFIG_TYPE_INT}, + + 
{"MCAST_POST_RECV_THRESH", "64", + "Threshold for posting recv into rx ctx of the Mcast comm", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.post_recv_thresh), + UCC_CONFIG_TYPE_INT}, + + {"MCAST_WINDOW_SIZE", "64", "Reliability Mcast window size", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.wsize), + UCC_CONFIG_TYPE_INT}, + {NULL}}; static ucc_config_field_t ucc_tl_mlx5_context_config_table[] = { @@ -77,6 +98,14 @@ static ucc_config_field_t ucc_tl_mlx5_context_config_table[] = { ucc_offsetof(ucc_tl_mlx5_context_config_t, devices), UCC_CONFIG_TYPE_STRING_ARRAY}, + {"MCAST_TIMEOUT", "10000", "Timeout [usec] for the reliability NACK in Mcast", + ucc_offsetof(ucc_tl_mlx5_context_config_t, mcast_ctx_conf.timeout), + UCC_CONFIG_TYPE_INT}, + + {"MCAST_NET_DEVICE", "", "Specifies which network device to use for Mcast", + ucc_offsetof(ucc_tl_mlx5_context_config_t, mcast_ctx_conf.ib_dev_name), + UCC_CONFIG_TYPE_STRING}, + {NULL}}; UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_mlx5_lib_t, ucc_base_lib_t, diff --git a/src/components/tl/mlx5/tl_mlx5_context.c b/src/components/tl/mlx5/tl_mlx5_context.c index 0c56ff9390..5ac7b59f7d 100644 --- a/src/components/tl/mlx5/tl_mlx5_context.c +++ b/src/components/tl/mlx5/tl_mlx5_context.c @@ -14,6 +14,7 @@ #include "tl_mlx5_ib.h" #define PD_OWNER_RANK 0 +#define TL_MLX5_IB_PORT_INVALID -1 UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t, const ucc_base_context_params_t *params, @@ -210,7 +211,8 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context) if (!ctx->is_imported) { status = ucc_tl_mlx5_ib_ctx_pd_init(ctx); if (UCC_OK != status) { - goto err_ib_ctx_pd_init; + ctx->ib_port = TL_MLX5_IB_PORT_INVALID; + goto start_bcast; } if (UCC_SBGP_NOT_EXISTS == sbgp->status) { goto topo_ppn_1; @@ -228,21 +230,20 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context) tl_debug(context->lib, "failed to create tmp file for socket path"); sock_path[0] = '\0'; } - sbcast_data->ib_port = ctx->ib_port; memcpy(sbcast_data->sock_path, sock_path, sizeof(sock_path)); } +start_bcast: + sbcast_data->ib_port = ctx->ib_port; steam = core_ctx->service_team; - s.map = sbgp->map; s.myrank = sbgp->group_rank; - status = UCC_TL_TEAM_IFACE(steam)->scoll.bcast( + status = UCC_TL_TEAM_IFACE(steam)->scoll.bcast( &steam->super, sbcast_data, sbcast_data_length, PD_OWNER_RANK, s, &req); if (UCC_OK != status) { tl_debug(context->lib, "failed to start mlx5 ctx bcast"); goto err; } - while (UCC_INPROGRESS == (status = ucc_collective_test(&req->super))) { ucc_context_progress(core_ctx); } @@ -256,9 +257,15 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context) ctx->ib_port = sbcast_data->ib_port; memcpy(sock_path, sbcast_data->sock_path, sizeof(sock_path)); + if (ctx->ib_port == TL_MLX5_IB_PORT_INVALID) { + tl_debug(context->lib, "invalid ib port received"); + status = UCC_ERR_NO_RESOURCE; + goto err_ib_ctx_pd_init; + } + if (strlen(sock_path) == 0) { tl_debug(context->lib, "failed to share ctx and pd"); - status = UCC_ERR_NO_MESSAGE; + status = UCC_ERR_NO_RESOURCE; goto err; } status = ucc_tl_mlx5_share_ctx_pd(ctx, sock_path, sbgp->group_size, diff --git a/src/components/tl/mlx5/tl_mlx5_pd.c b/src/components/tl/mlx5/tl_mlx5_pd.c index a553dbc5f5..bf98352883 100644 --- a/src/components/tl/mlx5/tl_mlx5_pd.c +++ b/src/components/tl/mlx5/tl_mlx5_pd.c @@ -263,7 +263,8 @@ ucc_status_t ucc_tl_mlx5_share_ctx_pd(ucc_tl_mlx5_context_t *ctx, } static void ucc_tl_mlx5_context_barrier(ucc_context_oob_coll_t *oob, - ucc_base_lib_t *lib) + 
ucc_context_t *core_ctx,
+                                         ucc_base_lib_t *lib)
 {
     char *rbuf;
     char  sbuf;
@@ -284,6 +285,7 @@ static void ucc_tl_mlx5_context_barrier(ucc_context_oob_coll_t *oob,
         oob->allgather(&sbuf, rbuf, sizeof(char), oob->coll_info, &req)) {
         ucc_assert(req != NULL);
         while (UCC_OK != (status = oob->req_test(req))) {
+            ucc_context_progress(core_ctx);
             if (status < 0) {
                 tl_debug(lib, "failed to test oob req");
                 break;
@@ -303,7 +305,8 @@ ucc_status_t ucc_tl_mlx5_remove_shared_ctx_pd(ucc_tl_mlx5_context_t *ctx)
     if (ctx->shared_pd && ctx->is_imported) {
         ibv_unimport_pd(ctx->shared_pd);
     }
-    ucc_tl_mlx5_context_barrier(&UCC_TL_CTX_OOB(ctx), lib);
+    ucc_tl_mlx5_context_barrier(&UCC_TL_CTX_OOB(ctx),
+                                ctx->super.super.ucc_context, lib);
     if (ctx->shared_pd && !ctx->is_imported) {
         err = ibv_dealloc_pd(ctx->shared_pd);
         if (err) {
diff --git a/src/components/tl/mlx5/tl_mlx5_rcache.c b/src/components/tl/mlx5/tl_mlx5_rcache.c
index 1414c82d15..d6f2aa47d8 100644
--- a/src/components/tl/mlx5/tl_mlx5_rcache.c
+++ b/src/components/tl/mlx5/tl_mlx5_rcache.c
@@ -63,8 +63,6 @@ ucc_status_t tl_mlx5_rcache_create(ucc_tl_mlx5_context_t *ctx)
     ucc_rcache_params_t rcache_params;
 
     rcache_params.region_struct_size = sizeof(ucc_tl_mlx5_rcache_region_t);
-    rcache_params.alignment          = UCS_PGT_ADDR_ALIGN;
-    rcache_params.max_alignment      = ucc_get_page_size();
     rcache_params.ucm_event_priority = 1000;
     rcache_params.context            = (void *)ctx;
     rcache_params.ops                = &ucc_rcache_ops;
diff --git a/src/components/tl/mlx5/tl_mlx5_team.c b/src/components/tl/mlx5/tl_mlx5_team.c
index 712691078f..b326166674 100644
--- a/src/components/tl/mlx5/tl_mlx5_team.c
+++ b/src/components/tl/mlx5/tl_mlx5_team.c
@@ -66,7 +66,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_team_t, ucc_base_context_t *tl_context,
     }
 
     self->a2a = NULL;
-    status    = ucc_tl_mlx5_team_init_alltoall(self);
+    status = ucc_tl_mlx5_team_init_alltoall(self);
     if (UCC_OK != status) {
         return status;
     }
@@ -105,9 +105,8 @@ ucc_status_t ucc_tl_mlx5_team_create_test(ucc_base_team_t *team)
 {
     ucc_tl_mlx5_team_t *tl_team   = ucc_derived_of(team, ucc_tl_mlx5_team_t);
     ucc_team_t         *core_team = UCC_TL_CORE_TEAM(tl_team);
-    ucc_subset_t        subset    = {.map.type   = UCC_EP_MAP_FULL,
-                                     .map.ep_num = core_team->size,
-                                     .myrank     = core_team->rank};
+    ucc_subset_t        subset    = {.map    = UCC_TL_TEAM_MAP(tl_team),
+                                     .myrank = UCC_TL_TEAM_RANK(tl_team)};
     ucc_status_t        status    = UCC_OK;
 
     switch (tl_team->state) {
diff --git a/src/components/tl/nccl/tl_nccl.c b/src/components/tl/nccl/tl_nccl.c
index 8e71cdc1e2..46fdcff8e3 100644
--- a/src/components/tl/nccl/tl_nccl.c
+++ b/src/components/tl/nccl/tl_nccl.c
@@ -39,12 +39,17 @@ static ucs_config_field_t ucc_tl_nccl_context_config_table[] = {
      UCS_CONFIG_TYPE_ENUM(ucc_tl_nccl_completion_sync_names)
     },
 
-    {"BLOCKING", "1",
-     "If set to 0 will use non-blocking mode communicator behavior, "
-     "if set to 1 will use blocking mode",
+    {"BLOCKING", "yes",
+     "If set to 'no', the communicator will use non-blocking behavior; "
+     "if set to 'yes', it will use blocking mode",
      ucs_offsetof(ucc_tl_nccl_context_config_t, nccl_cfg_blocking),
      UCS_CONFIG_TYPE_BOOL},
 
+    {"LAZY_INIT", "yes",
+     "Initialize NCCL communicator on first collective",
+     ucc_offsetof(ucc_tl_nccl_context_config_t, nccl_lazy_init),
+     UCC_CONFIG_TYPE_BOOL},
+
     {NULL}};
 
 UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_nccl_lib_t, ucc_base_lib_t,
diff --git a/src/components/tl/nccl/tl_nccl.h b/src/components/tl/nccl/tl_nccl.h
index 06f32c0371..b922601812 100644
--- a/src/components/tl/nccl/tl_nccl.h
+++ b/src/components/tl/nccl/tl_nccl.h
@@ -45,6 +45,15 @@
 #define NCCL_VERSION_COMM_INIT_NB 
NCCL_VERSION(2,14,3) #define NCCL_USE_NON_BLOCKING NCCL_VERSION_CODE >= NCCL_VERSION_COMM_INIT_NB +enum { + TL_NCCL_COMM_STATE_ERROR, + TL_NCCL_COMM_STATE_OOB, + TL_NCCL_COMM_STATE_INIT_TEAM, + TL_NCCL_COMM_STATE_INIT_COMM, + TL_NCCL_COMM_STATE_DESTROY_COMM, + TL_NCCL_COMM_STATE_READY, +}; + typedef struct ucc_tl_nccl_iface { ucc_tl_iface_t super; } ucc_tl_nccl_iface_t; @@ -66,6 +75,7 @@ typedef struct ucc_tl_nccl_context_config { ucc_tl_context_config_t super; ucc_tl_nccl_completion_sync_type_t sync_type; int nccl_cfg_blocking; + int nccl_lazy_init; } ucc_tl_nccl_context_config_t; typedef struct ucc_tl_nccl_lib { @@ -85,7 +95,7 @@ UCC_CLASS_DECLARE(ucc_tl_nccl_context_t, const ucc_base_context_params_t *, typedef struct ucc_tl_nccl_team { ucc_tl_team_t super; - ucc_status_t comm_state; + int comm_state; ncclUniqueId *unique_id; void *oob_req; ncclComm_t nccl_comm; @@ -146,6 +156,8 @@ static inline ucc_status_t ucc_tl_nccl_check_nb(ncclResult_t *nccl_status, // NO return UCC_OK; } +ucc_status_t ucc_tl_nccl_comm_init(ucc_tl_nccl_team_t *team); + #define NCCLCHECK_GOTO(_cmd, _label, _st, _lib, _task_st, _comm, _check_nb) \ do { \ ncclResult_t e = _cmd; \ diff --git a/src/components/tl/nccl/tl_nccl_coll.c b/src/components/tl/nccl/tl_nccl_coll.c index 8a225c268b..ee3d523b0b 100644 --- a/src/components/tl/nccl/tl_nccl_coll.c +++ b/src/components/tl/nccl/tl_nccl_coll.c @@ -131,6 +131,7 @@ ucc_status_t ucc_tl_nccl_init_task(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_tl_nccl_task_t **coll_task) { + ucc_tl_nccl_team_t *nccl_team = ucc_derived_of(team, ucc_tl_nccl_team_t); ucc_tl_nccl_context_t *nccl_ctx = ucc_derived_of(team->context, ucc_tl_nccl_context_t); ucc_tl_nccl_task_t *task; @@ -143,6 +144,13 @@ ucc_status_t ucc_tl_nccl_init_task(ucc_base_coll_args_t *coll_args, return UCC_ERR_NOT_SUPPORTED; } + if (ucc_unlikely(nccl_team->comm_state != TL_NCCL_COMM_STATE_READY)) { + status = ucc_tl_nccl_comm_init(nccl_team); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + } + task = ucc_mpool_get(&nccl_ctx->req_mp); if (ucc_unlikely(!task)) { tl_error(team->context->lib, "failed to get task from mpool"); @@ -206,7 +214,7 @@ ucc_status_t ucc_tl_nccl_coll_finalize(ucc_coll_task_t *coll_task) ucc_status_t status = UCC_OK; if (ucc_unlikely(task->super.super.status != UCC_OK)) { - team->comm_state = task->super.super.status; + team->comm_state = TL_NCCL_COMM_STATE_ERROR; } tl_debug(UCC_TASK_LIB(task), "finalizing coll task %p", task); ucc_tl_nccl_free_task(task); diff --git a/src/components/tl/nccl/tl_nccl_team.c b/src/components/tl/nccl/tl_nccl_team.c index af2aff2ac6..bf8caf7e53 100644 --- a/src/components/tl/nccl/tl_nccl_team.c +++ b/src/components/tl/nccl/tl_nccl_team.c @@ -15,14 +15,17 @@ UCC_CLASS_INIT_FUNC(ucc_tl_nccl_team_t, ucc_base_context_t *tl_context, const ucc_base_team_params_t *params) { - ucc_tl_nccl_context_t *ctx = - ucc_derived_of(tl_context, ucc_tl_nccl_context_t); + ucc_tl_nccl_context_t *ctx = ucc_derived_of(tl_context, + ucc_tl_nccl_context_t); + ucc_team_oob_coll_t *oob; ucc_status_t status; ucc_rank_t size; - UCC_CLASS_CALL_SUPER_INIT(ucc_tl_team_t, &ctx->super, params); + UCC_CLASS_CALL_SUPER_INIT(ucc_tl_team_t, &ctx->super, params); + oob = &(UCC_TL_TEAM_OOB(self)); size = UCC_TL_TEAM_SIZE(self); - self->comm_state = UCC_OK; + self->stream = NULL; + self->nccl_comm = NULL; self->unique_id = ucc_malloc(sizeof(ncclUniqueId) * (size + 1), "tl_nccl_unique_id"); if (!self->unique_id) { @@ -31,6 +34,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_nccl_team_t, 
ucc_base_context_t *tl_context, sizeof(ncclUniqueId) * (size + 1)); return UCC_ERR_NO_MEMORY; } + if (UCC_TL_TEAM_RANK(self) == 0) { ncclResult_t st; st = ncclGetUniqueId(&self->unique_id[size]); @@ -39,14 +43,16 @@ UCC_CLASS_INIT_FUNC(ucc_tl_nccl_team_t, ucc_base_context_t *tl_context, memset(&self->unique_id[size], 0, sizeof(ncclUniqueId)); } } - status = UCC_TL_TEAM_OOB(self).allgather( - &self->unique_id[size], self->unique_id, - sizeof(ncclUniqueId), UCC_TL_TEAM_OOB(self).coll_info, - &self->oob_req); + + status = oob->allgather(&self->unique_id[size], + self->unique_id, sizeof(ncclUniqueId), + oob->coll_info, &self->oob_req); if (status != UCC_OK) { tl_error(ctx->super.super.lib, "failed to start oob allgather"); goto free_unique_id; } + self->comm_state = TL_NCCL_COMM_STATE_OOB; + return UCC_OK; free_unique_id: @@ -69,15 +75,17 @@ ucc_status_t ucc_tl_nccl_team_destroy(ucc_base_team_t *tl_team) #if NCCL_USE_NON_BLOCKING ncclResult_t nccl_status, st; - if (team->nccl_comm && team->comm_state == UCC_INPROGRESS) { + if (team->comm_state == TL_NCCL_COMM_STATE_DESTROY_COMM) { goto check_finalize; } #endif + if (team->stream) { + cudaStreamDestroy(team->stream); + team->stream = NULL; + } if (team->nccl_comm) { - if (team->comm_state != UCC_OK && team->comm_state != UCC_INPROGRESS) { - /* if communication error was detected ncclCommAbort should be used - since ncclCommDestroy could block */ + if (team->comm_state == TL_NCCL_COMM_STATE_ERROR) { ncclCommAbort(team->nccl_comm); } else { #if NCCL_USE_NON_BLOCKING @@ -91,7 +99,7 @@ ucc_status_t ucc_tl_nccl_team_destroy(ucc_base_team_t *tl_team) ncclCommAbort(team->nccl_comm); return UCC_ERR_NO_MESSAGE; } else if (nccl_status == ncclInProgress) { - team->comm_state = UCC_INPROGRESS; + team->comm_state = TL_NCCL_COMM_STATE_DESTROY_COMM; return UCC_INPROGRESS; } else { ncclCommDestroy(team->nccl_comm); @@ -101,95 +109,125 @@ ucc_status_t ucc_tl_nccl_team_destroy(ucc_base_team_t *tl_team) ncclCommDestroy(team->nccl_comm); #endif } - cudaStreamDestroy(team->stream); } UCC_CLASS_DELETE_FUNC_NAME(ucc_tl_nccl_team_t)(tl_team); return UCC_OK; } -ucc_status_t ucc_tl_nccl_team_create_test(ucc_base_team_t *tl_team) +ucc_status_t ucc_tl_nccl_comm_init(ucc_tl_nccl_team_t *team) { - ucc_tl_nccl_team_t *team = ucc_derived_of(tl_team, ucc_tl_nccl_team_t); + ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_status_t status; ncclResult_t nccl_status; - ncclUniqueId errorid; - #if NCCL_USE_NON_BLOCKING ncclConfig_t nccl_cfg = NCCL_CONFIG_INITIALIZER; - ncclResult_t st; - - if (team->comm_state == UCC_INPROGRESS) { - goto ncclInitStage; - } + ncclResult_t async_status; #endif - status = UCC_TL_TEAM_OOB(team).req_test(team->oob_req); - if (status == UCC_INPROGRESS) { - return UCC_INPROGRESS; - } - if (status != UCC_OK) { - UCC_TL_TEAM_OOB(team).req_free(team->oob_req); - tl_error(tl_team->context->lib, "oob req test failed"); - goto free_unique_id; - } - status = UCC_TL_TEAM_OOB(team).req_free(team->oob_req); - if (status != UCC_OK) { - tl_error(tl_team->context->lib, "oob req free failed"); - goto free_unique_id; - } - /* check unique id is valid */ - memset(&errorid, 0, sizeof(errorid)); - if (!memcmp(&errorid, team->unique_id, sizeof(errorid))) { - tl_error(tl_team->context->lib, "incorrect unique id"); - goto free_unique_id; + if (team->comm_state == TL_NCCL_COMM_STATE_READY) { + return UCC_OK; + } else if (team->comm_state == TL_NCCL_COMM_STATE_ERROR) { + return UCC_ERR_NOT_SUPPORTED; + } else if (team->comm_state == 
TL_NCCL_COMM_STATE_INIT_COMM) { +#if NCCL_USE_NON_BLOCKING + goto nccl_async_init; +#else + ucc_assert_always(0); +#endif } CUDA_CHECK_GOTO(cudaStreamCreateWithFlags(&team->stream, - cudaStreamNonBlocking), free_unique_id, status); + cudaStreamNonBlocking), + exit_err, status); #if NCCL_USE_NON_BLOCKING - nccl_cfg.blocking = UCC_TL_NCCL_TEAM_CTX(team)->cfg.nccl_cfg_blocking; - nccl_status = ncclCommInitRankConfig(&team->nccl_comm, - UCC_TL_TEAM_SIZE(team), - team->unique_id[0], - UCC_TL_TEAM_RANK(team), - &nccl_cfg); - if (nccl_status != ncclInProgress && nccl_status != ncclSuccess) { - goto free_stream; + /* + * if the NCCL communicator is initialized during the first collective init, + * a.k.a. lazy init, we need to use blocking init so that a failure can + * correctly fall back to another TL + */ + nccl_cfg.blocking = (UCC_TL_NCCL_TEAM_CTX(team)->cfg.nccl_cfg_blocking || + UCC_TL_NCCL_TEAM_CTX(team)->cfg.nccl_lazy_init) ? 1 : 0; + + nccl_status = ncclCommInitRankConfig(&team->nccl_comm, tsize, + team->unique_id[0], trank, &nccl_cfg); + if ((nccl_status != ncclInProgress) && (nccl_status != ncclSuccess)) { + goto nccl_comm_init_err; } -ncclInitStage: - st = ncclCommGetAsyncError(team->nccl_comm, &nccl_status); - if (st != ncclSuccess) { - nccl_status = st; +nccl_async_init: + nccl_status = ncclCommGetAsyncError(team->nccl_comm, &async_status); + if (nccl_status != ncclSuccess) { + goto nccl_comm_init_err; } - if (nccl_status == ncclInProgress){ - team->comm_state = UCC_INPROGRESS; - return UCC_INPROGRESS; + if (async_status == ncclInProgress) { + team->comm_state = TL_NCCL_COMM_STATE_INIT_COMM; + return UCC_INPROGRESS; } #else - nccl_status = ncclCommInitRank(&team->nccl_comm, UCC_TL_TEAM_SIZE(team), - team->unique_id[0], UCC_TL_TEAM_RANK(team)); -#endif + nccl_status = ncclCommInitRank(&team->nccl_comm, tsize, team->unique_id[0], + trank); if (nccl_status != ncclSuccess) { - goto free_stream; + goto nccl_comm_init_err; } - ucc_free(team->unique_id); - tl_debug(tl_team->context->lib, "initialized tl team: %p", team); +#endif + + team->comm_state = TL_NCCL_COMM_STATE_READY; return UCC_OK; -free_stream: - tl_debug(tl_team->context->lib, "NCCL error %d %s", nccl_status, - ncclGetErrorString(nccl_status)); - status = UCC_ERR_NO_MESSAGE; -#if NCCL_USE_NON_BLOCKING - ncclCommAbort(team->nccl_comm); -#endif - cudaStreamDestroy(team->stream); -free_unique_id: - ucc_free(team->unique_id); +nccl_comm_init_err: + tl_debug(team->super.super.context->lib, "NCCL error %d %s", + nccl_status, ncclGetErrorString(nccl_status)); + if (nccl_status == ncclInvalidUsage) { + /* + * handles the case when trying to initialize multiple ranks + * on the same GPU.
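In practice this is hit when more ranks than GPUs land on a node and + * several processes select the same CUDA device; NCCL is assumed to + * reject such an init with ncclInvalidUsage.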
Return "not supported" and fall back to another TL + */ + status = UCC_ERR_NOT_SUPPORTED; + } else { + status = UCC_ERR_NO_RESOURCE; + } + team->comm_state = TL_NCCL_COMM_STATE_ERROR; + +exit_err: return status; } +ucc_status_t ucc_tl_nccl_team_create_test(ucc_base_team_t *tl_team) +{ + ucc_tl_nccl_team_t *team = ucc_derived_of(tl_team, ucc_tl_nccl_team_t); + ucc_team_oob_coll_t *oob = &(UCC_TL_TEAM_OOB(team)); + ncclUniqueId errorid; + ucc_status_t status; + + if (team->comm_state == TL_NCCL_COMM_STATE_OOB) { + status = oob->req_test(team->oob_req); + if (status == UCC_INPROGRESS) { + return UCC_INPROGRESS; + } + + oob->req_free(team->oob_req); + if (status != UCC_OK) { + tl_error(tl_team->context->lib, "oob req test failed"); + return status; + } + + /* check unique id is valid */ + memset(&errorid, 0, sizeof(errorid)); + if (!memcmp(&errorid, team->unique_id, sizeof(errorid))) { + tl_error(tl_team->context->lib, "incorrect unique id"); + return UCC_ERR_NO_MESSAGE; + } + + team->comm_state = TL_NCCL_COMM_STATE_INIT_TEAM; + } + + if (UCC_TL_NCCL_TEAM_CTX(team)->cfg.nccl_lazy_init) { + return UCC_OK; + } + + return ucc_tl_nccl_comm_init(team); +} + ucc_status_t ucc_tl_nccl_coll_init(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task_h) diff --git a/src/components/tl/sharp/tl_sharp.h b/src/components/tl/sharp/tl_sharp.h index cc44e9e1f4..adfbc86036 100644 --- a/src/components/tl/sharp/tl_sharp.h +++ b/src/components/tl/sharp/tl_sharp.h @@ -108,6 +108,10 @@ typedef struct ucc_tl_sharp_task { ucc_tl_sharp_reg_t *s_mem_h; ucc_tl_sharp_reg_t *r_mem_h; } allreduce; + struct { + ucc_tl_sharp_reg_t *s_mem_h; + ucc_tl_sharp_reg_t *r_mem_h; + } reduce_scatter; struct { ucc_tl_sharp_reg_t *mem_h; } bcast; @@ -131,9 +135,16 @@ ucc_status_t sharp_status_to_ucc_status(int status); (ucc_derived_of((_task)->super.team->context->lib, ucc_tl_sharp_lib_t)) #define TASK_ARGS(_task) (_task)->super.bargs.args -#define UCC_TL_SHARP_SUPPORTED_COLLS \ +#define UCC_TL_BASIC_SHARP_SUPPORTED_COLLS \ (UCC_COLL_TYPE_ALLREDUCE | UCC_COLL_TYPE_BARRIER | UCC_COLL_TYPE_BCAST) +#if HAVE_DECL_SHARP_COLL_DO_REDUCE_SCATTER +#define UCC_TL_SHARP_SUPPORTED_COLLS \ + (UCC_TL_BASIC_SHARP_SUPPORTED_COLLS | UCC_COLL_TYPE_REDUCE_SCATTER) +#else +#define UCC_TL_SHARP_SUPPORTED_COLLS (UCC_TL_BASIC_SHARP_SUPPORTED_COLLS) +#endif + UCC_CLASS_DECLARE(ucc_tl_sharp_team_t, ucc_base_context_t *, const ucc_base_team_params_t *); diff --git a/src/components/tl/sharp/tl_sharp_coll.c b/src/components/tl/sharp/tl_sharp_coll.c index d246fcc563..5884e18918 100644 --- a/src/components/tl/sharp/tl_sharp_coll.c +++ b/src/components/tl/sharp/tl_sharp_coll.c @@ -26,9 +26,9 @@ enum sharp_datatype ucc_to_sharp_dtype[] = { [UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT64)] = SHARP_DTYPE_DOUBLE, [UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT128)] = SHARP_DTYPE_NULL, #if SHARP_API > SHARP_VERSION(3, 0) - [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = SHARP_DTYPE_UNKNOWN, - [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = SHARP_DTYPE_UNKNOWN, - [UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_UNKNOWN, + [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN, + [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN, + [UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN, #else [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = SHARP_DTYPE_NULL, [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = SHARP_DTYPE_NULL, @@ -308,6 +308,100 @@ ucc_status_t ucc_tl_sharp_bcast_start(ucc_coll_task_t *coll_task) return
ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); } +#if HAVE_DECL_SHARP_COLL_DO_REDUCE_SCATTER +ucc_status_t ucc_tl_sharp_reduce_scatter_start(ucc_coll_task_t *coll_task) +{ + ucc_tl_sharp_task_t *task = ucc_derived_of(coll_task, ucc_tl_sharp_task_t); + ucc_tl_sharp_team_t *team = TASK_TEAM(task); + ucc_coll_args_t *args = &TASK_ARGS(task); + size_t count = args->dst.info.count; + ucc_datatype_t dt = args->dst.info.datatype; + struct sharp_coll_reduce_spec reduce_spec; + enum sharp_datatype sharp_type; + enum sharp_reduce_op op_type; + size_t src_data_size, dst_data_size; + int ret; + + UCC_TL_SHARP_PROFILE_REQUEST_EVENT(coll_task, "sharp_reduce_scatter_start", + 0); + + sharp_type = ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(dt)]; + op_type = ucc_to_sharp_reduce_op[args->op]; + src_data_size = ucc_dt_size(dt) * count * UCC_TL_TEAM_SIZE(team); + dst_data_size = ucc_dt_size(dt) * count; + + if (!UCC_IS_INPLACE(*args)) { + ucc_tl_sharp_mem_register(TASK_CTX(task), team, args->src.info.buffer, + src_data_size, &task->reduce_scatter.s_mem_h); + } + ucc_tl_sharp_mem_register(TASK_CTX(task), team, args->dst.info.buffer, + dst_data_size, &task->reduce_scatter.r_mem_h); + + if (!UCC_IS_INPLACE(*args)) { + reduce_spec.sbuf_desc.buffer.ptr = args->src.info.buffer; + reduce_spec.sbuf_desc.buffer.mem_handle = + task->reduce_scatter.s_mem_h->mr; + reduce_spec.sbuf_desc.mem_type = + ucc_to_sharp_memtype[args->src.info.mem_type]; + } else { + reduce_spec.sbuf_desc.buffer.ptr = args->dst.info.buffer; + reduce_spec.sbuf_desc.buffer.mem_handle = + task->reduce_scatter.r_mem_h->mr; + reduce_spec.sbuf_desc.mem_type = + ucc_to_sharp_memtype[args->dst.info.mem_type]; + } + + reduce_spec.sbuf_desc.buffer.length = src_data_size; + reduce_spec.sbuf_desc.type = SHARP_DATA_BUFFER; + reduce_spec.rbuf_desc.buffer.ptr = args->dst.info.buffer; + reduce_spec.rbuf_desc.buffer.length = dst_data_size; + reduce_spec.rbuf_desc.buffer.mem_handle = task->reduce_scatter.r_mem_h->mr; + reduce_spec.rbuf_desc.type = SHARP_DATA_BUFFER; + reduce_spec.rbuf_desc.mem_type = + ucc_to_sharp_memtype[args->dst.info.mem_type]; + reduce_spec.aggr_mode = SHARP_AGGREGATION_NONE; + reduce_spec.length = count; + reduce_spec.dtype = sharp_type; + reduce_spec.op = op_type; + reduce_spec.offset = 0; + + ret = sharp_coll_do_reduce_scatter_nb(team->sharp_comm, &reduce_spec, + &task->req_handle); + if (ret != SHARP_COLL_SUCCESS) { + tl_error(UCC_TASK_LIB(task), + "sharp_coll_do_reduce_scatter_nb failed:%s", + sharp_coll_strerror(ret)); + coll_task->status = sharp_status_to_ucc_status(ret); + return ucc_task_complete(coll_task); + } + coll_task->status = UCC_INPROGRESS; + + return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); +} + +ucc_status_t ucc_tl_sharp_reduce_scatter_init(ucc_tl_sharp_task_t *task) +{ + ucc_coll_args_t *args = &TASK_ARGS(task); + + if (!ucc_coll_args_is_predefined_dt(args, UCC_RANK_INVALID)) { + return UCC_ERR_NOT_SUPPORTED; + } + + if ((!UCC_IS_INPLACE(*args) && + ucc_to_sharp_memtype[args->src.info.mem_type] == SHARP_MEM_TYPE_LAST) || + ucc_to_sharp_memtype[args->dst.info.mem_type] == SHARP_MEM_TYPE_LAST || + ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(args->dst.info.datatype)] == + SHARP_DTYPE_NULL || + ucc_to_sharp_reduce_op[args->op] == SHARP_OP_NULL) { + return UCC_ERR_NOT_SUPPORTED; + } + + task->super.post = ucc_tl_sharp_reduce_scatter_start; + task->super.progress = ucc_tl_sharp_collective_progress; + return UCC_OK; +} +#endif + ucc_status_t
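/* a note on the spec above, sketching the intended semantics rather than restating the SHARP docs: sbuf_desc covers the whole count * team_size source buffer, rbuf_desc the per-rank slice, and reduce_spec.length is the per-rank element count, so with offset 0 the switch-side reduction presumably scatters one contiguous block to each rank */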
ucc_tl_sharp_allreduce_init(ucc_tl_sharp_task_t *task) { ucc_coll_args_t *args = &TASK_ARGS(task); diff --git a/src/components/tl/sharp/tl_sharp_coll.h b/src/components/tl/sharp/tl_sharp_coll.h index 4b0dba17b6..6557dc56e8 100644 --- a/src/components/tl/sharp/tl_sharp_coll.h +++ b/src/components/tl/sharp/tl_sharp_coll.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -10,7 +10,7 @@ #include "tl_sharp.h" /* need to query for datatype support at runtime */ -#define SHARP_DTYPE_UNKNOWN -1 +#define SHARP_DTYPE_UNKNOWN 0xFFFF extern enum sharp_datatype ucc_to_sharp_dtype[]; @@ -20,4 +20,7 @@ ucc_status_t ucc_tl_sharp_barrier_init(ucc_tl_sharp_task_t *task); ucc_status_t ucc_tl_sharp_bcast_init(ucc_tl_sharp_task_t *task); +#if HAVE_DECL_SHARP_COLL_DO_REDUCE_SCATTER +ucc_status_t ucc_tl_sharp_reduce_scatter_init(ucc_tl_sharp_task_t *task); +#endif #endif diff --git a/src/components/tl/sharp/tl_sharp_context.c b/src/components/tl/sharp/tl_sharp_context.c index 6e0477680a..72461066b3 100644 --- a/src/components/tl/sharp/tl_sharp_context.c +++ b/src/components/tl/sharp/tl_sharp_context.c @@ -269,12 +269,10 @@ ucc_status_t ucc_tl_sharp_rcache_create(struct sharp_coll_context *context, { ucc_rcache_params_t rcache_params; - rcache_params.alignment = 64; rcache_params.ucm_event_priority = 1000; rcache_params.max_regions = ULONG_MAX; rcache_params.max_size = SIZE_MAX; rcache_params.region_struct_size = sizeof(ucc_tl_sharp_rcache_region_t); - rcache_params.max_alignment = ucc_get_page_size(); rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED | UCM_EVENT_MEM_TYPE_FREE; rcache_params.context = context; @@ -436,7 +434,7 @@ ucc_status_t ucc_tl_sharp_context_create_epilog(ucc_base_context_t *context) if (lib->cfg.use_internal_oob) { sharp_ctx->oob_ctx.subset = set; } else { - sharp_ctx->oob_ctx.oob = &UCC_TL_CTX_OOB(sharp_ctx); + sharp_ctx->oob_ctx.oob = &UCC_TL_CTX_OOB(sharp_ctx); } status = ucc_topo_init(set, core_ctx->topo, &topo); diff --git a/src/components/tl/sharp/tl_sharp_team.c b/src/components/tl/sharp/tl_sharp_team.c index fe4a5875fb..a8bd380936 100644 --- a/src/components/tl/sharp/tl_sharp_team.c +++ b/src/components/tl/sharp/tl_sharp_team.c @@ -117,7 +117,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context, if (sharp_caps.support_mask.dtypes & UCC_BIT(SHARP_DTYPE_BFLOAT16)) { tl_debug(ctx->super.super.lib, "enabling support for UCC_DT_BFLOAT16"); - ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = UCC_DT_BFLOAT16; + ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_BFLOAT16; } else { tl_debug(ctx->super.super.lib, "disabling support for UCC_DT_BFLOAT16"); ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_NULL; @@ -234,6 +234,11 @@ ucc_status_t ucc_tl_sharp_coll_init(ucc_base_coll_args_t *coll_args, case UCC_COLL_TYPE_BCAST: status = ucc_tl_sharp_bcast_init(task); break; +#if HAVE_DECL_SHARP_COLL_DO_REDUCE_SCATTER + case UCC_COLL_TYPE_REDUCE_SCATTER: + status = ucc_tl_sharp_reduce_scatter_init(task); + break; +#endif default: tl_debug(UCC_TASK_LIB(task), "collective %d is not supported by sharp tl", diff --git a/src/components/tl/ucc_tl.c b/src/components/tl/ucc_tl.c index dcbb2b6d71..3134c9fd14 100644 --- a/src/components/tl/ucc_tl.c +++ b/src/components/tl/ucc_tl.c @@ -242,6 +242,11 @@ ucc_status_t 
ucc_tl_team_create_multiple(ucc_team_multiple_req_t *req) } req->descs[*id].status = UCC_TL_CTX_IFACE(req->descs[*id].ctx) ->team.create_test(&req->descs[*id].team->super); + if (req->descs[*id].status < 0) { + /* if team creation failed in create_test, the team resources need to be cleaned up */ + UCC_TL_CTX_IFACE(req->descs[*id].ctx)->team.destroy( + &req->descs[*id].team->super); + } return UCC_INPROGRESS; } diff --git a/src/components/tl/ucc_tl.h b/src/components/tl/ucc_tl.h index 53e62052dc..75a5e3e1a0 100644 --- a/src/components/tl/ucc_tl.h +++ b/src/components/tl/ucc_tl.h @@ -138,8 +138,18 @@ typedef struct ucc_tl_lib_attr { #define UCC_TL_TEAM_IFACE(_tl_team) \ (ucc_derived_of((_tl_team)->super.context->lib, ucc_tl_lib_t))->iface +/** + * Get TL team lib + * @param [in] _tl_team pointer to TL team object + * @return pointer to TL lib object + */ #define UCC_TL_TEAM_LIB(_tl_team) (_tl_team)->super.super.context->lib +/** + * Get TL team context + * @param [in] _tl_team pointer to TL team object + * @return pointer to TL context object + */ #define UCC_TL_TEAM_CTX(_tl_team) (_tl_team)->super.super.context #define UCC_TL_CORE_CTX(_tl_team) ((_tl_team)->super.super.context->ucc_context) diff --git a/src/components/tl/ucp/Makefile.am b/src/components/tl/ucp/Makefile.am index 4d684adfb5..30d00633da 100644 --- a/src/components/tl/ucp/Makefile.am +++ b/src/components/tl/ucp/Makefile.am @@ -32,7 +32,8 @@ alltoallv = \ alltoallv/alltoallv.h \ alltoallv/alltoallv.c \ alltoallv/alltoallv_pairwise.c \ - alltoallv/alltoallv_hybrid.c + alltoallv/alltoallv_hybrid.c \ + alltoallv/alltoallv_onesided.c allreduce = \ allreduce/allreduce.h \ @@ -40,6 +41,7 @@ allreduce = \ allreduce/allreduce_knomial.c \ allreduce/allreduce_sliding_window.c \ allreduce/allreduce_sliding_window_setup.c \ + allreduce/allreduce_dbt.c \ allreduce/allreduce_sra_knomial.c barrier = \ @@ -51,7 +53,8 @@ bcast = \ bcast/bcast.h \ bcast/bcast.c \ bcast/bcast_knomial.c \ - bcast/bcast_sag_knomial.c + bcast/bcast_sag_knomial.c \ + bcast/bcast_dbt.c fanin = \ fanin/fanin.h \ @@ -74,7 +77,8 @@ gatherv = \ reduce = \ reduce/reduce.h \ reduce/reduce.c \ - reduce/reduce_knomial.c + reduce/reduce_knomial.c \ + reduce/reduce_dbt.c reduce_scatter = \ reduce_scatter/reduce_scatter.h \ diff --git a/src/components/tl/ucp/allgather/allgather.c b/src/components/tl/ucp/allgather/allgather.c index 90b06e99ee..926b732e55 100644 --- a/src/components/tl/ucp/allgather/allgather.c +++ b/src/components/tl/ucp/allgather/allgather.c @@ -38,7 +38,14 @@ char *ucc_tl_ucp_allgather_score_str_get(ucc_tl_ucp_team_t *team) ?
UCC_TL_UCP_ALLGATHER_ALG_RING : UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR; char *str = ucc_malloc(max_size * sizeof(char)); + ucc_sbgp_t *sbgp; + if (team->cfg.use_reordering) { + sbgp = ucc_topo_get_sbgp(team->topo, UCC_SBGP_FULL_HOST_ORDERED); + if (!ucc_ep_map_is_identity(&sbgp->map)) { + algo_num = UCC_TL_UCP_ALLGATHER_ALG_RING; + } + } ucc_snprintf_safe(str, max_size, UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR, algo_num); return str; diff --git a/src/components/tl/ucp/allgather/allgather_neighbor.c b/src/components/tl/ucp/allgather/allgather_neighbor.c index 771ba2d3b8..534c197e4e 100644 --- a/src/components/tl/ucp/allgather/allgather_neighbor.c +++ b/src/components/tl/ucp/allgather/allgather_neighbor.c @@ -15,7 +15,9 @@ static ucc_rank_t get_recv_from_rank(ucc_rank_t rank, ucc_rank_t size, int i) { const int i_parity = i % 2; - ucc_rank_t offset_at_step[2], recv_data_from; + int offset_at_step[2]; + ucc_rank_t recv_data_from; + if (rank % 2) { recv_data_from = (rank - 1 + size) % size; offset_at_step[0] = (-2); diff --git a/src/components/tl/ucp/allgather/allgather_ring.c b/src/components/tl/ucp/allgather/allgather_ring.c index 93d7b95fc4..07178aea25 100644 --- a/src/components/tl/ucp/allgather/allgather_ring.c +++ b/src/components/tl/ucp/allgather/allgather_ring.c @@ -108,7 +108,7 @@ ucc_status_t ucc_tl_ucp_allgather_ring_start(ucc_coll_task_t *coll_task) ucc_status_t ucc_tl_ucp_allgather_ring_init_common(ucc_tl_ucp_task_t *task) { - ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); ucc_sbgp_t *sbgp; if (!ucc_coll_args_is_predefined_dt(&TASK_ARGS(task), UCC_RANK_INVALID)) { diff --git a/src/components/tl/ucp/allreduce/allreduce.c b/src/components/tl/ucp/allreduce/allreduce.c index 1b01cb5455..1149d382fe 100644 --- a/src/components/tl/ucp/allreduce/allreduce.c +++ b/src/components/tl/ucp/allreduce/allreduce.c @@ -24,6 +24,11 @@ ucc_base_coll_alg_info_t {.id = UCC_TL_UCP_ALLREDUCE_ALG_SLIDING_WINDOW, .name = "sliding_window", .desc = "sliding window allreduce (optimized for running on DPU)"}, + [UCC_TL_UCP_ALLREDUCE_ALG_DBT] = + {.id = UCC_TL_UCP_ALLREDUCE_ALG_DBT, + .name = "dbt", + .desc = "allreduce over double binary tree where a leaf in one tree " + "is an intermediate node in the other (optimized for BW)"}, [UCC_TL_UCP_ALLREDUCE_ALG_LAST] = { .id = 0, .name = NULL, .desc = NULL}}; diff --git a/src/components/tl/ucp/allreduce/allreduce.h b/src/components/tl/ucp/allreduce/allreduce.h index 5e545b6135..3ec7b3f94c 100644 --- a/src/components/tl/ucp/allreduce/allreduce.h +++ b/src/components/tl/ucp/allreduce/allreduce.h @@ -12,6 +12,7 @@ enum { UCC_TL_UCP_ALLREDUCE_ALG_KNOMIAL, UCC_TL_UCP_ALLREDUCE_ALG_SRA_KNOMIAL, UCC_TL_UCP_ALLREDUCE_ALG_SLIDING_WINDOW, + UCC_TL_UCP_ALLREDUCE_ALG_DBT, UCC_TL_UCP_ALLREDUCE_ALG_LAST }; @@ -102,8 +103,8 @@ typedef struct ucc_tl_ucp_allreduce_sw_host_allgather { } ucc_tl_ucp_allreduce_sw_host_allgather; ucc_status_t ucc_tl_ucp_allreduce_knomial_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * team, - ucc_coll_task_t ** task_h); + ucc_base_team_t *team, + ucc_coll_task_t **task_h); ucc_status_t ucc_tl_ucp_allreduce_sliding_window_init(ucc_base_coll_args_t *coll_args, @@ -142,15 +143,22 @@ ucc_tl_ucp_allreduce_sliding_window_finalize(ucc_coll_task_t *task); ucc_status_t ucc_tl_ucp_allreduce_knomial_finalize(ucc_coll_task_t *task); -ucc_status_t -ucc_tl_ucp_allreduce_sra_knomial_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * team, - ucc_coll_task_t ** task_h); +ucc_status_t
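/* dbt composes the two collectives declared further below: a reduce over double binary trees to rank 0 followed by a bcast from rank 0 over the same trees (see allreduce_dbt.c) */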
ucc_tl_ucp_allreduce_sra_knomial_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); ucc_status_t ucc_tl_ucp_allreduce_sra_knomial_start(ucc_coll_task_t *task); ucc_status_t ucc_tl_ucp_allreduce_sra_knomial_progress(ucc_coll_task_t *task); +ucc_status_t ucc_tl_ucp_allreduce_dbt_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); + +ucc_status_t ucc_tl_ucp_allreduce_dbt_start(ucc_coll_task_t *task); + +ucc_status_t ucc_tl_ucp_allreduce_dbt_progress(ucc_coll_task_t *task); + static inline int ucc_tl_ucp_allreduce_alg_from_str(const char *str) { int i; diff --git a/src/components/tl/ucp/allreduce/allreduce_dbt.c b/src/components/tl/ucp/allreduce/allreduce_dbt.c new file mode 100644 index 0000000000..709f4e5f43 --- /dev/null +++ b/src/components/tl/ucp/allreduce/allreduce_dbt.c @@ -0,0 +1,94 @@ +/** + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. + */ + +#include "config.h" +#include "tl_ucp.h" +#include "allreduce.h" +#include "../reduce/reduce.h" +#include "../bcast/bcast.h" + +ucc_status_t ucc_tl_ucp_allreduce_dbt_start(ucc_coll_task_t *coll_task) +{ + ucc_schedule_t *schedule = ucc_derived_of(coll_task, ucc_schedule_t); + ucc_coll_args_t *args = &schedule->super.bargs.args; + ucc_coll_task_t *reduce_task, *bcast_task; + + reduce_task = schedule->tasks[0]; + reduce_task->bargs.args.src.info.buffer = args->src.info.buffer; + reduce_task->bargs.args.dst.info.buffer = args->dst.info.buffer; + reduce_task->bargs.args.src.info.count = args->src.info.count; + reduce_task->bargs.args.dst.info.count = args->dst.info.count; + + bcast_task = schedule->tasks[1]; + bcast_task->bargs.args.src.info.buffer = args->dst.info.buffer; + bcast_task->bargs.args.src.info.count = args->dst.info.count; + + UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allreduce_dbt_start", 0); + return ucc_schedule_start(coll_task); +} + +ucc_status_t ucc_tl_ucp_allreduce_dbt_finalize(ucc_coll_task_t *coll_task) +{ + ucc_schedule_t *schedule = ucc_derived_of(coll_task, ucc_schedule_t); + ucc_status_t status; + + UCC_TL_UCP_PROFILE_REQUEST_EVENT(schedule, "ucp_allreduce_dbt_done", 0); + status = ucc_schedule_finalize(coll_task); + ucc_tl_ucp_put_schedule(schedule); + return status; +} + +ucc_status_t ucc_tl_ucp_allreduce_dbt_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); + ucc_base_coll_args_t args = *coll_args; + ucc_schedule_t *schedule; + ucc_coll_task_t *reduce_task, *bcast_task; + ucc_status_t status; + + if (UCC_IS_INPLACE(args.args)) { + return UCC_ERR_NOT_SUPPORTED; + } + + status = ucc_tl_ucp_get_schedule(tl_team, coll_args, + (ucc_tl_ucp_schedule_t **)&schedule); + if (ucc_unlikely(UCC_OK != status)) { + return status; + } + + args.args.root = 0; + UCC_CHECK_GOTO(ucc_tl_ucp_reduce_dbt_init(&args, team, &reduce_task), + out, status); + UCC_CHECK_GOTO(ucc_schedule_add_task(schedule, reduce_task), + out, status); + UCC_CHECK_GOTO(ucc_event_manager_subscribe(&schedule->super, + UCC_EVENT_SCHEDULE_STARTED, + reduce_task, + ucc_task_start_handler), + out, status); + + UCC_CHECK_GOTO(ucc_tl_ucp_bcast_dbt_init(&args, team, &bcast_task), + out, status); + UCC_CHECK_GOTO(ucc_schedule_add_task(schedule, bcast_task), + out, status); + UCC_CHECK_GOTO(ucc_event_manager_subscribe(reduce_task, UCC_EVENT_COMPLETED, + bcast_task, + ucc_task_start_handler), + out, status); + + 
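/* the schedule now encodes allreduce = reduce(root 0) followed by + * bcast(root 0): the reduce task is started together with the schedule, + * while the bcast task is chained on the reduce task's completion event */ +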
schedule->super.post = ucc_tl_ucp_allreduce_dbt_start; + schedule->super.progress = NULL; + schedule->super.finalize = ucc_tl_ucp_allreduce_dbt_finalize; + *task_h = &schedule->super; + + return UCC_OK; + +out: + ucc_tl_ucp_put_schedule(schedule); + return status; +} diff --git a/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c b/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c index d51ee23802..d24eca786c 100644 --- a/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c +++ b/src/components/tl/ucp/allreduce/allreduce_sra_knomial.c @@ -11,6 +11,7 @@ #include "coll_patterns/sra_knomial.h" #include "utils/ucc_math.h" #include "utils/ucc_coll_utils.h" +#include "components/mc/ucc_mc.h" #include "../reduce_scatter/reduce_scatter.h" #include "../allgather/allgather.h" @@ -53,41 +54,40 @@ ucc_tl_ucp_allreduce_sra_knomial_frag_finalize(ucc_coll_task_t *task) return status; } -static ucc_status_t ucc_tl_ucp_allreduce_sra_knomial_frag_setup( - ucc_schedule_pipelined_t *schedule_p, ucc_schedule_t *frag, int frag_num) +static ucc_status_t +ucc_tl_ucp_allreduce_sra_knomial_frag_setup(ucc_schedule_pipelined_t *schedule_p, + ucc_schedule_t *frag, int frag_num) { - ucc_coll_args_t *args = &schedule_p->super.super.bargs.args; - ucc_datatype_t dt = args->dst.info.datatype; - size_t dt_size = ucc_dt_size(dt); - ucc_coll_args_t *targs; + ucc_coll_args_t *args = &schedule_p->super.super.bargs.args; + ucc_datatype_t dt = args->dst.info.datatype; + size_t dt_size = ucc_dt_size(dt); int n_frags = schedule_p->super.n_tasks; size_t frag_count = ucc_buffer_block_count(args->dst.info.count, n_frags, frag_num); size_t offset = ucc_buffer_block_offset(args->dst.info.count, n_frags, frag_num); + ucc_coll_args_t *targs; - targs = &frag->tasks[0]->bargs.args; //REDUCE_SCATTER - targs->src.info.buffer = - PTR_OFFSET(args->src.info.buffer, offset * dt_size); - targs->dst.info.buffer = - PTR_OFFSET(args->dst.info.buffer, offset * dt_size); - targs->src.info.count = frag_count; - targs->dst.info.count = frag_count; + targs = &frag->tasks[0]->bargs.args; /* REDUCE_SCATTER */ + targs->src.info.buffer = PTR_OFFSET(args->src.info.buffer, offset * dt_size); + targs->src.info.count = frag_count; + targs->dst.info.buffer = PTR_OFFSET(args->dst.info.buffer, offset * dt_size); + targs->dst.info.count = frag_count; - targs = &frag->tasks[1]->bargs.args; //ALLGATHER + targs = &frag->tasks[1]->bargs.args; /* ALLGATHER */ targs->src.info.buffer = NULL; - targs->dst.info.buffer = - PTR_OFFSET(args->dst.info.buffer, offset * dt_size); - targs->src.info.count = 0; - targs->dst.info.count = frag_count; + targs->src.info.count = 0; + targs->dst.info.buffer = PTR_OFFSET(args->dst.info.buffer, offset * dt_size); + targs->dst.info.count = frag_count; return UCC_OK; } -static ucc_status_t ucc_tl_ucp_allreduce_sra_knomial_frag_init( - ucc_base_coll_args_t *coll_args, - ucc_schedule_pipelined_t *sp, //NOLINT - ucc_base_team_t *team, ucc_schedule_t **frag_p) +static ucc_status_t +ucc_tl_ucp_allreduce_sra_knomial_frag_init(ucc_base_coll_args_t *coll_args, + ucc_schedule_pipelined_t *sp, //NOLINT + ucc_base_team_t *team, + ucc_schedule_t **frag_p) { ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); ucc_datatype_t dtype = coll_args->args.dst.info.datatype; @@ -166,55 +166,84 @@ ucc_status_t ucc_tl_ucp_allreduce_sra_knomial_start(ucc_coll_task_t *task) return ucc_schedule_pipelined_post(task); } -ucc_status_t -ucc_tl_ucp_allreduce_sra_knomial_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t *team, - 
ucc_coll_task_t **task_h) +static void +ucc_tl_ucp_allreduce_sra_knomial_get_pipeline_params(ucc_tl_ucp_team_t *team, + ucc_coll_args_t *args, + ucc_pipeline_params_t *pp) { - ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); - ucc_tl_ucp_lib_config_t *cfg = &tl_team->cfg; - int n_frags, pipeline_depth; - ucc_schedule_pipelined_t *schedule_p; - ucc_status_t status; - ucc_base_coll_args_t bargs; - size_t max_frag_count, dt_size; + ucc_tl_ucp_lib_config_t *cfg = &team->cfg; - dt_size = ucc_dt_size(coll_args->args.dst.info.datatype); - status = ucc_tl_ucp_get_schedule(tl_team, coll_args, - (ucc_tl_ucp_schedule_t **)&schedule_p); - if (ucc_unlikely(UCC_OK != status)) { - return status; + if (!ucc_pipeline_params_is_auto(&cfg->allreduce_sra_kn_pipeline)) { + *pp = cfg->allreduce_sra_kn_pipeline; + return; } - bargs = *coll_args; - if (bargs.mask & UCC_BASE_CARGS_MAX_FRAG_COUNT) { - max_frag_count = bargs.max_frag_count; + if ((args->src.info.mem_type == UCC_MEMORY_TYPE_CUDA) && + (UCC_IS_INPLACE(*args))) { + ucc_mc_attr_t mc_attr; + mc_attr.field_mask = UCC_MC_ATTR_FIELD_FAST_ALLOC_SIZE; + ucc_mc_get_attr(&mc_attr, UCC_MEMORY_TYPE_CUDA); + pp->threshold = mc_attr.fast_alloc_size; + pp->n_frags = 2; + pp->frag_size = mc_attr.fast_alloc_size; + pp->order = UCC_PIPELINE_PARALLEL; + pp->pdepth = 2; } else { - max_frag_count = coll_args->args.dst.info.count; + pp->threshold = SIZE_MAX; + pp->n_frags = 0; + pp->frag_size = 0; + pp->pdepth = 1; + pp->order = UCC_PIPELINE_PARALLEL; + } +} - ucc_pipeline_nfrags_pdepth(&cfg->allreduce_sra_kn_pipeline, - max_frag_count * dt_size, &n_frags, - &pipeline_depth); +ucc_status_t +ucc_tl_ucp_allreduce_sra_knomial_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); + ucc_coll_args_t *args = &coll_args->args; + size_t dt_size = ucc_dt_size(args->dst.info.datatype); + int n_frags, pipeline_depth; + ucc_schedule_pipelined_t *schedule_p; + ucc_status_t st; + ucc_base_coll_args_t bargs; + size_t max_frag_count; + ucc_pipeline_params_t pipeline_params; + + st = ucc_tl_ucp_get_schedule(tl_team, coll_args, + (ucc_tl_ucp_schedule_t **)&schedule_p); + if (ucc_unlikely(UCC_OK != st)) { + return st; + } + bargs = *coll_args; + max_frag_count = (bargs.mask & UCC_BASE_CARGS_MAX_FRAG_COUNT) ? 
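/* an upper layer (e.g. a pipelined CL schedule) may cap the fragment count via UCC_BASE_CARGS_MAX_FRAG_COUNT; otherwise fragment over the full destination count */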
+ bargs.max_frag_count: args->dst.info.count; + ucc_tl_ucp_allreduce_sra_knomial_get_pipeline_params(tl_team, args, + &pipeline_params); + ucc_pipeline_nfrags_pdepth(&pipeline_params, max_frag_count * dt_size, + &n_frags, &pipeline_depth); if (n_frags > 1) { - bargs.mask |= UCC_BASE_CARGS_MAX_FRAG_COUNT; - bargs.max_frag_count = - ucc_buffer_block_count(max_frag_count, n_frags, 0); + bargs.mask |= UCC_BASE_CARGS_MAX_FRAG_COUNT; + bargs.max_frag_count = ucc_buffer_block_count(max_frag_count, n_frags, 0); } - status = ucc_schedule_pipelined_init( - &bargs, team, ucc_tl_ucp_allreduce_sra_knomial_frag_init, - ucc_tl_ucp_allreduce_sra_knomial_frag_setup, pipeline_depth, n_frags, - cfg->allreduce_sra_kn_pipeline.order, schedule_p); - if (UCC_OK != status) { + st = ucc_schedule_pipelined_init(&bargs, team, + ucc_tl_ucp_allreduce_sra_knomial_frag_init, + ucc_tl_ucp_allreduce_sra_knomial_frag_setup, + pipeline_depth, n_frags, + pipeline_params.order, schedule_p); + if (ucc_unlikely(UCC_OK != st)) { tl_error(team->context->lib, "failed to init pipelined schedule"); ucc_tl_ucp_put_schedule(&schedule_p->super); - return status; + return st; } - schedule_p->super.super.finalize = - ucc_tl_ucp_allreduce_sra_knomial_finalize; - schedule_p->super.super.post = ucc_tl_ucp_allreduce_sra_knomial_start; - *task_h = &schedule_p->super.super; + + schedule_p->super.super.finalize = ucc_tl_ucp_allreduce_sra_knomial_finalize; + schedule_p->super.super.post = ucc_tl_ucp_allreduce_sra_knomial_start; + *task_h = &schedule_p->super.super; return UCC_OK; } diff --git a/src/components/tl/ucp/alltoall/alltoall.c b/src/components/tl/ucp/alltoall/alltoall.c index faa888dcc0..3803d96426 100644 --- a/src/components/tl/ucp/alltoall/alltoall.c +++ b/src/components/tl/ucp/alltoall/alltoall.c @@ -56,8 +56,8 @@ ucc_status_t ucc_tl_ucp_alltoall_init(ucc_tl_ucp_task_t *task) } ucc_status_t ucc_tl_ucp_alltoall_pairwise_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t *team, - ucc_coll_task_t **task_h) + ucc_base_team_t *team, + ucc_coll_task_t **task_h) { ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); ucc_tl_ucp_task_t *task; @@ -72,8 +72,8 @@ ucc_status_t ucc_tl_ucp_alltoall_pairwise_init(ucc_base_coll_args_t *coll_args, } ucc_status_t ucc_tl_ucp_alltoall_onesided_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * team, - ucc_coll_task_t ** task_h) + ucc_base_team_t *team, + ucc_coll_task_t **task_h) { ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); ucc_tl_ucp_task_t *task; diff --git a/src/components/tl/ucp/alltoall/alltoall_bruck.c b/src/components/tl/ucp/alltoall/alltoall_bruck.c index 984b900b9c..4424437f8a 100644 --- a/src/components/tl/ucp/alltoall/alltoall_bruck.c +++ b/src/components/tl/ucp/alltoall/alltoall_bruck.c @@ -12,10 +12,15 @@ #include "coll_patterns/bruck_alltoall.h" #define RADIX 2 +#define SAVE_STATE(_phase) \ + do { \ + task->alltoall_bruck.phase = _phase; \ + } while (0) enum { PHASE_MERGE, - PHASE_SENDRECV + PHASE_SENDRECV, + PHASE_BCOPY }; static inline int msb_pos_for_level(unsigned int nthbit, ucc_rank_t number) @@ -33,7 +38,8 @@ static inline int msb_pos_for_level(unsigned int nthbit, ucc_rank_t number) return msb_set; } -static inline int find_seg_index(ucc_rank_t seg_index, int level, int nsegs_per_rblock) +static inline int find_seg_index(ucc_rank_t seg_index, int level, + int nsegs_per_rblock) { int block, blockseg; @@ -53,7 +59,8 @@ static inline int find_seg_index(ucc_rank_t seg_index, int level, int nsegs_per_ return block * nsegs_per_rblock 
+ blockseg; } -ucc_status_t ucc_tl_ucp_alltoall_bruck_backward_rotation(void *dst, void *src, +ucc_status_t ucc_tl_ucp_alltoall_bruck_backward_rotation(void *dst, + void *src, ucc_rank_t trank, ucc_rank_t tsize, size_t seg_size) @@ -107,18 +114,29 @@ void ucc_tl_ucp_alltoall_bruck_progress(ucc_coll_task_t *coll_task) ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_coll_args_t *args = &TASK_ARGS(task); void *scratch = task->alltoall_bruck.scratch_mc_header->addr; - void *mergebuf = args->dst.info.buffer; + void *mergebuf = task->alltoall_bruck.dst; const ucc_rank_t nrecv_segs = tsize / 2; const size_t seg_size = ucc_dt_size(args->src.info.datatype) * args->src.info.count / tsize; - void *data; + ucc_memory_type_t smtype = args->src.info.mem_type; + ucc_memory_type_t dmtype = args->dst.info.mem_type; ucc_rank_t sendto, recvfrom, step, index; + void *data; ucc_rank_t level, snd_count; int send_buffer_index; - ucc_status_t st; - - if (task->alltoall_bruck.phase == PHASE_SENDRECV) { + ucc_status_t status; + ucc_ee_executor_t *exec; + ucc_ee_executor_task_args_t eargs; + + EXEC_TASK_TEST(task->alltoall_bruck.phase, + "failed to copy data from user buffer to scratch", + task->alltoall_bruck.etask); + switch (task->alltoall_bruck.phase) { + case PHASE_SENDRECV: goto ALLTOALL_BRUCK_PHASE_SENDRECV; + case PHASE_BCOPY: + task->super.status = UCC_OK; + goto out; } step = 1 << (task->alltoall_bruck.iteration - 1); @@ -133,16 +151,16 @@ void ucc_tl_ucp_alltoall_bruck_progress(ucc_coll_task_t *coll_task) index = GET_NEXT_BRUCK_NUM(index, RADIX, step)) { send_buffer_index = find_seg_index(index, level + 1, nrecv_segs); if (send_buffer_index == -1) { - data = PTR_OFFSET(args->src.info.buffer, + data = PTR_OFFSET(task->alltoall_bruck.src, ((index + trank) % tsize) * seg_size); } else { data = PTR_OFFSET(scratch, send_buffer_index * seg_size); } - st = ucc_mc_memcpy(PTR_OFFSET(mergebuf, seg_size * snd_count), - data, seg_size, UCC_MEMORY_TYPE_HOST, - UCC_MEMORY_TYPE_HOST); - if (ucc_unlikely(UCC_OK != st)) { - task->super.status = st; + status = ucc_mc_memcpy(PTR_OFFSET(mergebuf, seg_size * snd_count), + data, seg_size, UCC_MEMORY_TYPE_HOST, + UCC_MEMORY_TYPE_HOST); + if (ucc_unlikely(UCC_OK != status)) { + task->super.status = status; return; } snd_count++; @@ -158,36 +176,88 @@ void ucc_tl_ucp_alltoall_bruck_progress(ucc_coll_task_t *coll_task) task, out); ALLTOALL_BRUCK_PHASE_SENDRECV: if (ucc_tl_ucp_test(task) == UCC_INPROGRESS) { - task->alltoall_bruck.phase = PHASE_SENDRECV; + SAVE_STATE(PHASE_SENDRECV); return; } task->alltoall_bruck.iteration++; step = 1 << (task->alltoall_bruck.iteration - 1); } - st = ucc_mc_memcpy(PTR_OFFSET(args->dst.info.buffer, trank * seg_size), - PTR_OFFSET(args->src.info.buffer, trank * seg_size), - seg_size, UCC_MEMORY_TYPE_HOST, UCC_MEMORY_TYPE_HOST); - if (ucc_unlikely(st != UCC_OK)) { - task->super.status = st; + status = ucc_mc_memcpy(PTR_OFFSET(task->alltoall_bruck.dst, trank * seg_size), + PTR_OFFSET(task->alltoall_bruck.src, trank * seg_size), + seg_size, UCC_MEMORY_TYPE_HOST, UCC_MEMORY_TYPE_HOST); + if (ucc_unlikely(status != UCC_OK)) { + task->super.status = status; return; } - task->super.status = - ucc_tl_ucp_alltoall_bruck_backward_rotation(args->dst.info.buffer, - scratch, trank, tsize, - seg_size); + status = ucc_tl_ucp_alltoall_bruck_backward_rotation(mergebuf, scratch, + trank, tsize, + seg_size); + if (ucc_unlikely(status != UCC_OK)) { + task->super.status = status; + return; + } + + if (smtype != UCC_MEMORY_TYPE_HOST || dmtype != UCC_MEMORY_TYPE_HOST) { + 
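/* non-host (e.g. GPU) buffers: the backward-rotated result currently lives in host scratch, so finish with a single executor copy back to the user destination and complete in PHASE_BCOPY once it lands */ +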
task->alltoall_bruck.phase = PHASE_BCOPY; + status = ucc_coll_task_get_executor(&task->super, &exec); + if (ucc_unlikely(status != UCC_OK)) { + task->super.status = status; + return; + } + + eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY; + eargs.copy.src = mergebuf; + eargs.copy.dst = args->dst.info.buffer; + eargs.copy.len = seg_size * tsize; + status = ucc_ee_executor_task_post(exec, &eargs, + &task->alltoall_bruck.etask); + if (ucc_unlikely(status != UCC_OK)) { + task->super.status = status; + return; + } + EXEC_TASK_TEST(PHASE_BCOPY, "failed to copy data to user buffer", + task->alltoall_bruck.etask); + } + + task->super.status = UCC_OK; out: return; } ucc_status_t ucc_tl_ucp_alltoall_bruck_start(ucc_coll_task_t *coll_task) { - ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t); - ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_coll_args_t *args = &TASK_ARGS(task); + size_t size = ucc_dt_size(args->src.info.datatype) * + args->src.info.count; + ucc_ee_executor_t *exec; + ucc_ee_executor_task_args_t eargs; + ucc_status_t status; + ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); task->alltoall_bruck.iteration = 1; task->alltoall_bruck.phase = PHASE_MERGE; - ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); + task->alltoall_bruck.etask = NULL; + + if ((args->src.info.mem_type != UCC_MEMORY_TYPE_HOST) || + (args->dst.info.mem_type != UCC_MEMORY_TYPE_HOST)) { + status = ucc_coll_task_get_executor(&task->super, &exec); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + + eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY; + eargs.copy.src = args->src.info.buffer; + eargs.copy.dst = task->alltoall_bruck.src; + eargs.copy.len = size; + status = ucc_ee_executor_task_post(exec, &eargs, + &task->alltoall_bruck.etask); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + } return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); } @@ -199,25 +269,28 @@ ucc_status_t ucc_tl_ucp_alltoall_bruck_init(ucc_base_coll_args_t *coll_args, ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(tl_team); ucc_coll_args_t *args = &coll_args->args; - size_t seg_size = ucc_dt_size(args->src.info.datatype) * - args->src.info.count / tsize; + size_t ssize = ucc_dt_size(args->src.info.datatype) * + args->src.info.count; + size_t seg_size = ssize / tsize; + int is_bcopy = 0; size_t scratch_size; ucc_tl_ucp_task_t *task; ucc_status_t status; - if ((coll_args->args.src.info.mem_type != UCC_MEMORY_TYPE_HOST) || - (coll_args->args.dst.info.mem_type != UCC_MEMORY_TYPE_HOST)) { - status = UCC_ERR_NOT_SUPPORTED; - goto out; - } ALLTOALL_TASK_CHECK(coll_args->args, tl_team); - task = ucc_tl_ucp_init_task(coll_args, team); task->super.post = ucc_tl_ucp_alltoall_bruck_start; task->super.progress = ucc_tl_ucp_alltoall_bruck_progress; task->super.finalize = ucc_tl_ucp_alltoall_bruck_finalize; + task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; scratch_size = lognum(tsize) * ucc_div_round_up(tsize, 2) * seg_size; + if ((coll_args->args.src.info.mem_type != UCC_MEMORY_TYPE_HOST) || + (coll_args->args.dst.info.mem_type != UCC_MEMORY_TYPE_HOST)) { + is_bcopy = 1; + scratch_size += 2 * ssize; + } + status = ucc_mc_alloc(&task->alltoall_bruck.scratch_mc_header, scratch_size, UCC_MEMORY_TYPE_HOST); if (ucc_unlikely(status != UCC_OK)) { @@ -226,6 +299,17 @@ ucc_status_t ucc_tl_ucp_alltoall_bruck_init(ucc_base_coll_args_t 
*coll_args, + if (is_bcopy) { + task->alltoall_bruck.src = + PTR_OFFSET(task->alltoall_bruck.scratch_mc_header->addr, + lognum(tsize) * ucc_div_round_up(tsize, 2) * seg_size); + task->alltoall_bruck.dst = + PTR_OFFSET(task->alltoall_bruck.src, ssize); + } else { + task->alltoall_bruck.src = args->src.info.buffer; + task->alltoall_bruck.dst = args->dst.info.buffer; + } + *task_h = &task->super; return UCC_OK; diff --git a/src/components/tl/ucp/alltoall/alltoall_onesided.c b/src/components/tl/ucp/alltoall/alltoall_onesided.c index 99c56d281c..856b392534 100644 --- a/src/components/tl/ucp/alltoall/alltoall_onesided.c +++ b/src/components/tl/ucp/alltoall/alltoall_onesided.c @@ -55,9 +55,7 @@ void ucc_tl_ucp_alltoall_onesided_progress(ucc_coll_task_t *ctask) ucc_rank_t gsize = UCC_TL_TEAM_SIZE(team); long * pSync = TASK_ARGS(task).global_work_buffer; - if ((*pSync < gsize) || - (task->onesided.put_completed < task->onesided.put_posted)) { - ucp_worker_progress(UCC_TL_UCP_TEAM_CTX(team)->worker.ucp_worker); + if (ucc_tl_ucp_test_onesided(task, gsize) == UCC_INPROGRESS) { return; } diff --git a/src/components/tl/ucp/alltoallv/alltoallv.c b/src/components/tl/ucp/alltoallv/alltoallv.c index bc21df9f10..063cbd22bf 100644 --- a/src/components/tl/ucp/alltoallv/alltoallv.c +++ b/src/components/tl/ucp/alltoallv/alltoallv.c @@ -19,6 +19,10 @@ ucc_base_coll_alg_info_t {.id = UCC_TL_UCP_ALLTOALLV_ALG_HYBRID, .name = "hybrid", .desc = "hybrid a2av alg "}, + [UCC_TL_UCP_ALLTOALLV_ALG_ONESIDED] = + {.id = UCC_TL_UCP_ALLTOALLV_ALG_ONESIDED, + .name = "onesided", + .desc = "O(N) onesided alltoallv"}, [UCC_TL_UCP_ALLTOALLV_ALG_LAST] = { .id = 0, .name = NULL, .desc = NULL}}; diff --git a/src/components/tl/ucp/alltoallv/alltoallv.h b/src/components/tl/ucp/alltoallv/alltoallv.h index 5aef136564..a501cc4205 100644 --- a/src/components/tl/ucp/alltoallv/alltoallv.h +++ b/src/components/tl/ucp/alltoallv/alltoallv.h @@ -13,6 +13,7 @@ enum { UCC_TL_UCP_ALLTOALLV_ALG_PAIRWISE, UCC_TL_UCP_ALLTOALLV_ALG_HYBRID, + UCC_TL_UCP_ALLTOALLV_ALG_ONESIDED, UCC_TL_UCP_ALLTOALLV_ALG_LAST }; @@ -32,6 +33,9 @@ ucc_status_t ucc_tl_ucp_alltoallv_hybrid_init(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task_h); +ucc_status_t ucc_tl_ucp_alltoallv_onesided_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); ucc_status_t ucc_tl_ucp_alltoallv_pairwise_init_common(ucc_tl_ucp_task_t *task); diff --git a/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c b/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c index 61b130eaa5..7b8c7b7b67 100644 --- a/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c +++ b/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c @@ -510,9 +510,11 @@ ucc_status_t post_recv(ucc_rank_t recvfrom, ucc_rank_t tsize, size_t dt_size, /* check if we have space for maximum receive.
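(i.e. meta->offset * dt_size + step_buf_size must still fit within tmp_buf_size).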
If not, recycle */ if (meta->offset * dt_size + step_buf_size > tmp_buf_size) { - new_offset = receive_buffer_recycler(tsize, (int *)op_metadata, (int *)op_metadata + tsize, - seg_st, p_tmp_recv_region, dt_size, BytesForPacking, - step, user_rbuf, rdisps, trank, radix, node_edge_id); + new_offset = receive_buffer_recycler(tsize, (unsigned int *)op_metadata, + (int *)op_metadata + tsize, + seg_st, p_tmp_recv_region, dt_size, + BytesForPacking, step, user_rbuf, + rdisps, trank, radix, node_edge_id); meta->offset = new_offset; } ucc_assert(meta->offset * dt_size + step_buf_size <= tmp_buf_size); @@ -595,8 +597,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step, temp_offset = PTR_OFFSET(temp_offset, cur_buf_length * dt_size); } else { /* data will be sent pairwise */ - ((int *)op_metadata)[i] = COUNT_DIRECT; - ((int *)op_metadata)[i + tsize] = COUNT_DIRECT; + ((int *)op_metadata)[i] = (int)COUNT_DIRECT; + ((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT; if (i < (step * radix)) { int pairwise_src = (trank - i + tsize) % tsize; if (rcounts[pairwise_src] > 0) { @@ -636,8 +638,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step, next_p = tsize; } } else { - ((int *)op_metadata)[i] = COUNT_DIRECT; - ((int *)op_metadata)[i + tsize] = COUNT_DIRECT; + ((int *)op_metadata)[i] = (int)COUNT_DIRECT; + ((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT; if (i < (step * radix)) { int pairwise_src = (trank - i + tsize) % tsize; if (rcounts[pairwise_src] > 0) { @@ -709,7 +711,7 @@ ucc_status_t pairwise_manager(ucc_rank_t trank, ucc_rank_t tsize, int *r_disps = (int*)TASK_ARGS(task).dst.info_v.displacements; int *scounts = (int*)TASK_ARGS(task).src.info_v.counts; int *rcounts = (int*)TASK_ARGS(task).dst.info_v.counts; - int* cur = &task->alltoallv_hybrid.cur_out; + ucc_rank_t *cur = &task->alltoallv_hybrid.cur_out; int chunk_num_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_pairwise_num_posts; int chunk_byte_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_chunk_byte_limit; ucc_status_t status; diff --git a/src/components/tl/ucp/alltoallv/alltoallv_onesided.c b/src/components/tl/ucp/alltoallv/alltoallv_onesided.c new file mode 100644 index 0000000000..bb6fa14b3e --- /dev/null +++ b/src/components/tl/ucp/alltoallv/alltoallv_onesided.c @@ -0,0 +1,104 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. 
+ */ + +#include "config.h" +#include "tl_ucp.h" +#include "alltoallv.h" +#include "core/ucc_progress_queue.h" +#include "utils/ucc_math.h" +#include "tl_ucp_sendrecv.h" + +ucc_status_t ucc_tl_ucp_alltoallv_onesided_start(ucc_coll_task_t *ctask) +{ + ucc_tl_ucp_task_t *task = ucc_derived_of(ctask, ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ptrdiff_t src = (ptrdiff_t)TASK_ARGS(task).src.info_v.buffer; + ptrdiff_t dest = (ptrdiff_t)TASK_ARGS(task).dst.info_v.buffer; + ucc_rank_t grank = UCC_TL_TEAM_RANK(team); + ucc_rank_t gsize = UCC_TL_TEAM_SIZE(team); + long *pSync = TASK_ARGS(task).global_work_buffer; + ucc_aint_t *s_disp = TASK_ARGS(task).src.info_v.displacements; + ucc_aint_t *d_disp = TASK_ARGS(task).dst.info_v.displacements; + size_t sdt_size = ucc_dt_size(TASK_ARGS(task).src.info_v.datatype); + size_t rdt_size = ucc_dt_size(TASK_ARGS(task).dst.info_v.datatype); + ucc_rank_t peer; + size_t sd_disp, dd_disp, data_size; + + ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); + + /* perform a put to each member peer using the peer's index in the + * destination displacement. */ + for (peer = (grank + 1) % gsize; task->onesided.put_posted < gsize; + peer = (peer + 1) % gsize) { + sd_disp = + ucc_coll_args_get_displacement(&TASK_ARGS(task), s_disp, peer) * + sdt_size; + dd_disp = + ucc_coll_args_get_displacement(&TASK_ARGS(task), d_disp, peer) * + rdt_size; + data_size = + ucc_coll_args_get_count(&TASK_ARGS(task), + TASK_ARGS(task).src.info_v.counts, peer) * + sdt_size; + + UCPCHECK_GOTO(ucc_tl_ucp_put_nb(PTR_OFFSET(src, sd_disp), + PTR_OFFSET(dest, dd_disp), + data_size, peer, team, task), + task, out); + UCPCHECK_GOTO(ucc_tl_ucp_atomic_inc(pSync, peer, team), task, out); + } + return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); +out: + return task->super.status; +} + +void ucc_tl_ucp_alltoallv_onesided_progress(ucc_coll_task_t *ctask) +{ + ucc_tl_ucp_task_t *task = ucc_derived_of(ctask, ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_rank_t gsize = UCC_TL_TEAM_SIZE(team); + long *pSync = TASK_ARGS(task).global_work_buffer; + + if (ucc_tl_ucp_test_onesided(task, gsize) == UCC_INPROGRESS) { + return; + } + + pSync[0] = 0; + task->super.status = UCC_OK; +} + +ucc_status_t ucc_tl_ucp_alltoallv_onesided_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t); + ucc_tl_ucp_task_t *task; + ucc_status_t status; + + ALLTOALLV_TASK_CHECK(coll_args->args, tl_team); + if (!(coll_args->args.mask & UCC_COLL_ARGS_FIELD_GLOBAL_WORK_BUFFER)) { + tl_error(UCC_TL_TEAM_LIB(tl_team), + "global work buffer not provided nor associated with team"); + status = UCC_ERR_NOT_SUPPORTED; + goto out; + } + if (coll_args->args.mask & UCC_COLL_ARGS_FIELD_FLAGS) { + if (!(coll_args->args.flags & UCC_COLL_ARGS_FLAG_MEM_MAPPED_BUFFERS)) { + tl_error(UCC_TL_TEAM_LIB(tl_team), + "non memory mapped buffers are not supported"); + status = UCC_ERR_NOT_SUPPORTED; + goto out; + } + } + + task = ucc_tl_ucp_init_task(coll_args, team); + *task_h = &task->super; + task->super.post = ucc_tl_ucp_alltoallv_onesided_start; + task->super.progress = ucc_tl_ucp_alltoallv_onesided_progress; + status = UCC_OK; +out: + return status; +} diff --git a/src/components/tl/ucp/bcast/bcast.c b/src/components/tl/ucp/bcast/bcast.c index 6a1d5b7720..b3b98e7779 100644 --- a/src/components/tl/ucp/bcast/bcast.c +++ b/src/components/tl/ucp/bcast/bcast.c @@ -19,6 +19,11 @@ 
ucc_base_coll_alg_info_t .name = "sag_knomial", .desc = "recursive knomial scatter followed by knomial " "allgather (optimized for BW)"}, + [UCC_TL_UCP_BCAST_ALG_DBT] = + {.id = UCC_TL_UCP_BCAST_ALG_DBT, + .name = "dbt", + .desc = "bcast over double binary tree where a leaf in one tree " + "is an intermediate node in the other (optimized for BW)"}, [UCC_TL_UCP_BCAST_ALG_LAST] = { .id = 0, .name = NULL, .desc = NULL}}; @@ -36,8 +41,8 @@ ucc_status_t ucc_tl_ucp_bcast_init(ucc_tl_ucp_task_t *task) } ucc_status_t ucc_tl_ucp_bcast_knomial_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * team, - ucc_coll_task_t ** task_h) + ucc_base_team_t *team, + ucc_coll_task_t **task_h) { ucc_tl_ucp_task_t *task; ucc_status_t status; diff --git a/src/components/tl/ucp/bcast/bcast.h b/src/components/tl/ucp/bcast/bcast.h index 3ea567fb9c..baaa40c313 100644 --- a/src/components/tl/ucp/bcast/bcast.h +++ b/src/components/tl/ucp/bcast/bcast.h @@ -11,6 +11,7 @@ enum { UCC_TL_UCP_BCAST_ALG_KNOMIAL, UCC_TL_UCP_BCAST_ALG_SAG_KNOMIAL, + UCC_TL_UCP_BCAST_ALG_DBT, UCC_TL_UCP_BCAST_ALG_LAST }; @@ -47,4 +48,8 @@ ucc_status_t ucc_tl_ucp_bcast_sag_knomial_init(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task_h); +ucc_status_t ucc_tl_ucp_bcast_dbt_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); + #endif diff --git a/src/components/tl/ucp/bcast/bcast_dbt.c b/src/components/tl/ucp/bcast/bcast_dbt.c new file mode 100644 index 0000000000..4e1f77594f --- /dev/null +++ b/src/components/tl/ucp/bcast/bcast_dbt.c @@ -0,0 +1,242 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. + */ + +#include "config.h" +#include "tl_ucp.h" +#include "bcast.h" +#include "core/ucc_progress_queue.h" +#include "tl_ucp_sendrecv.h" + +enum { + RECV, + SEND_T1, + SEND_T2, + TEST, +}; + +#define UCC_BCAST_DBT_CHECK_STATE(_p) \ + case _p: \ + goto _p; + +#define UCC_BCAST_DBT_GOTO_STATE(_state) \ + do { \ + switch (_state) { \ + UCC_BCAST_DBT_CHECK_STATE(SEND_T1); \ + UCC_BCAST_DBT_CHECK_STATE(SEND_T2); \ + UCC_BCAST_DBT_CHECK_STATE(TEST); \ + }; \ + } while (0) + +static void recv_completion_common(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, /* NOLINT */ + void *user_data) +{ + ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data; + if (ucc_unlikely(UCS_OK != status)) { + tl_error(UCC_TASK_LIB(task), "failure in recv completion %s", + ucs_status_string(status)); + task->super.status = ucs_status_to_ucc_status(status); + } + task->tagged.recv_completed++; + if (request) { + ucp_request_free(request); + } +} + +static void recv_completion_1(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, /* NOLINT */ + void *user_data) +{ + ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data; + + task->bcast_dbt.t1.recv++; + recv_completion_common(request, status, info, user_data); +} + +static void recv_completion_2(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, /* NOLINT */ + void *user_data) +{ + ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data; + + task->bcast_dbt.t2.recv++; + recv_completion_common(request, status, info, user_data); +} + +void ucc_tl_ucp_bcast_dbt_progress(ucc_coll_task_t *coll_task) +{ + ucc_tl_ucp_task_t *task = + ucc_derived_of(coll_task, ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_rank_t rank = UCC_TL_TEAM_RANK(team); + ucc_dbt_single_tree_t
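/* the two mirrored binary trees: a rank that is a leaf in t1 is an interior node in t2, so both halves of the message can stream in parallel over disjoint parent/child links */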
t1 = task->bcast_dbt.t1; + ucc_dbt_single_tree_t t2 = task->bcast_dbt.t2; + void *buffer = args->src.info.buffer; + ucc_memory_type_t mtype = args->src.info.mem_type; + ucc_datatype_t dt = args->src.info.datatype; + size_t count = args->src.info.count; + size_t count_t1 = (count % 2) ? count / 2 + 1 + : count / 2; + size_t data_size_t1 = count_t1 * ucc_dt_size(dt); + size_t data_size_t2 = count / 2 * ucc_dt_size(dt); + ucc_rank_t coll_root = (ucc_rank_t)args->root; + ucp_tag_recv_nbx_callback_t cb[2] = {recv_completion_1, + recv_completion_2}; + uint32_t i; + + UCC_BCAST_DBT_GOTO_STATE(task->bcast_dbt.state); + + if (rank != t1.root && rank != coll_root) { + UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(buffer, data_size_t1, mtype, + t1.parent, team, task, cb[0], + (void *)task), + task, out); + } + + if (rank != t2.root && rank != coll_root) { + UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(PTR_OFFSET(buffer, data_size_t1), + data_size_t2, mtype, t2.parent, team, + task, cb[1], (void *)task), + task, out); + } + task->bcast_dbt.state = SEND_T1; + +SEND_T1: + if ((coll_root == rank) || (task->bcast_dbt.t1.recv > 0)) { + for (i = 0; i < 2; i++) { + if ((t1.children[i] != UCC_RANK_INVALID) && + (t1.children[i] != coll_root)) { + UCPCHECK_GOTO(ucc_tl_ucp_send_nb(buffer, data_size_t1, mtype, + t1.children[i], team, task), + task, out); + } + } + } else { + goto out; + } + task->bcast_dbt.state = SEND_T2; + +SEND_T2: + if ((coll_root == rank) || (task->bcast_dbt.t2.recv > 0)) { + for (i = 0; i < 2; i++) { + if ((t2.children[i] != UCC_RANK_INVALID) && + (t2.children[i] != coll_root)) { + UCPCHECK_GOTO(ucc_tl_ucp_send_nb(PTR_OFFSET(buffer, + data_size_t1), + data_size_t2, mtype, + t2.children[i], team, task), + task, out); + } + } + } else { + goto out; + } + +TEST: + if (UCC_INPROGRESS == ucc_tl_ucp_test_send(task)) { + task->bcast_dbt.state = TEST; + return; + } + + task->super.status = UCC_OK; + UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_bcast_dbt_done", 0); + +out: + return; +} + +ucc_status_t ucc_tl_ucp_bcast_dbt_start(ucc_coll_task_t *coll_task) +{ + ucc_tl_ucp_task_t *task = + ucc_derived_of(coll_task, ucc_tl_ucp_task_t); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_status_t status = UCC_OK; + ucc_rank_t rank = UCC_TL_TEAM_RANK(team); + void *buffer = args->src.info.buffer; + ucc_memory_type_t mtype = args->src.info.mem_type; + ucc_datatype_t dt = args->src.info.datatype; + size_t count = args->src.info.count; + size_t count_t1 = (count % 2) ? 
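/* ceil(count/2): with an odd count the extra element goes to tree t1,
 * so t1 carries the first half of the buffer and t2 the remainder */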
count / 2 + 1
+                                            : count / 2;
+    size_t             data_size_t1 = count_t1 * ucc_dt_size(dt);
+    size_t             data_size_t2 = count / 2 * ucc_dt_size(dt);
+    ucc_rank_t         coll_root    = (ucc_rank_t)args->root;
+    ucc_rank_t         t1_root      = task->bcast_dbt.t1.root;
+    ucc_rank_t         t2_root      = task->bcast_dbt.t2.root;
+    ucp_tag_recv_nbx_callback_t cb[2] = {recv_completion_1,
+                                         recv_completion_2};
+
+    task->bcast_dbt.t1.recv = 0;
+    task->bcast_dbt.t2.recv = 0;
+    ucc_tl_ucp_task_reset(task, UCC_INPROGRESS);
+
+    if (rank == coll_root && coll_root != t1_root) {
+        status = ucc_tl_ucp_send_nb(buffer, data_size_t1, mtype, t1_root, team,
+                                    task);
+        if (UCC_OK != status) {
+            return status;
+        }
+    }
+
+    if (rank == coll_root && coll_root != t2_root) {
+        status = ucc_tl_ucp_send_nb(PTR_OFFSET(buffer, data_size_t1),
+                                    data_size_t2, mtype, t2_root, team, task);
+        if (UCC_OK != status) {
+            return status;
+        }
+    }
+
+    if (rank != coll_root && rank == t1_root) {
+        status = ucc_tl_ucp_recv_cb(buffer, data_size_t1, mtype, coll_root,
+                                    team, task, cb[0], (void *)task);
+        if (UCC_OK != status) {
+            return status;
+        }
+    }
+
+    if (rank != coll_root && rank == t2_root) {
+        status = ucc_tl_ucp_recv_cb(PTR_OFFSET(buffer, data_size_t1),
+                                    data_size_t2, mtype, coll_root, team, task,
+                                    cb[1], (void *)task);
+        if (UCC_OK != status) {
+            return status;
+        }
+    }
+
+    task->bcast_dbt.state = RECV;
+    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_bcast_dbt_start", 0);
+    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
+}
+
+ucc_status_t ucc_tl_ucp_bcast_dbt_finalize(ucc_coll_task_t *coll_task)
+{
+    return ucc_tl_ucp_coll_finalize(coll_task);
+}
+
+ucc_status_t ucc_tl_ucp_bcast_dbt_init(
+    ucc_base_coll_args_t *coll_args, ucc_base_team_t *team,
+    ucc_coll_task_t **task_h)
+{
+    ucc_tl_ucp_team_t *tl_team;
+    ucc_tl_ucp_task_t *task;
+    ucc_rank_t         rank, size;
+
+    task                 = ucc_tl_ucp_init_task(coll_args, team);
+    task->super.post     = ucc_tl_ucp_bcast_dbt_start;
+    task->super.progress = ucc_tl_ucp_bcast_dbt_progress;
+    task->super.finalize = ucc_tl_ucp_bcast_dbt_finalize;
+    tl_team              = TASK_TEAM(task);
+    rank                 = UCC_TL_TEAM_RANK(tl_team);
+    size                 = UCC_TL_TEAM_SIZE(tl_team);
+    ucc_dbt_build_trees(rank, size, &task->bcast_dbt.t1,
+                        &task->bcast_dbt.t2);
+
+    *task_h = &task->super;
+    return UCC_OK;
+}
diff --git a/src/components/tl/ucp/bcast/bcast_sag_knomial.c b/src/components/tl/ucp/bcast/bcast_sag_knomial.c
index 1fa56a7367..3f4a6919f6 100644
--- a/src/components/tl/ucp/bcast/bcast_sag_knomial.c
+++ b/src/components/tl/ucp/bcast/bcast_sag_knomial.c
@@ -70,8 +70,8 @@ ucc_tl_ucp_bcast_sag_knomial_finalize(ucc_coll_task_t *coll_task)
 
 ucc_status_t
 ucc_tl_ucp_bcast_sag_knomial_init(ucc_base_coll_args_t *coll_args,
-                                  ucc_base_team_t *team,
-                                  ucc_coll_task_t **task_h)
+                                  ucc_base_team_t      *team,
+                                  ucc_coll_task_t     **task_h)
 {
     ucc_tl_ucp_team_t *tl_team = ucc_derived_of(team, ucc_tl_ucp_team_t);
     size_t             count   = coll_args->args.src.info.count;
diff --git a/src/components/tl/ucp/reduce/reduce.c b/src/components/tl/ucp/reduce/reduce.c
index 82a9380083..039f9f393b 100644
--- a/src/components/tl/ucp/reduce/reduce.c
+++ b/src/components/tl/ucp/reduce/reduce.c
@@ -13,6 +13,11 @@ ucc_base_coll_alg_info_t
          .name = "knomial",
          .desc = "reduce over knomial tree with arbitrary radix "
                  "(optimized for latency)"},
+        [UCC_TL_UCP_REDUCE_ALG_DBT] =
+            {.id   = UCC_TL_UCP_REDUCE_ALG_DBT,
+             .name = "dbt",
+             .desc = "reduce over double binary tree where a leaf in one tree "
+                     "will be intermediate in other (optimized for BW)"},
         [UCC_TL_UCP_REDUCE_ALG_LAST] = {
             .id = 0, .name = NULL, .desc =
NULL}}; @@ -66,3 +71,16 @@ ucc_status_t ucc_tl_ucp_reduce_init(ucc_tl_ucp_task_t *task) return status; } + +ucc_status_t ucc_tl_ucp_reduce_knomial_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_task_t *task; + ucc_status_t status; + + task = ucc_tl_ucp_init_task(coll_args, team); + status = ucc_tl_ucp_reduce_init(task); + *task_h = &task->super; + return status; +} diff --git a/src/components/tl/ucp/reduce/reduce.h b/src/components/tl/ucp/reduce/reduce.h index e26c4fdf23..98bc183ff3 100644 --- a/src/components/tl/ucp/reduce/reduce.h +++ b/src/components/tl/ucp/reduce/reduce.h @@ -9,12 +9,16 @@ enum { UCC_TL_UCP_REDUCE_ALG_KNOMIAL, + UCC_TL_UCP_REDUCE_ALG_DBT, UCC_TL_UCP_REDUCE_ALG_LAST }; extern ucc_base_coll_alg_info_t ucc_tl_ucp_reduce_algs[UCC_TL_UCP_REDUCE_ALG_LAST + 1]; +#define UCC_TL_UCP_REDUCE_DEFAULT_ALG_SELECT_STR \ + "reduce:0-inf:@0" + /* A set of convenience macros used to implement sw based progress of the reduce algorithm that uses kn pattern */ enum { @@ -36,12 +40,32 @@ enum { }; \ } while (0) + +static inline int ucc_tl_ucp_reduce_alg_from_str(const char *str) +{ + int i; + for (i = 0; i < UCC_TL_UCP_REDUCE_ALG_LAST; i++) { + if (0 == strcasecmp(str, ucc_tl_ucp_reduce_algs[i].name)) { + break; + } + } + return i; +} + ucc_status_t ucc_tl_ucp_reduce_init(ucc_tl_ucp_task_t *task); +ucc_status_t ucc_tl_ucp_reduce_knomial_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); + ucc_status_t ucc_tl_ucp_reduce_knomial_start(ucc_coll_task_t *task); void ucc_tl_ucp_reduce_knomial_progress(ucc_coll_task_t *task); ucc_status_t ucc_tl_ucp_reduce_knomial_finalize(ucc_coll_task_t *task); +ucc_status_t ucc_tl_ucp_reduce_dbt_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h); + #endif diff --git a/src/components/tl/ucp/reduce/reduce_dbt.c b/src/components/tl/ucp/reduce/reduce_dbt.c new file mode 100644 index 0000000000..08e8774974 --- /dev/null +++ b/src/components/tl/ucp/reduce/reduce_dbt.c @@ -0,0 +1,358 @@ +/** + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. 
+ */
+
+#include "config.h"
+#include "tl_ucp.h"
+#include "reduce.h"
+#include "core/ucc_progress_queue.h"
+#include "tl_ucp_sendrecv.h"
+#include "utils/ucc_dt_reduce.h"
+
+enum {
+    RECV,
+    REDUCE,
+    TEST,
+    TEST_ROOT,
+};
+
+#define UCC_REDUCE_DBT_CHECK_STATE(_p)                                         \
+    case _p:                                                                   \
+        goto _p;
+
+#define UCC_REDUCE_DBT_GOTO_STATE(_state)                                      \
+    do {                                                                       \
+        switch (_state) {                                                      \
+            UCC_REDUCE_DBT_CHECK_STATE(REDUCE);                                \
+            UCC_REDUCE_DBT_CHECK_STATE(TEST);                                  \
+            UCC_REDUCE_DBT_CHECK_STATE(TEST_ROOT);                             \
+        };                                                                     \
+    } while (0)
+
+static void recv_completion_common(void *request, ucs_status_t status,
+                                   const ucp_tag_recv_info_t *info, /* NOLINT */
+                                   void *user_data)
+{
+    ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data;
+    if (ucc_unlikely(UCS_OK != status)) {
+        tl_error(UCC_TASK_LIB(task), "failure in recv completion %s",
+                 ucs_status_string(status));
+        task->super.status = ucs_status_to_ucc_status(status);
+    }
+    task->tagged.recv_completed++;
+    if (request) {
+        ucp_request_free(request);
+    }
+}
+
+static void recv_completion_1(void *request, ucs_status_t status,
+                              const ucp_tag_recv_info_t *info, /* NOLINT */
+                              void *user_data)
+{
+    ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data;
+
+    task->reduce_dbt.trees[0].recv++;
+    recv_completion_common(request, status, info, user_data);
+}
+
+static void recv_completion_2(void *request, ucs_status_t status,
+                              const ucp_tag_recv_info_t *info, /* NOLINT */
+                              void *user_data)
+{
+    ucc_tl_ucp_task_t *task = (ucc_tl_ucp_task_t *)user_data;
+
+    task->reduce_dbt.trees[1].recv++;
+    recv_completion_common(request, status, info, user_data);
+}
+
+static inline void single_tree_reduce(ucc_tl_ucp_task_t *task, void *sbuf,
+                                      void *rbuf, int n_children, size_t count,
+                                      size_t data_size, ucc_datatype_t dt,
+                                      ucc_coll_args_t *args, int is_avg)
+{
+    ucc_status_t status;
+
+    status = ucc_dt_reduce_strided(
+        sbuf, rbuf, rbuf,
+        n_children, count, data_size,
+        dt, args,
+        is_avg ? UCC_EEE_TASK_FLAG_REDUCE_WITH_ALPHA : 0,
+        AVG_ALPHA(task), task->reduce_dbt.executor,
+        &task->reduce_dbt.etask);
+
+    if (ucc_unlikely(UCC_OK != status)) {
+        tl_error(UCC_TASK_LIB(task),
+                 "failed to perform dt reduction");
+        task->super.status = status;
+        return;
+    }
+    EXEC_TASK_WAIT(task->reduce_dbt.etask);
+}
+
+void ucc_tl_ucp_reduce_dbt_progress(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_ucp_task_t          *task      = ucc_derived_of(coll_task,
+                                                           ucc_tl_ucp_task_t);
+    ucc_tl_ucp_team_t          *team      = TASK_TEAM(task);
+    ucc_coll_args_t            *args      = &TASK_ARGS(task);
+    ucc_dbt_single_tree_t      *trees     = task->reduce_dbt.trees;
+    ucc_rank_t                  rank      = UCC_TL_TEAM_RANK(team);
+    ucc_rank_t                  coll_root = (ucc_rank_t)args->root;
+    int                         is_root   = rank == coll_root;
+    ucp_tag_recv_nbx_callback_t cb[2]     = {recv_completion_1,
+                                             recv_completion_2};
+    void                       *sbuf[2], *rbuf[2];
+    uint32_t                    i, j, k;
+    ucc_memory_type_t           mtype;
+    ucc_datatype_t              dt;
+    size_t                      count, data_size, data_size_t1;
+    size_t                      counts[2];
+    int                         avg_pre_op, avg_post_op;
+
+    if (is_root) {
+        mtype = args->dst.info.mem_type;
+        dt    = args->dst.info.datatype;
+        count = args->dst.info.count;
+    } else {
+        mtype = args->src.info.mem_type;
+        dt    = args->src.info.datatype;
+        count = args->src.info.count;
+    }
+
+    counts[0]    = (count % 2) ? count / 2 + 1 : count / 2;
+    counts[1]    = count / 2;
+    data_size    = count * ucc_dt_size(dt);
+    data_size_t1 = counts[0] * ucc_dt_size(dt);
+    avg_pre_op   = ((args->op == UCC_OP_AVG) &&
+                    UCC_TL_UCP_TEAM_LIB(team)->cfg.reduce_avg_pre_op);
+    avg_post_op  = ((args->op == UCC_OP_AVG) &&
+                    !UCC_TL_UCP_TEAM_LIB(team)->cfg.reduce_avg_pre_op);
+
+    rbuf[0] = task->reduce_dbt.scratch;
+    rbuf[1] = PTR_OFFSET(rbuf[0], data_size_t1 * 2);
+    sbuf[0] = avg_pre_op ? PTR_OFFSET(rbuf[0], data_size * 2)
+                         : args->src.info.buffer;
+    sbuf[1] = PTR_OFFSET(sbuf[0], data_size_t1);
+
+    UCC_REDUCE_DBT_GOTO_STATE(task->reduce_dbt.state);
+    for (i = 0; i < 2; i++) {
+        j = 0;
+        for (k = 0; k < 2; k++) {
+            if (trees[i].children[k] != UCC_RANK_INVALID) {
+                UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(
+                                  PTR_OFFSET(rbuf[i],
+                                             counts[i] * ucc_dt_size(dt) * j),
+                                  counts[i] * ucc_dt_size(dt), mtype,
+                                  trees[i].children[k], team, task, cb[i],
+                                  (void *)task),
+                              task, out);
+                j++;
+            }
+        }
+    }
+    task->reduce_dbt.state = REDUCE;
+
+REDUCE:
+    for (i = 0; i < 2; i++) {
+        if (trees[i].recv == trees[i].n_children &&
+            !task->reduce_dbt.reduction_comp[i]) {
+            if (trees[i].n_children > 0) {
+                single_tree_reduce(task, sbuf[i], rbuf[i], trees[i].n_children,
+                                   counts[i], counts[i] * ucc_dt_size(dt), dt,
+                                   args, avg_post_op && trees[i].root == rank);
+            }
+            task->reduce_dbt.reduction_comp[i] = 1;
+        }
+    }
+
+    for (i = 0; i < 2; i++) {
+        if (rank != trees[i].root && task->reduce_dbt.reduction_comp[i] &&
+            !task->reduce_dbt.send_comp[i]) {
+            UCPCHECK_GOTO(ucc_tl_ucp_send_nb((trees[i].n_children > 0) ? rbuf[i]
+                                                                       : sbuf[i],
+                                             counts[i] * ucc_dt_size(dt),
+                                             mtype, trees[i].parent, team,
+                                             task),
+                          task, out);
+            task->reduce_dbt.send_comp[i] = 1;
+        }
+    }
+
+    if (!task->reduce_dbt.reduction_comp[0] ||
+        !task->reduce_dbt.reduction_comp[1]) {
+        return;
+    }
+TEST:
+    if (UCC_INPROGRESS == ucc_tl_ucp_test_send(task)) {
+        task->reduce_dbt.state = TEST;
+        return;
+    }
+
+    /* tree roots send to coll root */
+    for (i = 0; i < 2; i++) {
+        if (rank == trees[i].root && !is_root) {
+            UCPCHECK_GOTO(ucc_tl_ucp_send_nb(rbuf[i],
+                                             counts[i] * ucc_dt_size(dt),
+                                             mtype, coll_root, team, task),
+                          task, out);
+        }
+    }
+
+    task->reduce_dbt.reduction_comp[0] = trees[0].recv;
+    task->reduce_dbt.reduction_comp[1] = trees[1].recv;
+
+    for (i = 0; i < 2; i++) {
+        if (is_root && rank != trees[i].root) {
+            UCPCHECK_GOTO(ucc_tl_ucp_recv_cb(PTR_OFFSET(args->dst.info.buffer,
+                                                 i * counts[0] * ucc_dt_size(dt)),
+                                             counts[i] * ucc_dt_size(dt),
+                                             mtype, trees[i].root, team, task,
+                                             cb[i], (void *)task),
+                          task, out);
+            task->reduce_dbt.reduction_comp[i]++;
+        }
+    }
+
+TEST_ROOT:
+    if (UCC_INPROGRESS == ucc_tl_ucp_test_send(task) ||
+        task->reduce_dbt.reduction_comp[0] != trees[0].recv ||
+        task->reduce_dbt.reduction_comp[1] != trees[1].recv) {
+        task->reduce_dbt.state = TEST_ROOT;
+        return;
+    }
+
+    for (i = 0; i < 2; i++) {
+        if (is_root && rank == trees[i].root) {
+            UCPCHECK_GOTO(ucc_mc_memcpy(PTR_OFFSET(args->dst.info.buffer,
+                                            i * counts[(i + 1) % 2] * ucc_dt_size(dt)),
+                                        rbuf[i], counts[i] * ucc_dt_size(dt),
+                                        mtype, mtype), task, out);
+        }
+    }
+
+    task->super.status = UCC_OK;
+    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_reduce_dbt_done", 0);
+out:
+    return;
+}
+
+ucc_status_t ucc_tl_ucp_reduce_dbt_start(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_ucp_task_t *task      = ucc_derived_of(coll_task,
+                                                  ucc_tl_ucp_task_t);
+    ucc_tl_ucp_team_t *team      = TASK_TEAM(task);
+    ucc_coll_args_t   *args      = &TASK_ARGS(task);
+    ucc_rank_t         rank      = UCC_TL_TEAM_RANK(team);
+    ucc_rank_t         team_size = UCC_TL_TEAM_SIZE(team);
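/*
 * How the AVG handling below works: UCC_OP_AVG is realized as a sum plus a
 * scaling step, applied either before the reduction (pre-op: every rank
 * scales its own contribution) or after it (post-op: the tree roots scale
 * the reduced halves via AVG_ALPHA), selected by the reduce_avg_pre_op
 * config. In the pre-op branch the source buffer is passed to
 * ucc_dt_reduce() as both operands, yielding 2*x, hence the alpha of
 * 1.0 / (team_size * 2); the net contribution is x / team_size.
 */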
int avg_pre_op = + UCC_TL_UCP_TEAM_LIB(TASK_TEAM(task))->cfg.reduce_avg_pre_op; + ucc_datatype_t dt; + size_t count, data_size; + ucc_status_t status; + + task->reduce_dbt.trees[0].recv = 0; + task->reduce_dbt.trees[1].recv = 0; + task->reduce_dbt.reduction_comp[0] = 0; + task->reduce_dbt.reduction_comp[1] = 0; + task->reduce_dbt.send_comp[0] = 0; + task->reduce_dbt.send_comp[1] = 0; + + ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); + + if (args->root == rank) { + count = args->dst.info.count; + dt = args->dst.info.datatype; + } else { + count = args->src.info.count; + dt = args->src.info.datatype; + } + data_size = count * ucc_dt_size(dt); + + status = ucc_coll_task_get_executor(&task->super, + &task->reduce_dbt.executor); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + + if (UCC_IS_INPLACE(*args) && (rank == args->root)) { + args->src.info.buffer = args->dst.info.buffer; + } + + if (avg_pre_op && args->op == UCC_OP_AVG) { + /* In case of avg_pre_op, each process must divide itself by team_size */ + status = + ucc_dt_reduce(args->src.info.buffer, args->src.info.buffer, + PTR_OFFSET(task->reduce_dbt.scratch, data_size * 2), + count, dt, args, UCC_EEE_TASK_FLAG_REDUCE_WITH_ALPHA, + 1.0 / (double)(team_size * 2), + task->reduce_dbt.executor, &task->reduce_dbt.etask); + if (ucc_unlikely(UCC_OK != status)) { + tl_error(UCC_TASK_LIB(task), + "failed to perform dt reduction"); + return status; + } + EXEC_TASK_WAIT(task->reduce_dbt.etask, status); + } + + task->reduce_dbt.state = RECV; + UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_reduce_dbt_start", 0); + return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); +} + +ucc_status_t ucc_tl_ucp_reduce_dbt_finalize(ucc_coll_task_t *coll_task) +{ + ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t); + + if (task->reduce_dbt.scratch_mc_header) { + ucc_mc_free(task->reduce_dbt.scratch_mc_header); + } + + return ucc_tl_ucp_coll_finalize(coll_task); +} + +ucc_status_t ucc_tl_ucp_reduce_dbt_init(ucc_base_coll_args_t *coll_args, + ucc_base_team_t *team, + ucc_coll_task_t **task_h) +{ + ucc_tl_ucp_team_t *tl_team; + ucc_tl_ucp_task_t *task; + ucc_rank_t rank, size; + ucc_memory_type_t mtype; + ucc_datatype_t dt; + size_t count; + size_t data_size; + ucc_status_t status; + + task = ucc_tl_ucp_init_task(coll_args, team); + task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; + task->super.post = ucc_tl_ucp_reduce_dbt_start; + task->super.progress = ucc_tl_ucp_reduce_dbt_progress; + task->super.finalize = ucc_tl_ucp_reduce_dbt_finalize; + tl_team = TASK_TEAM(task); + rank = UCC_TL_TEAM_RANK(tl_team); + size = UCC_TL_TEAM_SIZE(tl_team); + ucc_dbt_build_trees(rank, size, &task->reduce_dbt.trees[0], + &task->reduce_dbt.trees[1]); + + if (coll_args->args.root == rank) { + count = coll_args->args.dst.info.count; + dt = coll_args->args.dst.info.datatype; + mtype = coll_args->args.dst.info.mem_type; + } else { + count = coll_args->args.src.info.count; + dt = coll_args->args.src.info.datatype; + mtype = coll_args->args.src.info.mem_type; + } + data_size = count * ucc_dt_size(dt); + task->reduce_dbt.scratch_mc_header = NULL; + status = ucc_mc_alloc(&task->reduce_dbt.scratch_mc_header, 3 * data_size, + mtype); + if (ucc_unlikely(status != UCC_OK)) { + return status; + } + task->reduce_dbt.scratch = task->reduce_dbt.scratch_mc_header->addr; + *task_h = &task->super; + return UCC_OK; +} diff --git a/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c 
b/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c index 11a2abc859..ca5457dfb4 100644 --- a/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c +++ b/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c @@ -22,7 +22,8 @@ static inline void get_sbuf_rbuf(ucc_tl_ucp_task_t *task, size_t block_count, size_t dt_size = ucc_dt_size(args->dst.info.datatype); void *scratch = task->reduce_scatter_kn.scratch; ucc_knomial_pattern_t *p = &task->reduce_scatter_kn.p; - size_t offset, local_seg_offset, local_seg_count; + size_t offset, local_seg_count; + ptrdiff_t local_seg_offset; if (ucc_knomial_pattern_loop_first_iteration(p)) { *sbuf = ((KN_NODE_PROXY == p->node_type) || UCC_IS_INPLACE(*args)) diff --git a/src/components/tl/ucp/tl_ucp.c b/src/components/tl/ucp/tl_ucp.c index 83fa7dceeb..1ee970e715 100644 --- a/src/components/tl/ucp/tl_ucp.c +++ b/src/components/tl/ucp/tl_ucp.c @@ -126,7 +126,7 @@ ucc_config_field_t ucc_tl_ucp_lib_config_table[] = { ucc_offsetof(ucc_tl_ucp_lib_config_t, allreduce_sra_kn_radix), UCC_CONFIG_TYPE_UINT_RANGED}, - {"ALLREDUCE_SRA_KN_PIPELINE", "n", + {"ALLREDUCE_SRA_KN_PIPELINE", "auto", "Pipelining settings for SRA Knomial allreduce algorithm", ucc_offsetof(ucc_tl_ucp_lib_config_t, allreduce_sra_kn_pipeline), UCC_CONFIG_TYPE_PIPELINE_PARAMS}, diff --git a/src/components/tl/ucp/tl_ucp_coll.c b/src/components/tl/ucp/tl_ucp_coll.c index e3dd1782af..0efd285db7 100644 --- a/src/components/tl/ucp/tl_ucp_coll.c +++ b/src/components/tl/ucp/tl_ucp_coll.c @@ -42,6 +42,10 @@ const ucc_tl_ucp_default_alg_desc_t .select_str = UCC_TL_UCP_BCAST_DEFAULT_ALG_SELECT_STR, .str_get_fn = NULL }, + { + .select_str = UCC_TL_UCP_REDUCE_DEFAULT_ALG_SELECT_STR, + .str_get_fn = NULL + }, { .select_str = UCC_TL_UCP_REDUCE_SCATTER_DEFAULT_ALG_SELECT_STR, .str_get_fn = NULL @@ -223,6 +227,8 @@ static inline int alg_id_from_str(ucc_coll_type_t coll_type, const char *str) return ucc_tl_ucp_alltoallv_alg_from_str(str); case UCC_COLL_TYPE_BCAST: return ucc_tl_ucp_bcast_alg_from_str(str); + case UCC_COLL_TYPE_REDUCE: + return ucc_tl_ucp_reduce_alg_from_str(str); case UCC_COLL_TYPE_REDUCE_SCATTER: return ucc_tl_ucp_reduce_scatter_alg_from_str(str); case UCC_COLL_TYPE_REDUCE_SCATTERV: @@ -239,6 +245,7 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, ucc_base_coll_init_fn_t *init) { ucc_status_t status = UCC_OK; + if (alg_id_str) { alg_id = alg_id_from_str(coll_type, alg_id_str); } @@ -271,6 +278,9 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, case UCC_TL_UCP_ALLREDUCE_ALG_SLIDING_WINDOW: *init = ucc_tl_ucp_allreduce_sliding_window_init; break; + case UCC_TL_UCP_ALLREDUCE_ALG_DBT: + *init = ucc_tl_ucp_allreduce_dbt_init; + break; default: status = UCC_ERR_INVALID_PARAM; break; @@ -284,6 +294,9 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, case UCC_TL_UCP_BCAST_ALG_SAG_KNOMIAL: *init = ucc_tl_ucp_bcast_sag_knomial_init; break; + case UCC_TL_UCP_BCAST_ALG_DBT: + *init = ucc_tl_ucp_bcast_dbt_init; + break; default: status = UCC_ERR_INVALID_PARAM; break; @@ -313,11 +326,27 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, case UCC_TL_UCP_ALLTOALLV_ALG_HYBRID: *init = ucc_tl_ucp_alltoallv_hybrid_init; break; + case UCC_TL_UCP_ALLTOALLV_ALG_ONESIDED: + *init = ucc_tl_ucp_alltoallv_onesided_init; + break; default: status = UCC_ERR_INVALID_PARAM; break; }; break; + case UCC_COLL_TYPE_REDUCE: + switch (alg_id) { + case UCC_TL_UCP_REDUCE_ALG_KNOMIAL: + *init = 
ucc_tl_ucp_reduce_knomial_init; + break; + case UCC_TL_UCP_REDUCE_ALG_DBT: + *init = ucc_tl_ucp_reduce_dbt_init; + break; + default: + status = UCC_ERR_INVALID_PARAM; + break; + }; + break; case UCC_COLL_TYPE_REDUCE_SCATTER: switch (alg_id) { case UCC_TL_UCP_REDUCE_SCATTER_ALG_RING: diff --git a/src/components/tl/ucp/tl_ucp_coll.h b/src/components/tl/ucp/tl_ucp_coll.h index a4def89286..cb4df40bc5 100644 --- a/src/components/tl/ucp/tl_ucp_coll.h +++ b/src/components/tl/ucp/tl_ucp_coll.h @@ -11,12 +11,13 @@ #include "tl_ucp.h" #include "schedule/ucc_schedule_pipelined.h" #include "coll_patterns/recursive_knomial.h" +#include "coll_patterns/double_binary_tree.h" #include "components/mc/base/ucc_mc_base.h" #include "components/ec/ucc_ec.h" #include "tl_ucp_tag.h" #define UCC_UUNITS_AUTO_RADIX 4 -#define UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR 7 +#define UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR 8 ucc_status_t ucc_tl_ucp_team_default_score_str_alloc(ucc_tl_ucp_team_t *team, char *default_select_str[UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR]); @@ -47,6 +48,7 @@ void ucc_tl_ucp_team_default_score_str_free( return; \ } \ ucc_ee_executor_task_finalize(_etask); \ + _etask = NULL; \ if (ucc_unlikely(status < 0)) { \ tl_error(UCC_TASK_LIB(task), _errmsg); \ task->super.status = status; \ @@ -209,6 +211,11 @@ typedef struct ucc_tl_ucp_task { ucc_rank_t dist; uint32_t radix; } bcast_kn; + struct { + ucc_dbt_single_tree_t t1; + ucc_dbt_single_tree_t t2; + int state; + } bcast_dbt; struct { ucc_rank_t dist; ucc_rank_t max_dist; @@ -220,6 +227,16 @@ typedef struct ucc_tl_ucp_task { ucc_ee_executor_task_t *etask; ucc_ee_executor_t *executor; } reduce_kn; + struct { + int state; + ucc_dbt_single_tree_t trees[2]; + int reduction_comp[2]; + int send_comp[2]; + void *scratch; + ucc_mc_buffer_header_t *scratch_mc_header; + ucc_ee_executor_task_t *etask; + ucc_ee_executor_t *executor; + } reduce_dbt; struct { ucc_rank_t dist; ucc_rank_t max_dist; @@ -245,6 +262,9 @@ typedef struct ucc_tl_ucp_task { } alltoallv_hybrid; struct { ucc_mc_buffer_header_t *scratch_mc_header; + ucc_ee_executor_task_t *etask; + void *src; + void *dst; ucc_rank_t iteration; int phase; } alltoall_bruck; @@ -391,6 +411,9 @@ static inline ucc_status_t ucc_tl_ucp_test(ucc_tl_ucp_task_t *task) #define UCC_TL_UCP_TASK_RECV_COMPLETE(_task) \ (((_task)->tagged.recv_posted == (_task)->tagged.recv_completed)) +#define UCC_TL_UCP_TASK_SEND_COMPLETE(_task) \ + (((_task)->tagged.send_posted == (_task)->tagged.send_completed)) + static inline ucc_status_t ucc_tl_ucp_test_recv(ucc_tl_ucp_task_t *task) { int polls = 0; @@ -407,6 +430,22 @@ static inline ucc_status_t ucc_tl_ucp_test_recv(ucc_tl_ucp_task_t *task) return UCC_INPROGRESS; } +static inline ucc_status_t ucc_tl_ucp_test_send(ucc_tl_ucp_task_t *task) +{ + int polls = 0; + + if (UCC_TL_UCP_TASK_SEND_COMPLETE(task)) { + return UCC_OK; + } + while (polls++ < task->n_polls) { + if (UCC_TL_UCP_TASK_SEND_COMPLETE(task)) { + return UCC_OK; + } + ucp_worker_progress(UCC_TL_UCP_TASK_TEAM(task)->worker->ucp_worker); + } + return UCC_INPROGRESS; +} + #define UCC_TL_UCP_TASK_RING_P2P_COMPLETE(_task) \ ((((_task)->tagged.send_posted - (_task)->tagged.send_completed) <= 1) && \ ((_task)->tagged.recv_posted == (_task)->tagged.recv_completed)) @@ -427,6 +466,32 @@ static inline ucc_status_t ucc_tl_ucp_test_ring(ucc_tl_ucp_task_t *task) return UCC_INPROGRESS; } +#define UCC_TL_UCP_TASK_ONESIDED_P2P_COMPLETE(_task) \ + (((_task)->onesided.put_posted == (_task)->onesided.put_completed) && \ + ((_task)->onesided.get_posted == 
(_task)->onesided.get_completed)) + +#define UCC_TL_UCP_TASK_ONESIDED_SYNC_COMPLETE(_task, _end) \ + (*((long *)(TASK_ARGS(_task).global_work_buffer)) == _end) + +static inline ucc_status_t ucc_tl_ucp_test_onesided(ucc_tl_ucp_task_t *task, + int sync_end) +{ + int polls = 0; + + if (UCC_TL_UCP_TASK_ONESIDED_P2P_COMPLETE(task) && + UCC_TL_UCP_TASK_ONESIDED_SYNC_COMPLETE(task, sync_end)) { + return UCC_OK; + } + while (polls++ < task->n_polls) { + if (UCC_TL_UCP_TASK_ONESIDED_P2P_COMPLETE(task) && + UCC_TL_UCP_TASK_ONESIDED_SYNC_COMPLETE(task, sync_end)) { + return UCC_OK; + } + ucp_worker_progress(UCC_TL_UCP_TASK_TEAM(task)->worker->ucp_worker); + } + return UCC_INPROGRESS; +} + ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str, ucc_coll_type_t coll_type, ucc_memory_type_t mem_type, diff --git a/src/components/tl/ucp/tl_ucp_context.c b/src/components/tl/ucp/tl_ucp_context.c index e00109ad95..6da05132ba 100644 --- a/src/components/tl/ucp/tl_ucp_context.c +++ b/src/components/tl/ucp/tl_ucp_context.c @@ -162,12 +162,13 @@ UCC_CLASS_INIT_FUNC(ucc_tl_ucp_context_t, "failed to read ucp configuration", err_cfg_read, self); ucp_params.field_mask = - UCP_PARAM_FIELD_FEATURES | UCP_PARAM_FIELD_TAG_SENDER_MASK; + UCP_PARAM_FIELD_FEATURES | UCP_PARAM_FIELD_TAG_SENDER_MASK | UCP_PARAM_FIELD_NAME; ucp_params.features = UCP_FEATURE_TAG | UCP_FEATURE_AM; if (params->params.mask & UCC_CONTEXT_PARAM_FIELD_MEM_PARAMS) { ucp_params.features |= UCP_FEATURE_RMA | UCP_FEATURE_AMO64; } ucp_params.tag_sender_mask = UCC_TL_UCP_TAG_SENDER_MASK; + ucp_params.name = "UCC_UCP_CONTEXT"; if (params->estimated_num_ppn > 0) { ucp_params.field_mask |= UCP_PARAM_FIELD_ESTIMATED_NUM_PPN; diff --git a/src/components/tl/ucp/tl_ucp_sendrecv.h b/src/components/tl/ucp/tl_ucp_sendrecv.h index 9f234cb039..ab815bad71 100644 --- a/src/components/tl/ucp/tl_ucp_sendrecv.h +++ b/src/components/tl/ucp/tl_ucp_sendrecv.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) Meta Platforms, Inc. and affiliates. 2022. * * See file LICENSE for terms. 
@@ -254,16 +254,18 @@ ucc_tl_ucp_resolve_p2p_by_va(ucc_tl_ucp_team_t *team, void *va, ucp_ep_h *ep, keys = PTR_OFFSET(base_offset, (section_offset * 3)); for (int i = 0; i < ctx->n_rinfo_segs; i++) { - if ((uint64_t)va >= (uint64_t)team->va_base[i] && - (uint64_t)va < (uint64_t)team->va_base[i] + team->base_length[i]) { + uint64_t base = (uint64_t)team->va_base[i]; + uint64_t end = base + team->base_length[i]; + if ((uint64_t)va >= base && + (uint64_t)va < end) { *segment = i; break; } key_offset += key_sizes[i]; } - if (0 > *segment) { + if (ucc_unlikely(0 > *segment)) { tl_error(UCC_TL_TEAM_LIB(team), - "attempt to perform one-sided operation on non-registered memory"); + "attempt to perform one-sided operation on non-registered memory %p", va); return UCC_ERR_NOT_FOUND; } if (ucc_unlikely(NULL == UCC_TL_UCP_REMOTE_RKEY(ctx, peer, *segment))) { diff --git a/src/core/ucc_constructor.c b/src/core/ucc_constructor.c index 2cabdc2f32..c113d2ea56 100644 --- a/src/core/ucc_constructor.c +++ b/src/core/ucc_constructor.c @@ -15,6 +15,7 @@ #include "utils/profile/ucc_profile.h" #include "ucc/api/ucc_version.h" #include +#include static ucc_status_t ucc_check_config_file(void) { @@ -93,100 +94,106 @@ static ucc_status_t init_lib_paths(void) UCC_CONFIG_REGISTER_TABLE(ucc_global_config_table, "UCC global", NULL, ucc_global_config, &ucc_config_global_list) +static pthread_mutex_t ucc_constructor_mutex = PTHREAD_MUTEX_INITIALIZER; + ucc_status_t ucc_constructor(void) { - ucc_global_config_t *cfg = &ucc_global_config; - ucc_status_t status; + ucc_global_config_t *cfg = &ucc_global_config; + ucc_status_t status = UCC_OK; Dl_info dl_info; int ret; - if (!cfg->initialized) { - cfg->initialized = 1; - status = ucc_config_parser_fill_opts( - &ucc_global_config, UCC_CONFIG_GET_TABLE(ucc_global_config_table), - "UCC_", 1); - if (UCC_OK != status) { - ucc_error("failed to parse global options"); - return status; - } + pthread_mutex_lock(&ucc_constructor_mutex); + if (cfg->initialized) { + goto exit_unlock_mutex; + } - if (UCC_OK != (status = init_lib_paths())) { - ucc_error("failed to init ucc components path"); - return status; - } + cfg->initialized = 1; + status = ucc_config_parser_fill_opts( + &ucc_global_config, UCC_CONFIG_GET_TABLE(ucc_global_config_table), + "UCC_", 1); + if (UCC_OK != status) { + ucc_error("failed to parse global options"); + goto exit_unlock_mutex; + } - status = ucc_check_config_file(); - if (UCC_OK != status && UCC_ERR_NOT_FOUND != status) { - /* bail only in case of real error */ - return status; - } + if (UCC_OK != (status = init_lib_paths())) { + ucc_error("failed to init ucc components path"); + goto exit_unlock_mutex; + } - status = ucc_components_load("cl", &cfg->cl_framework); - if (UCC_OK != status) { - ucc_error("no CL components were found in the " - "ucc modules dir: %s", - cfg->component_path); - return status; - } - status = ucc_component_check_scores_uniq(&cfg->cl_framework); - if (UCC_OK != status) { - ucc_error("CLs must have distinct uniq default scores"); - return status; - } - status = ucc_components_load("tl", &cfg->tl_framework); - if (UCC_OK != status) { - /* not critical - some CLs may operate w/o use of TL */ - ucc_debug("no TL components were found in the " - "ucc modules dir: %s", - cfg->component_path); - } - status = ucc_component_check_scores_uniq(&cfg->tl_framework); - if (UCC_OK != status) { - ucc_error("TLs must have distinct uniq default scores"); - return status; - } - status = ucc_components_load("mc", &cfg->mc_framework); - if (UCC_OK != status) { 
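        /* unlike the TL case above, memory components are mandatory: without
         * at least one MC, UCC cannot allocate or copy collective buffers */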
- ucc_error("no memory components were found in the " - "ucc modules dir: %s", + status = ucc_check_config_file(); + if (UCC_OK != status && UCC_ERR_NOT_FOUND != status) { + /* bail only in case of real error */ + goto exit_unlock_mutex; + } + + status = ucc_components_load("cl", &cfg->cl_framework); + if (UCC_OK != status) { + ucc_error("no CL components were found in the " + "ucc modules dir: %s", cfg->component_path); + goto exit_unlock_mutex; + } + status = ucc_component_check_scores_uniq(&cfg->cl_framework); + if (UCC_OK != status) { + ucc_error("CLs must have distinct uniq default scores"); + goto exit_unlock_mutex; + } + status = ucc_components_load("tl", &cfg->tl_framework); + if (UCC_OK != status) { + /* not critical - some CLs may operate w/o use of TL */ + ucc_debug("no TL components were found in the " + "ucc modules dir: %s", cfg->component_path); + } + status = ucc_component_check_scores_uniq(&cfg->tl_framework); + if (UCC_OK != status) { + ucc_error("TLs must have distinct uniq default scores"); + goto exit_unlock_mutex; + } + status = ucc_components_load("mc", &cfg->mc_framework); + if (UCC_OK != status) { + ucc_error("no memory components were found in the " + "ucc modules dir: %s", cfg->component_path); + goto exit_unlock_mutex; + } + status = ucc_components_load("ec", &cfg->ec_framework); + if (status != UCC_OK) { + if (status == UCC_ERR_NOT_FOUND) { + ucc_info("no execution components were found in the " + "ucc modules dir: %s. " + "Triggered operations might not work", cfg->component_path); - return status; - } - status = ucc_components_load("ec", &cfg->ec_framework); - if (status != UCC_OK) { - if (status == UCC_ERR_NOT_FOUND) { - ucc_info("no execution components were found in the " - "ucc modules dir: %s. " - "Triggered operations might not work", - cfg->component_path); - } else { - ucc_error("failed to load execution components %d (%s)", - status, ucc_status_string(status)); - return status; - } + } else { + ucc_error("failed to load execution components %d (%s)", + status, ucc_status_string(status)); + goto exit_unlock_mutex; } + } - if (UCC_OK != ucc_local_proc_info_init()) { - ucc_error("failed to initialize local proc info"); - return status; - } + if (UCC_OK != ucc_local_proc_info_init()) { + ucc_error("failed to initialize local proc info"); + goto exit_unlock_mutex; + } #ifdef HAVE_PROFILING - ucc_profile_init(cfg->profile_mode, cfg->profile_file, - cfg->profile_log_size); + ucc_profile_init(cfg->profile_mode, cfg->profile_file, + cfg->profile_log_size); #endif - if (ucc_global_config.log_component.log_level >= UCC_LOG_LEVEL_INFO) { - ret = dladdr(ucc_init_version, &dl_info); - if (ret == 0) { - ucc_error("failed to get ucc_init_version handler"); - return UCC_ERR_NO_MESSAGE; - } - ucc_info("version: %s, loaded from: %s, cfg file: %s", - ucc_get_version_string(), dl_info.dli_fname, - ucc_global_config.file_cfg ? - ucc_global_config.file_cfg->filename: "n/a"); + if (ucc_global_config.log_component.log_level >= UCC_LOG_LEVEL_INFO) { + ret = dladdr(ucc_init_version, &dl_info); + if (ret == 0) { + ucc_error("failed to get ucc_init_version handler"); + status = UCC_ERR_NO_RESOURCE; + goto exit_unlock_mutex; } + ucc_info("version: %s, loaded from: %s, cfg file: %s", + ucc_get_version_string(), dl_info.dli_fname, + ucc_global_config.file_cfg ? 
+ ucc_global_config.file_cfg->filename: "n/a"); } - return UCC_OK; + +exit_unlock_mutex: + pthread_mutex_unlock(&ucc_constructor_mutex); + return status; } __attribute__((destructor)) static void ucc_destructor(void) diff --git a/src/core/ucc_global_opts.h b/src/core/ucc_global_opts.h index 54079ad6fc..203ca65e9d 100644 --- a/src/core/ucc_global_opts.h +++ b/src/core/ucc_global_opts.h @@ -35,8 +35,8 @@ typedef struct ucc_global_config { /* Limit for profiling log size */ size_t profile_log_size; - char * cfg_filename; - ucc_file_config_t * file_cfg; + char *cfg_filename; + ucc_file_config_t *file_cfg; } ucc_global_config_t; extern ucc_global_config_t ucc_global_config; diff --git a/src/ucc/api/ucc.h b/src/ucc/api/ucc.h index c7c0ce10b0..a269dfb940 100644 --- a/src/ucc/api/ucc.h +++ b/src/ucc/api/ucc.h @@ -1337,7 +1337,7 @@ struct ucc_ep_map_cb { * @ingroup UCC_TEAM_DT */ typedef enum { - UCC_EP_MAP_FULL = 1, /*!< The ep range of the team spans all eps from a context*/ + UCC_EP_MAP_FULL = 1, /*!< The ep range of the team spans all eps from a context. */ UCC_EP_MAP_STRIDED = 2, /*!< The ep range of the team can be described by the 2 values: start, stride.*/ UCC_EP_MAP_ARRAY = 3, /*!< The ep range is given as an array of intergers that map the ep in the team to the team_context rank. */ diff --git a/src/utils/arch/cuda_def.h b/src/utils/arch/cuda_def.h index 7f690531e2..d758846c9d 100644 --- a/src/utils/arch/cuda_def.h +++ b/src/utils/arch/cuda_def.h @@ -74,6 +74,15 @@ static inline ucc_status_t cuda_error_to_ucc_status(cudaError_t cuda_status) } \ } while(0) +#define CUDADRV_CHECK(_cmd) \ + /* coverity[dead_error_line] */ \ + do { \ + ucc_status_t _cuda_status = CUDADRV_FUNC(_cmd); \ + if (ucc_unlikely(_cuda_status != UCC_OK)) { \ + return _cuda_status; \ + } \ + } while(0) + #define CUDA_CHECK_GOTO(_cmd, _label, _cuda_status) \ do { \ _cuda_status = CUDA_FUNC(_cmd); \ diff --git a/src/utils/ucc_coll_utils.c b/src/utils/ucc_coll_utils.c index 3921f1262e..75a49400e2 100644 --- a/src/utils/ucc_coll_utils.c +++ b/src/utils/ucc_coll_utils.c @@ -266,10 +266,11 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size, ucc_rank_t full_size, int need_free, int is64) { int is_const_stride = 0; - ucc_ep_map_t map = {0}; + ucc_ep_map_t map; int64_t stride; ucc_rank_t i; + map.type = (ucc_ep_map_type_t)0; map.ep_num = size; if (size > 1) { /* try to detect strided pattern */ @@ -303,6 +304,7 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size, map.array.map = (void *)(*array); map.array.elem_size = is64 ? 
sizeof(uint64_t) : sizeof(ucc_rank_t); } + return map; } @@ -359,6 +361,12 @@ void ucc_coll_args_str(const ucc_coll_args_t *args, ucc_rank_t trank, strncat(hdr, tmp, left); } + if (UCC_IS_PERSISTENT(*args)) { + ucc_snprintf_safe(tmp, sizeof(tmp), " persistent"); + left = COLL_ARGS_HEADER_STR_MAX_SIZE - strlen(hdr); + strncat(hdr, tmp, left); + } + if (ucc_coll_args_is_rooted(ct)) { ucc_snprintf_safe(tmp, sizeof(tmp), " root %u", root); left = COLL_ARGS_HEADER_STR_MAX_SIZE - strlen(hdr); @@ -636,6 +644,18 @@ ucc_ep_map_t ucc_ep_map_create_reverse(ucc_rank_t size) return map; } +int ucc_ep_map_is_identity(const ucc_ep_map_t *map) +{ + if ((map->type == UCC_EP_MAP_FULL) || + ((map->type == UCC_EP_MAP_STRIDED) && + (map->strided.start == 0) && + (map->strided.stride == 1))) { + return 1; + } else { + return 0; + } +} + static inline int ucc_ep_map_is_reverse(ucc_ep_map_t *map, int reversed_reordered_flag) { diff --git a/src/utils/ucc_coll_utils.h b/src/utils/ucc_coll_utils.h index 2d3a919f08..c5cb2ef392 100644 --- a/src/utils/ucc_coll_utils.h +++ b/src/utils/ucc_coll_utils.h @@ -71,7 +71,11 @@ #define UCC_COLL_ARGS_ACTIVE_SET(_args) \ ((_args)->mask & UCC_COLL_ARGS_FIELD_ACTIVE_SET) -#define UCC_MEM_TYPE_MASK_FULL -1 +#define UCC_MEM_TYPE_MASK_FULL (UCC_BIT(UCC_MEMORY_TYPE_HOST) | \ + UCC_BIT(UCC_MEMORY_TYPE_CUDA) | \ + UCC_BIT(UCC_MEMORY_TYPE_CUDA_MANAGED) | \ + UCC_BIT(UCC_MEMORY_TYPE_ROCM) | \ + UCC_BIT(UCC_MEMORY_TYPE_ROCM_MANAGED)) static inline int ucc_coll_args_is_reduction(ucc_coll_type_t ct) { @@ -119,29 +123,6 @@ ucc_coll_args_get_displacement(const ucc_coll_args_t *args, return ((uint32_t *)displacements)[idx]; } -static inline const char* ucc_mem_type_str(ucc_memory_type_t ct) -{ - switch((int)ct) { - case UCC_MEMORY_TYPE_HOST: - return "Host"; - case UCC_MEMORY_TYPE_CUDA: - return "Cuda"; - case UCC_MEMORY_TYPE_CUDA_MANAGED: - return "CudaManaged"; - case UCC_MEMORY_TYPE_ROCM: - return "Rocm"; - case UCC_MEMORY_TYPE_ROCM_MANAGED: - return "RocmManaged"; - case UCC_MEMORY_TYPE_ASYMMETRIC: - return "asymmetric"; - case UCC_MEMORY_TYPE_NOT_APPLY: - return "n/a"; - default: - break; - } - return "invalid"; -} - static inline size_t ucc_coll_args_get_total_count(const ucc_coll_args_t *args, const ucc_count_t *counts, ucc_rank_t size) @@ -244,6 +225,8 @@ ucc_status_t ucc_ep_map_create_nested(ucc_ep_map_t *base_map, ucc_ep_map_t *sub_map, ucc_ep_map_t *out); +int ucc_ep_map_is_identity(const ucc_ep_map_t *map); + void ucc_ep_map_destroy_nested(ucc_ep_map_t *out); void ucc_ep_map_destroy(ucc_ep_map_t *map); diff --git a/src/utils/ucc_compiler_def.h b/src/utils/ucc_compiler_def.h index 41d13ecb78..b204df67f3 100644 --- a/src/utils/ucc_compiler_def.h +++ b/src/utils/ucc_compiler_def.h @@ -26,6 +26,7 @@ #define ucc_snprintf_safe snprintf #define ucc_likely ucs_likely #define ucc_unlikely ucs_unlikely +#define ucc_string_split ucs_string_split /** * Prevent compiler from reordering instructions diff --git a/src/utils/ucc_log.h b/src/utils/ucc_log.h index 21ad88dd05..b480ee55ae 100644 --- a/src/utils/ucc_log.h +++ b/src/utils/ucc_log.h @@ -187,4 +187,27 @@ static inline const char* ucc_reduction_op_str(ucc_reduction_op_t op) } } +static inline const char* ucc_mem_type_str(ucc_memory_type_t ct) +{ + switch((int)ct) { + case UCC_MEMORY_TYPE_HOST: + return "Host"; + case UCC_MEMORY_TYPE_CUDA: + return "Cuda"; + case UCC_MEMORY_TYPE_CUDA_MANAGED: + return "CudaManaged"; + case UCC_MEMORY_TYPE_ROCM: + return "Rocm"; + case UCC_MEMORY_TYPE_ROCM_MANAGED: + return "RocmManaged"; + case 
UCC_MEMORY_TYPE_ASYMMETRIC: + return "asymmetric"; + case UCC_MEMORY_TYPE_NOT_APPLY: + return "n/a"; + default: + break; + } + return "invalid"; +} + #endif diff --git a/src/utils/ucc_parser.c b/src/utils/ucc_parser.c index fff69e47c6..6db8ef52f8 100644 --- a/src/utils/ucc_parser.c +++ b/src/utils/ucc_parser.c @@ -86,25 +86,28 @@ static inline int ucc_check_range(char *range_str, ucc_rank_t *begin, char **range = ucc_str_split(range_str, "-"); char *str_end; unsigned n_range; + long pbegin, pend; if (!range) { goto split_err; } n_range = ucc_str_split_count(range); - *begin = (size_t) strtol(range[0], &str_end, 10); - *end = *begin; + pbegin = strtol(range[0], &str_end, 10); + pend = pbegin; - if (n_range > 2 || *str_end != '\0' || *begin < 0) { + if (n_range > 2 || *str_end != '\0' || pbegin < 0) { goto val_err; } if (n_range == 2) { - *end = (size_t) strtol(range[1], &str_end, 10); - if (*str_end != '\0' || *end < 0) { + pend = strtol(range[1], &str_end, 10); + if (*str_end != '\0' || pend < 0) { goto val_err; } } + *begin = (ucc_rank_t)pbegin; + *end = (ucc_rank_t)pend; ucc_str_split_free(range); return 1; @@ -852,7 +855,7 @@ int ucc_config_sscanf_uint_ranged(const char *buf, void *dest, if (!r) { goto err_tokens; } - r->mtypes = -1; //mask all types + r->mtypes = UCC_MEM_TYPE_MASK_FULL; r->start = 0; r->end = SIZE_MAX; @@ -905,7 +908,7 @@ int ucc_config_sprintf_uint_ranged(char *buf, size_t max, const void *src, ucc_list_for_each(r, &s->ranges, list_elem) { ucs_memunits_to_str(r->start, tmp_start, tmp_max); ucs_memunits_to_str(r->end, tmp_end, tmp_max); - if (r->mtypes == -1) { + if (r->mtypes == UCC_MEM_TYPE_MASK_FULL) { ucc_snprintf_safe(buf, max, "%s-%s:%u", tmp_start, tmp_end, r->value); } else { diff --git a/src/utils/ucc_parser.h b/src/utils/ucc_parser.h index 17a64c3df4..517dd88be8 100644 --- a/src/utils/ucc_parser.h +++ b/src/utils/ucc_parser.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. 
*/ @@ -168,8 +168,13 @@ static inline ucc_status_t ucc_config_parser_set_value(void *opts, ucc_config_field_t *fields, const char *name, const char *value) { - ucs_status_t status = - ucs_config_parser_set_value(opts, fields, name, value); + ucs_status_t status; + +#if UCS_HAVE_PARSER_SET_VALUE_TABLE_PREFIX + status = ucs_config_parser_set_value(opts, fields, NULL, name, value); +#else + status = ucs_config_parser_set_value(opts, fields, name, value); +#endif return ucs_status_to_ucc_status(status); } @@ -263,8 +268,29 @@ int ucc_config_sprintf_uint_ranged(char *buf, size_t max, const void *src, ucs_status_t ucc_config_clone_uint_ranged(const void *src, void *dest, const void *arg); -void ucc_config_release_uint_ranged(void *ptr, const void *arg); +void ucc_config_release_uint_ranged(void *ptr, const void *arg); + +#ifdef UCS_HAVE_PARSER_CONFIG_DOC +#define UCC_CONFIG_TYPE_UINT_RANGED \ + { \ + ucc_config_sscanf_uint_ranged, ucc_config_sprintf_uint_ranged, \ + ucc_config_clone_uint_ranged, ucc_config_release_uint_ranged, \ + ucs_config_help_generic, ucs_config_doc_nop, \ + "[-:[mtype]:value," \ + "-:[mtype]:value,...,]default_value\n" \ + "# value and default_value can be \"auto\"" \ + } +#define UCC_CONFIG_TYPE_PIPELINE_PARAMS \ + { \ + ucc_config_sscanf_pipeline_params, ucc_config_sprintf_pipeline_params, \ + ucc_config_clone_pipeline_params, \ + ucc_config_release_pipeline_params, ucs_config_help_generic, \ + ucs_config_doc_nop, \ + "thresh=:fragsize=:nfrags=" \ + ":pdepth=:" \ + } +#else #define UCC_CONFIG_TYPE_UINT_RANGED \ { \ ucc_config_sscanf_uint_ranged, ucc_config_sprintf_uint_ranged, \ @@ -280,7 +306,8 @@ void ucc_config_release_uint_ranged(void *ptr, const void *arg); ucc_config_clone_pipeline_params, \ ucc_config_release_pipeline_params, ucs_config_help_generic, \ "thresh=:fragsize=:nfrags=" \ - ":pdepth=:" \ + ":pdepth=:" \ } +#endif #endif diff --git a/src/utils/ucc_rcache.h b/src/utils/ucc_rcache.h index dd1d6298e6..46993caacb 100644 --- a/src/utils/ucc_rcache.h +++ b/src/utils/ucc_rcache.h @@ -8,6 +8,7 @@ #include #include +#include //TODO: handle external events #define ucc_rcache_t ucs_rcache_t @@ -25,8 +26,17 @@ static inline ucc_status_t ucc_rcache_create(const ucc_rcache_params_t *params, const char *name, ucc_rcache_t **rcache_p) { +#ifndef UCS_HAVE_RCACHE_REGION_ALIGNMENT + ucc_rcache_params_t params_dup = *params; + params_dup.alignment = UCS_PGT_ADDR_ALIGN; + params_dup.max_alignment = ucc_get_page_size(); + + return ucs_status_to_ucc_status(ucs_rcache_create( + ¶ms_dup, name, NULL, rcache_p)); +#else return ucs_status_to_ucc_status(ucs_rcache_create( - params, name, NULL, rcache_p)); + params, name, NULL, rcache_p)); +#endif } /* [arg] parameter allows passing additional information from mem_reg callabck. 
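/*
 * The two UCS_HAVE_RCACHE_REGION_ALIGNMENT branches here and in
 * ucc_rcache_get() below bridge a UCX API change: newer UCX takes the
 * region alignment per ucs_rcache_get() call, while older UCX expects the
 * alignment limits in the create-time params. Either way the UCC-level
 * call looks the same, e.g. (the region release call name is assumed from
 * the usual ucs_rcache mapping):
 *
 *   ucc_rcache_region_t *region;
 *   if (UCC_OK == ucc_rcache_get(rcache, buf, len, NULL, &region)) {
 *       // ... use the registered memory ...
 *       ucc_rcache_region_put(rcache, region);
 *   }
 */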
@@ -36,9 +46,16 @@ static inline ucc_status_t ucc_rcache_get(ucc_rcache_t *rcache, void *address, size_t length, void *arg, ucc_rcache_region_t **region_p) { +#ifdef UCS_HAVE_RCACHE_REGION_ALIGNMENT return ucs_status_to_ucc_status(ucs_rcache_get( rcache, address, length, + ucc_get_page_size(), PROT_READ | PROT_WRITE, arg, region_p)); +#else + return ucs_status_to_ucc_status(ucs_rcache_get( + rcache, address, length, + PROT_READ | PROT_WRITE, arg, region_p)); +#endif } #endif diff --git a/test/gtest/coll/test_allreduce.cc b/test/gtest/coll/test_allreduce.cc index 3384f997e7..ef0a8aed24 100644 --- a/test/gtest/coll/test_allreduce.cc +++ b/test/gtest/coll/test_allreduce.cc @@ -331,6 +331,43 @@ TYPED_TEST(test_allreduce_alg, sra_knomial_pipelined) { } } +TYPED_TEST(test_allreduce_alg, dbt) { + int n_procs = 15; + ucc_job_env_t env = {{"UCC_CL_BASIC_TUNE", "inf"}, + {"UCC_TL_UCP_TUNE", "allreduce:@dbt:inf"}}; + UccJob job(n_procs, UccJob::UCC_JOB_CTX_GLOBAL, env); + UccTeam_h team = job.create_team(n_procs); + int repeat = 3; + UccCollCtxVec ctxs; + std::vector mt = {UCC_MEMORY_TYPE_HOST}; + + if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA)) { + mt.push_back(UCC_MEMORY_TYPE_CUDA); + } + if (UCC_OK == ucc_mc_available( UCC_MEMORY_TYPE_CUDA_MANAGED)) { + mt.push_back( UCC_MEMORY_TYPE_CUDA_MANAGED); + } + + for (auto count : {65536, 123567}) { + for (auto inplace : {TEST_NO_INPLACE, TEST_INPLACE}) { + for (auto m : mt) { + SET_MEM_TYPE(m); + this->set_inplace(inplace); + this->data_init(n_procs, TypeParam::dt, count, ctxs, true); + UccReq req(team, ctxs); + + for (auto i = 0; i < repeat; i++) { + req.start(); + req.wait(); + EXPECT_EQ(true, this->data_validate(ctxs)); + this->reset(ctxs); + } + this->data_fini(ctxs); + } + } + } +} + TYPED_TEST(test_allreduce_alg, rab) { int n_procs = 15; ucc_job_env_t env = {{"UCC_CL_HIER_TUNE", "allreduce:@rab:0-inf:inf"}, diff --git a/test/gtest/coll/test_bcast.cc b/test/gtest/coll/test_bcast.cc index ace5f50a9b..6d80816a31 100644 --- a/test/gtest/coll/test_bcast.cc +++ b/test/gtest/coll/test_bcast.cc @@ -8,6 +8,7 @@ using Param_0 = std::tuple; using Param_1 = std::tuple; +using Param_2 = std::tuple; class test_bcast : public UccCollArgs, public ucc::test { @@ -241,42 +242,49 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(1,3,65536), // count ::testing::Values(0,1))); // root -class test_bcast_alg : public test_bcast +class test_bcast_alg : public test_bcast, + public ::testing::WithParamInterface {}; -UCC_TEST_F(test_bcast_alg, 2step) { - int n_procs = 15; - ucc_job_env_t env = {{"UCC_CL_HIER_TUNE", "bcast:@2step:0-inf:inf"}, - {"UCC_CLS", "all"}}; - UccJob job(n_procs, UccJob::UCC_JOB_CTX_GLOBAL, env); - UccTeam_h team = job.create_team(n_procs); - int repeat = 1; +UCC_TEST_P(test_bcast_alg,) { + const ucc_memory_type_t mt = std::get<0>(GetParam()); + const ucc_job_env_t env = std::get<1>(GetParam()); + const int count = std::get<2>(GetParam()); + const int n_procs = std::get<3>(GetParam()); + UccJob job(n_procs, UccJob::UCC_JOB_CTX_GLOBAL, env); + UccTeam_h team = job.create_team(n_procs); + int repeat = 1; UccCollCtxVec ctxs; - std::vector mt = {UCC_MEMORY_TYPE_HOST}; - if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA)) { - mt.push_back(UCC_MEMORY_TYPE_CUDA); - } - if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA_MANAGED)) { - mt.push_back(UCC_MEMORY_TYPE_CUDA_MANAGED); - } - - for (auto count : {8, 65536}) { - for (int root = 0; root < n_procs; root++) { - for (auto m : mt) { - this->set_root(root); - SET_MEM_TYPE(m); - this->data_init(n_procs, 
UCC_DT_INT8, count, ctxs, false); - UccReq req(team, ctxs); - - for (auto i = 0; i < repeat; i++) { - req.start(); - req.wait(); - EXPECT_EQ(true, this->data_validate(ctxs)); - this->reset(ctxs); - } - this->data_fini(ctxs); - } + SET_MEM_TYPE(mt); + for (int root = 0; root < n_procs; root++) { + this->set_root(root); + this->data_init(n_procs, UCC_DT_INT8, count, ctxs, false); + UccReq req(team, ctxs); + + for (auto i = 0; i < repeat; i++) { + req.start(); + req.wait(); + EXPECT_EQ(true, this->data_validate(ctxs)); + this->reset(ctxs); } + this->data_fini(ctxs); } } + +ucc_job_env_t two_step_env = {{"UCC_CL_HIER_TUNE", "bcast:@2step:0-inf:inf"}, + {"UCC_CLS", "all"}}; +ucc_job_env_t dbt_env = {{"UCC_TL_UCP_TUNE", "bcast:@dbt:0-inf:inf"}, + {"UCC_CLS", "basic"}}; +INSTANTIATE_TEST_CASE_P( + , test_bcast_alg, + ::testing::Combine( +#ifdef HAVE_CUDA + ::testing::Values(UCC_MEMORY_TYPE_HOST, UCC_MEMORY_TYPE_CUDA, + UCC_MEMORY_TYPE_CUDA_MANAGED), +#else + ::testing::Values(UCC_MEMORY_TYPE_HOST), +#endif + ::testing::Values(two_step_env, dbt_env), //env + ::testing::Values(8, 65536), // count + ::testing::Values(15,16))); // n_procs diff --git a/test/gtest/coll/test_reduce.cc b/test/gtest/coll/test_reduce.cc index 393e97decc..0f8bfc034f 100644 --- a/test/gtest/coll/test_reduce.cc +++ b/test/gtest/coll/test_reduce.cc @@ -23,17 +23,9 @@ class test_reduce : public UccCollArgs, public testing::Test { ucc_coll_args_t *coll = (ucc_coll_args_t*) calloc(1, sizeof(ucc_coll_args_t)); - ctxs[r] = (gtest_ucc_coll_ctx_t*)calloc(1, - sizeof(gtest_ucc_coll_ctx_t)); - ctxs[r]->args = coll; - - coll->coll_type = UCC_COLL_TYPE_REDUCE; - coll->op = T::redop; - coll->root = root; - coll->src.info.mem_type = mem_type; - coll->src.info.count = (ucc_count_t)count; - coll->src.info.datatype = dt; - + ctxs[r] = (gtest_ucc_coll_ctx_t*)calloc(1, + sizeof(gtest_ucc_coll_ctx_t)); + ctxs[r]->args = coll; ctxs[r]->init_buf = ucc_malloc(ucc_dt_size(dt) * count, "init buf"); EXPECT_NE(ctxs[r]->init_buf, nullptr); @@ -48,6 +40,21 @@ class test_reduce : public UccCollArgs, public testing::Test { ptr[i] = (typename T::type)((i + r + 1) % 8); } + coll->coll_type = UCC_COLL_TYPE_REDUCE; + coll->op = T::redop; + coll->root = root; + if (r != root || !inplace) { + coll->src.info.mem_type = mem_type; + coll->src.info.count = (ucc_count_t)count; + coll->src.info.datatype = dt; + UCC_CHECK(ucc_mc_alloc(&ctxs[r]->src_mc_header, + ucc_dt_size(dt) * count, mem_type)); + coll->src.info.buffer = ctxs[r]->src_mc_header->addr; + UCC_CHECK(ucc_mc_memcpy(coll->src.info.buffer, + ctxs[r]->init_buf, + ucc_dt_size(dt) * count, mem_type, + UCC_MEMORY_TYPE_HOST)); + } if (r == root) { coll->dst.info.mem_type = mem_type; coll->dst.info.count = (ucc_count_t)count; @@ -65,15 +72,6 @@ class test_reduce : public UccCollArgs, public testing::Test { coll->mask |= UCC_COLL_ARGS_FIELD_FLAGS; coll->flags |= UCC_COLL_ARGS_FLAG_IN_PLACE; } - if (r != root || !inplace) { - UCC_CHECK(ucc_mc_alloc(&ctxs[r]->src_mc_header, - ucc_dt_size(dt) * count, mem_type)); - coll->src.info.buffer = ctxs[r]->src_mc_header->addr; - UCC_CHECK(ucc_mc_memcpy(coll->src.info.buffer, - ctxs[r]->init_buf, - ucc_dt_size(dt) * count, mem_type, - UCC_MEMORY_TYPE_HOST)); - } if (persistent) { coll->mask |= UCC_COLL_ARGS_FIELD_FLAGS; coll->flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; @@ -282,42 +280,58 @@ TYPED_TEST(test_reduce_cuda, multiple_inplace_managed) { template class test_reduce_avg_order : public test_reduce { }; +template class test_reduce_dbt : public test_reduce { +}; + +#define 
TEST_DECLARE_WITH_ENV(_env, _n_procs) \ + { \ + UccJob job(_n_procs, UccJob::UCC_JOB_CTX_GLOBAL, _env); \ + UccTeam_h team = job.create_team(_n_procs); \ + int repeat = 3; \ + UccCollCtxVec ctxs; \ + std::vector mt = {UCC_MEMORY_TYPE_HOST}; \ + if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA)) { \ + mt.push_back(UCC_MEMORY_TYPE_CUDA); \ + } \ + if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA_MANAGED)) { \ + mt.push_back(UCC_MEMORY_TYPE_CUDA_MANAGED); \ + } \ + for (auto count : {5, 256, 65536}) { \ + for (auto inplace : {TEST_NO_INPLACE, TEST_INPLACE}) { \ + for (auto m : mt) { \ + CHECK_TYPE_OP_SKIP(TypeParam::dt, TypeParam::redop, m); \ + SET_MEM_TYPE(m); \ + this->set_inplace(inplace); \ + this->data_init(_n_procs, TypeParam::dt, count, ctxs, true); \ + UccReq req(team, ctxs); \ + CHECK_REQ_NOT_SUPPORTED_SKIP(req, this->data_fini(ctxs)); \ + for (auto i = 0; i < repeat; i++) { \ + req.start(); \ + req.wait(); \ + EXPECT_EQ(true, this->data_validate(ctxs)); \ + this->reset(ctxs); \ + } \ + this->data_fini(ctxs); \ + } \ + } \ + } \ + } + TYPED_TEST_CASE(test_reduce_avg_order, CollReduceTypeOpsAvg); +TYPED_TEST_CASE(test_reduce_dbt, CollReduceTypeOpsHost); -TYPED_TEST(test_reduce_avg_order, avg_post_op) -{ - int n_procs = 15; - ucc_job_env_t env = {{"UCC_TL_UCP_REDUCE_AVG_PRE_OP", "0"}}; - UccJob job(n_procs, UccJob::UCC_JOB_CTX_GLOBAL, env); - UccTeam_h team = job.create_team(n_procs); - int repeat = 3; - UccCollCtxVec ctxs; - std::vector mt = {UCC_MEMORY_TYPE_HOST}; +ucc_job_env_t post_op_env = {{"UCC_TL_UCP_REDUCE_AVG_PRE_OP", "0"}}; +ucc_job_env_t reduce_dbt_env = {{"UCC_TL_UCP_TUNE", "reduce:@dbt:0-inf:inf"}, + {"UCC_CLS", "basic"}}; - if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA)) { - mt.push_back(UCC_MEMORY_TYPE_CUDA); - } - if (UCC_OK == ucc_mc_available(UCC_MEMORY_TYPE_CUDA_MANAGED)) { - mt.push_back(UCC_MEMORY_TYPE_CUDA_MANAGED); - } +TYPED_TEST(test_reduce_avg_order, avg_post_op) { + TEST_DECLARE_WITH_ENV(post_op_env, 15); +} - for (auto count : {4, 256, 65536}) { - for (auto inplace : {TEST_NO_INPLACE, TEST_INPLACE}) { - for (auto m : mt) { - CHECK_TYPE_OP_SKIP(TypeParam::dt, TypeParam::redop, m); - SET_MEM_TYPE(m); - this->set_inplace(inplace); - this->data_init(n_procs, TypeParam::dt, count, ctxs, true); - UccReq req(team, ctxs); - CHECK_REQ_NOT_SUPPORTED_SKIP(req, this->data_fini(ctxs)); - for (auto i = 0; i < repeat; i++) { - req.start(); - req.wait(); - EXPECT_EQ(true, this->data_validate(ctxs)); - this->reset(ctxs); - } - this->data_fini(ctxs); - } - } - } +TYPED_TEST(test_reduce_dbt, reduce_dbt_shift) { + TEST_DECLARE_WITH_ENV(reduce_dbt_env, 15); +} + +TYPED_TEST(test_reduce_dbt, reduce_dbt_mirror) { + TEST_DECLARE_WITH_ENV(reduce_dbt_env, 16); } diff --git a/test/gtest/core/test_mc_reduce.cc b/test/gtest/core/test_mc_reduce.cc index e528119835..674808ccdb 100644 --- a/test/gtest/core/test_mc_reduce.cc +++ b/test/gtest/core/test_mc_reduce.cc @@ -101,6 +101,7 @@ class test_mc_reduce : public testing::Test { std::cerr << "failed to destory cuda stream" << std::endl; return UCC_ERR_NO_MESSAGE; } + ee_context = NULL; } #endif return status; @@ -110,11 +111,11 @@ class test_mc_reduce : public testing::Test { { ucc_status_t status; - status = alloc_executor(mtype); + status = alloc_bufs(mtype, n); if (UCC_OK != status) { return status; } - return alloc_bufs(mtype, n); + return alloc_executor(mtype); } ucc_status_t alloc_bufs(ucc_memory_type_t mtype, size_t n) @@ -192,9 +193,6 @@ class test_mc_reduce : public testing::Test { virtual void TearDown() override { 
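        /* rationale suggested by the hunks below: the executor is now freed
         * inside each test body, before results are copied back, so pending
         * executor work is drained first rather than at TearDown() time */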
free_bufs(mem_type); - if (executor) { - free_executor(); - } ucc_mc_finalize(); } @@ -246,6 +244,9 @@ class test_mc_reduce : public testing::Test { GTEST_SKIP(); } ASSERT_EQ(status, UCC_OK); + if (executor) { + free_executor(); + } if (mt != UCC_MEMORY_TYPE_HOST) { ucc_mc_memcpy(this->res_h, this->res_d, this->COUNT * sizeof(*this->res_d), @@ -272,6 +273,9 @@ class test_mc_reduce : public testing::Test { GTEST_SKIP(); } ASSERT_EQ(status, UCC_OK); + if (executor) { + free_executor(); + } if (mt != UCC_MEMORY_TYPE_HOST) { ucc_mc_memcpy(this->res_h, this->res_d, this->COUNT * sizeof(*this->res_d), @@ -305,6 +309,9 @@ class test_mc_reduce : public testing::Test { GTEST_SKIP(); } ASSERT_EQ(status, UCC_OK); + if (executor) { + free_executor(); + } if (mt != UCC_MEMORY_TYPE_HOST) { ucc_mc_memcpy(this->res_h, this->res_d, this->COUNT * sizeof(*this->res_d), diff --git a/test/mpi/buffer.cc b/test/mpi/buffer.cc index 69c6d4bc58..f31f42c553 100644 --- a/test/mpi/buffer.cc +++ b/test/mpi/buffer.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -25,7 +25,7 @@ void init_buffer_host(void *buf, size_t count, int _value) } void init_buffer(void *_buf, size_t count, ucc_datatype_t dt, - ucc_memory_type_t mt, int value) + ucc_memory_type_t mt, int value, int offset) { void *buf = NULL; if (mt == UCC_MEMORY_TYPE_CUDA || mt == UCC_MEMORY_TYPE_ROCM) { @@ -37,6 +37,8 @@ void init_buffer(void *_buf, size_t count, ucc_datatype_t dt, std::cerr << "Unsupported mt\n"; MPI_Abort(MPI_COMM_WORLD, -1); } + + value += offset; switch(dt) { case UCC_DT_INT8: init_buffer_host(buf, count, value); diff --git a/test/mpi/main.cc b/test/mpi/main.cc index b719f366f0..f4a571fa14 100644 --- a/test/mpi/main.cc +++ b/test/mpi/main.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include "test_mpi.h" int test_rand_seed = -1; @@ -25,7 +26,7 @@ static std::vector colls = { UCC_COLL_TYPE_SCATTER, UCC_COLL_TYPE_SCATTERV}; static std::vector onesided_colls = { - UCC_COLL_TYPE_ALLTOALL}; + UCC_COLL_TYPE_ALLTOALL, UCC_COLL_TYPE_ALLTOALLV}; static std::vector mtypes = { UCC_MEMORY_TYPE_HOST}; @@ -82,7 +83,7 @@ static std::vector str_split(const char *value, const char *delimit return rst; } -void PrintHelp() +void print_help() { std::cout << "-c, --colls \n\tlist of collectives: " @@ -135,6 +136,23 @@ static ucc_test_mpi_team_t team_str_to_type(std::string team) throw std::string("incorrect team type: ") + team; } +static std::string team_type_to_str(ucc_test_mpi_team_t team) +{ + switch (team) { + case TEAM_WORLD: + return "world"; + case TEAM_SPLIT_HALF: + return "half"; + case TEAM_SPLIT_ODD_EVEN: + return "odd_even"; + case TEAM_REVERSE: + return "reverse"; + default: + break; + } + throw std::string("incorrect team type: "); +} + static ucc_coll_type_t coll_str_to_type(std::string coll) { if (coll == "barrier") { @@ -168,10 +186,8 @@ static ucc_coll_type_t coll_str_to_type(std::string coll) } else if (coll == "scatterv") { return UCC_COLL_TYPE_SCATTERV; } else { - std::cerr << "incorrect coll type: " << coll << std::endl; - PrintHelp(); + throw std::string("incorrect coll type: ") + coll; } - throw std::string("incorrect coll type: ") + coll; } static ucc_memory_type_t mtype_str_to_type(std::string mtype) @@ -394,18 +410,55 @@ int init_rand_seed(int user_seed) return seed; } -void PrintInfo() +void print_info() { int world_rank; - MPI_Comm_rank(MPI_COMM_WORLD, 
&world_rank); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); if (world_rank) { return; } - std::cout << "\n===== UCC MPI TEST INFO =======\n" - << " seed : " << std::to_string(test_rand_seed) << "\n" - << "===============================\n" - << std::endl; + + std::cout << "===== UCC MPI TEST INFO =======" << std::endl; + std::cout <<"seed: " << std::to_string(test_rand_seed) << std::endl; + std::cout <<"collectives: "; + for (const auto &c : colls) { + std::cout << ucc_coll_type_str(c); + if (c != colls.back()) { + std::cout << ", "; + } else { + std::cout << std::endl; + } + } + std::cout <<"data types: "; + for (const auto &d : dtypes) { + std::cout << ucc_datatype_str(d); + if (d != dtypes.back()) { + std::cout << ", "; + } else { + std::cout << std::endl; + } + } + + std::cout <<"memory types: "; + for (const auto &m : mtypes) { + std::cout << ucc_mem_type_str(m); + if (m != mtypes.back()) { + std::cout << ", "; + } else { + std::cout << std::endl; + } + } + + std::cout <<"teams: "; + for (const auto &t : teams) { + std::cout << team_type_to_str(t); + if (t != teams.back()) { + std::cout << ", "; + } else { + std::cout << std::endl; + } + } } void ProcessArgs(int argc, char** argv) @@ -521,8 +574,8 @@ void ProcessArgs(int argc, char** argv) int main(int argc, char *argv[]) { - int failed = 0; - int total_done_skipped_failed[4] = {0}; + int failed = 0; + int total_done_skipped_failed[ucc_ilog2(UCC_COLL_TYPE_LAST) + 1][4] = {0}; std::chrono::steady_clock::time_point begin; int size, required, provided, completed, rank; UccTestMpi *test; @@ -548,7 +601,7 @@ int main(int argc, char *argv[]) if (!err.empty() || show_help) { if (rank == 0) { std::cerr << "ParseArgs error:" << err << "\n\n"; - PrintHelp(); + print_help(); } goto mpi_exit; } @@ -589,7 +642,7 @@ int main(int argc, char *argv[]) test->set_max_size(test_max_size); test_rand_seed = init_rand_seed(test_rand_seed); - PrintInfo(); + print_info(); for (auto inpl : inplace) { for (auto pers : persistent) { @@ -625,19 +678,20 @@ int main(int argc, char *argv[]) } std::cout << std::flush; - total_done_skipped_failed[0] = test->results.size(); for (auto s : test->results) { - switch(s) { + int coll_num = ucc_ilog2(std::get<0>(s)); + switch(std::get<1>(s)) { case UCC_OK: - total_done_skipped_failed[1]++; + total_done_skipped_failed[coll_num][1]++; break; case UCC_ERR_NOT_IMPLEMENTED: case UCC_ERR_LAST: - total_done_skipped_failed[2]++; + total_done_skipped_failed[coll_num][2]++; break; default: - total_done_skipped_failed[3]++; + total_done_skipped_failed[coll_num][3]++; } + total_done_skipped_failed[coll_num][0]++; } MPI_Iallreduce(MPI_IN_PLACE, total_done_skipped_failed, sizeof(total_done_skipped_failed)/sizeof(int), @@ -650,21 +704,60 @@ int main(int argc, char *argv[]) if (0 == rank) { std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + ucc_coll_type_t coll_type; + int num_all = 0, num_skipped = 0, num_done =0, num_failed = 0; + std::ios iostate(nullptr); + + iostate.copyfmt(std::cout); std::cout << "\n===== UCC MPI TEST REPORT =====\n" << - " total tests : " << total_done_skipped_failed[0] << "\n" << - " passed : " << total_done_skipped_failed[1] << "\n" << - " skipped : " << total_done_skipped_failed[2] << "\n" << - " failed : " << total_done_skipped_failed[3] << "\n" << - " elapsed : " << + std::setw(22) << std::left << "collective" << + std::setw(10) << std::right << "tests" << + std::setw(10) << std::right << "passed" << + std::setw(10) << std::right << "failed" << + std::setw(10) << std::right << 
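/*
 * Why ucc_ilog2() can index the stats table above: UCC_COLL_TYPE_* values are
 * one-hot bit flags (the report loop below walks them with coll_type << 1),
 * so the log2 of a flag is a dense 0..N row index. Standalone sketch of that
 * mapping; the helper name is illustrative, not part of the patch:
 */
static inline int coll_type_to_row(unsigned flag)
{
    int row = 0;
    /* flag must be a power of two; shift until the set bit reaches bit 0 */
    while (flag >>= 1) {
        row++;
    }
    return row; /* e.g. coll_type_to_row(1u << 5) == 5 */
}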
"skipped" << std::endl; + + for (coll_type = (ucc_coll_type_t)1; + coll_type < UCC_COLL_TYPE_LAST; + coll_type = (ucc_coll_type_t)(coll_type << 1)) + { + int coll_num = ucc_ilog2(coll_type); + if (total_done_skipped_failed[coll_num][0] == 0) { + continue; + } + num_all += total_done_skipped_failed[coll_num][0]; + num_done += total_done_skipped_failed[coll_num][1]; + num_skipped += total_done_skipped_failed[coll_num][2]; + num_failed += total_done_skipped_failed[coll_num][3]; + std::cout << + std::setw(22) << std::left << ucc_coll_type_str(coll_type) << + std::setw(10) << std::right << total_done_skipped_failed[coll_num][0] << + std::setw(10) << std::right << total_done_skipped_failed[coll_num][1] << + std::setw(10) << std::right << total_done_skipped_failed[coll_num][3] << + std::setw(10) << std::right << total_done_skipped_failed[coll_num][2] << + std::endl; + + } + std::cout << + " \n===== UCC MPI TEST SUMMARY =====\n" << + "total tests: " << num_all << "\n" << + "passed: " << num_done << "\n" << + "skipped: " << num_skipped << "\n" << + "failed: " << num_failed << "\n" << + "elapsed: " << std::chrono::duration_cast(end - begin).count() << "s" << std::endl; + std::cout.copyfmt(iostate); /* check if all tests have been skipped */ - if (total_done_skipped_failed[0] == total_done_skipped_failed[2]) { + if (num_all == num_skipped) { std::cout << "\n All tests have been skipped, indicating most likely " "a problem\n"; failed = 1; } + + if (num_failed != 0) { + failed = 1; + } } test_exit: diff --git a/test/mpi/test_allgather.cc b/test/mpi/test_allgather.cc index 12b603e1cf..ebca8c4c95 100644 --- a/test/mpi/test_allgather.cc +++ b/test/mpi/test_allgather.cc @@ -53,6 +53,7 @@ ucc_status_t TestAllgather::set_input(int iter_persistent) int rank; void *buf, *check; + this->iter_persistent = iter_persistent; MPI_Comm_rank(team.comm, &rank); if (inplace) { buf = PTR_OFFSET(rbuf, rank * single_rank_size); @@ -70,18 +71,18 @@ ucc_status_t TestAllgather::set_input(int iter_persistent) ucc_status_t TestAllgather::check() { - int size, completed; + size_t dt_size, single_rank_count; + int size, i; + MPI_Comm_size(team.comm, &size); - size_t single_rank_count = args.dst.info.count / size; - MPI_Datatype mpi_dt = ucc_dt_to_mpi(dt); - MPI_Request req; + single_rank_count = args.dst.info.count / size; + dt_size = ucc_dt_size(dt); + for (i = 0; i < size; i++) { + init_buffer(PTR_OFFSET(check_buf, i * single_rank_count * dt_size), + single_rank_count, dt, UCC_MEMORY_TYPE_HOST, + i * (iter_persistent + 1)); + } - MPI_Iallgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, check_buf, - single_rank_count, mpi_dt, team.comm, &req); - do { - MPI_Test(&req, &completed, MPI_STATUS_IGNORE); - ucc_context_progress(team.ctx); - } while(!completed); return compare_buffers(rbuf, check_buf, single_rank_count * size, dt, mem_type); diff --git a/test/mpi/test_allgatherv.cc b/test/mpi/test_allgatherv.cc index 9554f1c616..a3bfa55d93 100644 --- a/test/mpi/test_allgatherv.cc +++ b/test/mpi/test_allgatherv.cc @@ -82,18 +82,17 @@ ucc_status_t TestAllgatherv::set_input(int iter_persistent) { size_t dt_size = ucc_dt_size(dt); int rank; - void *buf, *check; + void *buf; + this->iter_persistent = iter_persistent; MPI_Comm_rank(team.comm, &rank); if (inplace) { buf = PTR_OFFSET(rbuf, displacements[rank] * dt_size); } else { buf = sbuf; } - check = PTR_OFFSET(check_buf, displacements[rank] * dt_size); init_buffer(buf, counts[rank], dt, mem_type, rank * (iter_persistent + 1)); - UCC_CHECK(ucc_mc_memcpy(check, buf, counts[rank] * dt_size, - 
UCC_MEMORY_TYPE_HOST, mem_type)); + return UCC_OK; } @@ -108,23 +107,19 @@ TestAllgatherv::~TestAllgatherv() { ucc_status_t TestAllgatherv::check() { - MPI_Datatype mpi_dt = ucc_dt_to_mpi(dt); - int total_count = 0; - int size, rank, completed, i; - MPI_Request req; + int total_count = 0; + int size, i; MPI_Comm_size(team.comm, &size); - MPI_Comm_rank(team.comm, &rank); for (i = 0 ; i < size; i++) { total_count += counts[i]; } - MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, check_buf, - (int *)counts, (int *)displacements, mpi_dt, team.comm, - &req); - do { - MPI_Test(&req, &completed, MPI_STATUS_IGNORE); - ucc_context_progress(team.ctx); - } while(!completed); + + for (i = 0; i < size; i++) { + init_buffer(PTR_OFFSET(check_buf, displacements[i] * ucc_dt_size(dt)), + counts[i], dt, UCC_MEMORY_TYPE_HOST, + i * (iter_persistent + 1)); + } return compare_buffers(rbuf, check_buf, total_count, dt, mem_type); } diff --git a/test/mpi/test_alltoall.cc b/test/mpi/test_alltoall.cc index 7597353e6b..a92900265e 100644 --- a/test/mpi/test_alltoall.cc +++ b/test/mpi/test_alltoall.cc @@ -74,6 +74,7 @@ ucc_status_t TestAlltoall::set_input(int iter_persistent) void * buf; int rank, nprocs, completed; + this->iter_persistent = iter_persistent; MPI_Comm_rank(team.comm, &rank); MPI_Comm_size(team.comm, &nprocs); if (inplace) { @@ -99,19 +100,18 @@ ucc_status_t TestAlltoall::set_input(int iter_persistent) ucc_status_t TestAlltoall::check() { - int size, completed; - size_t single_rank_count; - MPI_Request req; + int size, rank, i; + size_t single_rank_count; + MPI_Comm_rank(team.comm, &rank); MPI_Comm_size(team.comm, &size); single_rank_count = args.src.info.count / size; - MPI_Ialltoall(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, check_buf, - single_rank_count, ucc_dt_to_mpi(dt), team.comm, &req); - do { - MPI_Test(&req, &completed, MPI_STATUS_IGNORE); - ucc_context_progress(team.ctx); - } while(!completed); + for (i = 0; i < size; i++) { + init_buffer(PTR_OFFSET(check_buf, i * single_rank_count * ucc_dt_size(dt)), + single_rank_count, dt, UCC_MEMORY_TYPE_HOST, + i * (iter_persistent + 1), single_rank_count * rank); + } return compare_buffers(rbuf, check_buf, single_rank_count * size, dt, mem_type); diff --git a/test/mpi/test_alltoallv.cc b/test/mpi/test_alltoallv.cc index c939ea3968..aaa65b7e10 100644 --- a/test/mpi/test_alltoallv.cc +++ b/test/mpi/test_alltoallv.cc @@ -25,22 +25,26 @@ TestAlltoallv::TestAlltoallv(ucc_test_team_t &_team, TestCaseParams ¶ms) : std::default_random_engine eng; size_t dt_size, count; int rank, nprocs, rank_count; - - dt = params.dt; - dt_size = ucc_dt_size(dt); - count = msgsize / dt_size; - sncounts = 0; - rncounts = 0; - scounts = NULL; - sdispls = NULL; - rcounts = NULL; - rdispls = NULL; - scounts64 = NULL; - sdispls64 = NULL; - rcounts64 = NULL; - rdispls64 = NULL; - count_bits = params.count_bits; - displ_bits = params.displ_bits; + bool is_onesided; + void *work_buf; + + dt = params.dt; + dt_size = ucc_dt_size(dt); + count = msgsize / dt_size; + sncounts = 0; + rncounts = 0; + scounts = NULL; + sdispls = NULL; + rcounts = NULL; + rdispls = NULL; + scounts64 = NULL; + sdispls64 = NULL; + rcounts64 = NULL; + rdispls64 = NULL; + count_bits = params.count_bits; + displ_bits = params.displ_bits; + is_onesided = (params.buffers != NULL); + work_buf = NULL; std::uniform_int_distribution urd(count / 2, count); eng.seed(test_rand_seed); @@ -56,6 +60,10 @@ TestAlltoallv::TestAlltoallv(ucc_test_team_t &_team, TestCaseParams ¶ms) : args.mask = UCC_COLL_ARGS_FIELD_FLAGS; args.flags |= 
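/*
 * Why TestAlltoall::check() above passes an offset: the block this rank
 * receives from peer i is the rank-th slice of peer i's send buffer, and
 * init_buffer() fills a pattern that advances per element, so the expected
 * slice is peer i's seed advanced by single_rank_count * rank elements.
 * Because init_buffer() simply applies `value += offset`, these two calls
 * fill identical bytes (sketch, assuming int32 data on host memory):
 */
static void offset_equivalence_demo(int32_t *a, int32_t *b, size_t n)
{
    init_buffer(a, n, UCC_DT_INT32, UCC_MEMORY_TYPE_HOST, 3, 5);
    init_buffer(b, n, UCC_DT_INT32, UCC_MEMORY_TYPE_HOST, 8);
    /* memcmp(a, b, n * sizeof(*a)) == 0 */
}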
UCC_COLL_ARGS_FLAG_CONTIG_SRC_BUFFER | UCC_COLL_ARGS_FLAG_CONTIG_DST_BUFFER; + if (is_onesided) { + args.mask |= UCC_COLL_ARGS_FIELD_GLOBAL_WORK_BUFFER; + args.flags |= UCC_COLL_ARGS_FLAG_MEM_MAPPED_BUFFERS; + } if (count_bits == TEST_FLAG_VSIZE_64BIT) { args.flags |= UCC_COLL_ARGS_FLAG_COUNT_64BIT; } @@ -92,14 +100,21 @@ TestAlltoallv::TestAlltoallv(ucc_test_team_t &_team, TestCaseParams ¶ms) : if (TEST_SKIP_NONE != skip_reduce(test_skip, team.comm)) { return; } - - UCC_CHECK(ucc_mc_alloc(&sbuf_mc_header, sncounts * dt_size, mem_type)); - UCC_CHECK(ucc_mc_alloc(&rbuf_mc_header, rncounts * dt_size, mem_type)); - sbuf = sbuf_mc_header->addr; - rbuf = rbuf_mc_header->addr; - check_buf = ucc_malloc((sncounts + rncounts) * dt_size, "check buf"); + check_buf = ucc_malloc(rncounts * dt_size, "check buf"); UCC_MALLOC_CHECK(check_buf); + if (!is_onesided) { + UCC_CHECK(ucc_mc_alloc(&sbuf_mc_header, sncounts * dt_size, mem_type)); + UCC_CHECK(ucc_mc_alloc(&rbuf_mc_header, rncounts * dt_size, mem_type)); + sbuf = sbuf_mc_header->addr; + rbuf = rbuf_mc_header->addr; + } else { + sbuf = params.buffers[MEM_SEND_SEGMENT]; + rbuf = params.buffers[MEM_RECV_SEGMENT]; + work_buf = params.buffers[MEM_WORK_SEGMENT]; + args.global_work_buffer = work_buf; + } + args.src.info_v.buffer = sbuf; args.src.info_v.datatype = dt; args.src.info_v.mem_type = mem_type; @@ -140,19 +155,40 @@ TestAlltoallv::TestAlltoallv(ucc_test_team_t &_team, TestCaseParams ¶ms) : args.src.info_v.displacements = (ucc_aint_t*)sdispls; args.dst.info_v.displacements = (ucc_aint_t*)rdispls; } + if (is_onesided) { + MPI_Datatype datatype; + size_t disp_size; + void *ldisp; + int alltoall_status; + + if (TEST_FLAG_VSIZE_64BIT == displ_bits) { + datatype = MPI_LONG; + disp_size = sizeof(uint64_t); + } else { + datatype = MPI_INT; + disp_size = sizeof(uint32_t); + } + ldisp = ucc_calloc(nprocs, disp_size, "displacements"); + UCC_MALLOC_CHECK(ldisp); + alltoall_status = MPI_Alltoall(args.dst.info_v.displacements, 1, + datatype, ldisp, 1, datatype, team.comm); + if (MPI_SUCCESS != alltoall_status) { + std::cerr << "*** MPI ALLTOALL FAILED" << std::endl; + MPI_Abort(MPI_COMM_WORLD, -1); + } + args.dst.info_v.displacements = (ucc_aint_t *)ldisp; + } UCC_CHECK(set_input()); UCC_CHECK_SKIP(ucc_collective_init(&args, &req, team.team), test_skip); } ucc_status_t TestAlltoallv::set_input(int iter_persistent) { - size_t dt_size = ucc_dt_size(dt); - int rank; + int rank; + this->iter_persistent = iter_persistent; MPI_Comm_rank(team.comm, &rank); init_buffer(sbuf, sncounts, dt, mem_type, rank * (iter_persistent + 1)); - UCC_CHECK(ucc_mc_memcpy(check_buf, sbuf, sncounts * dt_size, - UCC_MEMORY_TYPE_HOST, mem_type)); return UCC_OK; } @@ -171,20 +207,25 @@ TestAlltoallv::~TestAlltoallv() ucc_status_t TestAlltoallv::check() { - size_t dt_size = ucc_dt_size(dt); MPI_Request req; - int completed; - void *check; + int i, size, rank, completed; + + MPI_Comm_size(team.comm, &size); + MPI_Comm_rank(team.comm, &rank); - check = PTR_OFFSET(check_buf, sncounts * dt_size); - MPI_Ialltoallv(check_buf, scounts, sdispls, ucc_dt_to_mpi(dt), check, - rcounts, rdispls, ucc_dt_to_mpi(dt), team.comm, &req); + MPI_Ialltoall(sdispls, 1, MPI_INT, scounts, 1, MPI_INT, team.comm, &req); do { MPI_Test(&req, &completed, MPI_STATUS_IGNORE); ucc_context_progress(team.ctx); } while(!completed); - return compare_buffers(rbuf, check, rncounts, dt, mem_type); + for (i = 0; i < size; i++) { + init_buffer(PTR_OFFSET(check_buf, rdispls[i] * ucc_dt_size(dt)), + rcounts[i], dt, 
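/*
 * For the one-sided alltoallv path above, each rank must know the
 * displacement at which every peer expects its data, so the destination
 * displacement table is transposed across the team with MPI_Alltoall before
 * ucc_collective_init(). Standalone sketch of that exchange (32-bit
 * displacements; the 64-bit path uses MPI_LONG the same way):
 */
static void exchange_displs(const int *my_dst_displs, int *peer_displs,
                            MPI_Comm comm)
{
    /* afterwards, peer_displs[i] is the displacement rank i assigned to us */
    if (MPI_SUCCESS != MPI_Alltoall(my_dst_displs, 1, MPI_INT,
                                    peer_displs, 1, MPI_INT, comm)) {
        MPI_Abort(MPI_COMM_WORLD, -1);
    }
}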
 UCC_MEMORY_TYPE_HOST,
+                    i * (iter_persistent + 1), scounts[i]);
+    }
+
+    return compare_buffers(rbuf, check_buf, rncounts, dt, mem_type);
 }

 std::string TestAlltoallv::str()
diff --git a/test/mpi/test_bcast.cc b/test/mpi/test_bcast.cc
index 1a541bcfee..080cbb436f 100644
--- a/test/mpi/test_bcast.cc
+++ b/test/mpi/test_bcast.cc
@@ -45,6 +45,7 @@ ucc_status_t TestBcast::set_input(int iter_persistent)
     size_t count = msgsize / dt_size;
     int rank;

+    this->iter_persistent = iter_persistent;
     MPI_Comm_rank(team.comm, &rank);
     if (rank == root) {
         init_buffer(sbuf, count, dt, mem_type, rank * (iter_persistent + 1));
@@ -56,18 +57,12 @@ ucc_status_t TestBcast::check()
 {
-    size_t count = args.src.info.count;
-    MPI_Datatype mpi_dt = ucc_dt_to_mpi(dt);
-    int rank, completed;
-    MPI_Request req;
+    size_t count = args.src.info.count;
+    int rank;

     MPI_Comm_rank(team.comm, &rank);
-    MPI_Ibcast(check_buf, count, mpi_dt, root, team.comm, &req);
-    do {
-        MPI_Test(&req, &completed, MPI_STATUS_IGNORE);
-        ucc_context_progress(team.ctx);
-    } while(!completed);
-
+    init_buffer(check_buf, count, dt, UCC_MEMORY_TYPE_HOST,
+                root * (iter_persistent + 1));
     return (rank == root) ? UCC_OK :
         compare_buffers(sbuf, check_buf, count, dt, mem_type);
diff --git a/test/mpi/test_case.cc b/test/mpi/test_case.cc
index 43ad770e6e..7a37c2ec9c 100644
--- a/test/mpi/test_case.cc
+++ b/test/mpi/test_case.cc
@@ -149,7 +149,14 @@ test_skip_cause_t TestCase::skip_reduce(int skip_cond, test_skip_cause_t cause,
 {
     test_skip_cause_t test_skip;
     test_skip_cause_t skip = skip_cond ? cause : TestCase::test_skip;
-    MPI_Allreduce((void*)&skip, (void*)&test_skip, 1, MPI_INT, MPI_MAX, comm);
+    MPI_Request req;
+    int completed;
+
+    MPI_Iallreduce((void*)&skip, (void*)&test_skip, 1, MPI_INT, MPI_MAX, comm, &req);
+    do {
+        MPI_Test(&req, &completed, MPI_STATUS_IGNORE);
+        tc_progress_ctx();
+    } while(!completed);
     TestCase::test_skip = test_skip;
     return test_skip;
 }
diff --git a/test/mpi/test_mpi.cc b/test/mpi/test_mpi.cc
index 1d89779046..147ce1fd7d 100644
--- a/test/mpi/test_mpi.cc
+++ b/test/mpi/test_mpi.cc
@@ -96,7 +96,7 @@ UccTestMpi::UccTestMpi(int argc, char *argv[], ucc_thread_mode_t _tm,
     ucc_context_config_release(ctx_config);
     if (with_onesided) {
         prev_env = getenv("UCC_TL_UCP_TUNE");
-        setenv("UCC_TL_UCP_TUNE", "alltoall:0-inf:@onesided", 1);
+        setenv("UCC_TL_UCP_TUNE", "alltoall:0-inf:@onesided#alltoallv:0-inf:@onesided", 1);
         UCC_CHECK(ucc_lib_config_read(NULL, NULL, &lib_config));
         UCC_CHECK(ucc_init(&lib_params, lib_config, &onesided_lib));
         ucc_lib_config_release(lib_config);
@@ -474,7 +474,7 @@ void set_gpu_device(test_set_gpu_device_t set_device)
 #endif

-std::vector<ucc_status_t> UccTestMpi::exec_tests(
+std::vector<ucc_test_mpi_result_t> UccTestMpi::exec_tests(
     std::vector<std::shared_ptr<TestCase>> tcs, bool triggered,
     bool persistent)
 {
@@ -483,7 +483,7 @@ std::vector<ucc_status_t> UccTestMpi::exec_tests(
     ucc_status_t status;
     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

-    std::vector<ucc_status_t> rst;
+    std::vector<ucc_test_mpi_result_t> rst;

     for (i = 0; i < n_persistent; i++) {
         for (auto tc: tcs) {
@@ -501,7 +501,7 @@ std::vector<ucc_status_t> UccTestMpi::exec_tests(
                 std::cout << "SKIPPED: " << skip_str(tc->test_skip) << ": "
                           << tc->str() << " " << std::endl;
             }
-            rst.push_back(UCC_ERR_LAST);
+            rst.push_back(std::make_tuple(tc->args.coll_type, UCC_ERR_LAST));
             return rst;
         }
     }
@@ -528,14 +528,14 @@ std::vector<ucc_status_t> UccTestMpi::exec_tests(
             if (UCC_OK != status) {
                 std::cerr << "FAILURE in: " << tc->str() << std::endl;
             }
-            rst.push_back(status);
+            rst.push_back(std::make_tuple(tc->args.coll_type, status));
         }
     }
     return rst;
 }
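/*
 * With exec_tests() returning (coll_type, status) tuples, the report in
 * main.cc reduces to a tally into one row per collective. Sketch of that
 * accumulation, using the same column order as total_done_skipped_failed
 * (total, passed, skipped, failed) and assuming UCC test headers:
 */
static void tally_results(
    const std::vector<std::tuple<ucc_coll_type_t, ucc_status_t>> &results,
    int stats[][4])
{
    for (const auto &r : results) {
        int          row = ucc_ilog2(std::get<0>(r)); /* one-hot -> row index */
        ucc_status_t st  = std::get<1>(r);
        stats[row][0]++;
        if (st == UCC_OK) {
            stats[row][1]++;
        } else if (st == UCC_ERR_NOT_IMPLEMENTED || st == UCC_ERR_LAST) {
            stats[row][2]++; /* counted as skipped */
        } else {
            stats[row][3]++;
        }
    }
}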
 void UccTestMpi::run_all_at_team(ucc_test_team_t &team,
-                                 std::vector<ucc_status_t> &rst)
+                                 std::vector<ucc_test_mpi_result_t> &rst)
 {
     TestCaseParams params;
@@ -586,11 +586,13 @@ void UccTestMpi::run_all_at_team(ucc_test_team_t &team,
     for (auto r : roots) {
         for (auto mt: test_memtypes) {
             if (triggered && !ucc_coll_triggered_supported(mt)) {
-                rst.push_back(UCC_ERR_NOT_IMPLEMENTED);
+                rst.push_back(std::make_tuple(c, UCC_ERR_NOT_IMPLEMENTED));
                 continue;
             }
-            if (c == UCC_COLL_TYPE_ALLTOALL && team.ctx != ctx) {
+            if ((c == UCC_COLL_TYPE_ALLTOALL ||
+                 c == UCC_COLL_TYPE_ALLTOALLV) &&
+                team.ctx != ctx) {
                 /* onesided alltoall */
                 if (mt != UCC_MEMORY_TYPE_HOST) {
                     continue;
@@ -640,10 +642,10 @@ void UccTestMpi::run_all_at_team(ucc_test_team_t &team,
 }

 typedef struct ucc_test_thread {
-    pthread_t                 thread;
-    int                       id;
-    UccTestMpi *              test;
-    std::vector<ucc_status_t> rst;
+    pthread_t                          thread;
+    int                                id;
+    UccTestMpi *                       test;
+    std::vector<ucc_test_mpi_result_t> rst;
 } ucc_test_thread_t;

 static void *thread_start(void *arg)
diff --git a/test/mpi/test_mpi.h b/test/mpi/test_mpi.h
index fcc11b544b..391cb21996 100644
--- a/test/mpi/test_mpi.h
+++ b/test/mpi/test_mpi.h
@@ -145,7 +145,6 @@ static inline const char* skip_str(test_skip_cause_t s) {
     default:
         return "unknown";
     }
-    return NULL;
 }

 static inline const char* team_str(ucc_test_mpi_team_t t) {
@@ -268,7 +267,6 @@ class TestCase {
     size_t msgsize;
     bool inplace;
     bool persistent;
-    ucc_coll_args_t args;
     ucc_coll_req_h req;
     ucc_mc_buffer_header_t *sbuf_mc_header, *rbuf_mc_header;
     void *sbuf;
@@ -278,8 +276,9 @@ class TestCase {
     uint8_t progress_buf[1];
     size_t test_max_size;
     ucc_datatype_t dt;
-
+    int iter_persistent;
 public:
+    ucc_coll_args_t args;
     void mpi_progress(void);
     test_skip_cause_t test_skip;
     static std::shared_ptr<TestCase> init_single(
@@ -305,6 +304,7 @@ class TestCase {
                                   MPI_Comm comm);
 };

+typedef std::tuple<ucc_coll_type_t, ucc_status_t> ucc_test_mpi_result_t;
 class UccTestMpi {
     ucc_thread_mode_t tm;
     ucc_context_h ctx;
@@ -332,14 +332,15 @@ class UccTestMpi {
     std::vector gen_roots(ucc_test_team_t &team);
     std::vector counts_vsize;
     std::vector displs_vsize;
-    std::vector<ucc_status_t> exec_tests(
+    std::vector<ucc_test_mpi_result_t> exec_tests(
         std::vector<std::shared_ptr<TestCase>> tcs, bool triggered,
         bool persistent);
 public:
     std::vector<ucc_test_team_t> teams;
     std::vector<ucc_test_team_t> onesided_teams;
-    void run_all_at_team(ucc_test_team_t &team, std::vector<ucc_status_t> &rst);
-    std::vector<ucc_status_t> results;
+    void run_all_at_team(ucc_test_team_t &team,
+                         std::vector<ucc_test_mpi_result_t> &rst);
+    std::vector<ucc_test_mpi_result_t> results;
     UccTestMpi(int argc, char *argv[], ucc_thread_mode_t tm, int is_local,
                bool with_onesided);
     ~UccTestMpi();
@@ -379,6 +380,9 @@ class UccTestMpi {
                    bool is_onesided = false);
     void progress_ctx() {
         ucc_context_progress(ctx);
+        if (onesided_ctx) {
+            ucc_context_progress(onesided_ctx);
+        }
     }
 };
@@ -523,7 +527,7 @@ class TestScatterv : public TestCase {
 };

 void init_buffer(void *buf, size_t count, ucc_datatype_t dt,
-                 ucc_memory_type_t mt, int value);
+                 ucc_memory_type_t mt, int value, int offset = 0);
 ucc_status_t compare_buffers(void *rst, void *expected, size_t count,
                              ucc_datatype_t dt, ucc_memory_type_t mt);
diff --git a/tools/perf/ucc_pt_benchmark.cc b/tools/perf/ucc_pt_benchmark.cc
index c4ef8c6289..cbaa5d664a 100644
--- a/tools/perf/ucc_pt_benchmark.cc
+++ b/tools/perf/ucc_pt_benchmark.cc
@@ -18,54 +18,61 @@ ucc_pt_benchmark::ucc_pt_benchmark(ucc_pt_benchmark_config cfg,
 {
     switch (cfg.op_type) {
     case UCC_PT_OP_TYPE_ALLGATHER:
-        coll = new ucc_pt_coll_allgather(cfg.dt, cfg.mt, cfg.inplace, comm);
+        coll = new ucc_pt_coll_allgather(cfg.dt, cfg.mt, cfg.inplace,
+                                         cfg.persistent, comm);
         break;
     case UCC_PT_OP_TYPE_ALLGATHERV:
-        coll = new ucc_pt_coll_allgatherv(cfg.dt, cfg.mt,
cfg.inplace, comm); + coll = new ucc_pt_coll_allgatherv(cfg.dt, cfg.mt, cfg.inplace, + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_ALLREDUCE: coll = new ucc_pt_coll_allreduce(cfg.dt, cfg.mt, cfg.op, cfg.inplace, - comm); + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_ALLTOALL: - coll = new ucc_pt_coll_alltoall(cfg.dt, cfg.mt, cfg.inplace, comm); + coll = new ucc_pt_coll_alltoall(cfg.dt, cfg.mt, cfg.inplace, + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_ALLTOALLV: - coll = new ucc_pt_coll_alltoallv(cfg.dt, cfg.mt, cfg.inplace, comm); + coll = new ucc_pt_coll_alltoallv(cfg.dt, cfg.mt, cfg.inplace, + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_BARRIER: coll = new ucc_pt_coll_barrier(comm); break; case UCC_PT_OP_TYPE_BCAST: - coll = new ucc_pt_coll_bcast(cfg.dt, cfg.mt, cfg.root_shift, comm); + coll = new ucc_pt_coll_bcast(cfg.dt, cfg.mt, cfg.root_shift, + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_GATHER: coll = new ucc_pt_coll_gather(cfg.dt, cfg.mt, cfg.inplace, - cfg.root_shift, comm); + cfg.persistent, cfg.root_shift, comm); break; case UCC_PT_OP_TYPE_GATHERV: coll = new ucc_pt_coll_gatherv(cfg.dt, cfg.mt, cfg.inplace, - cfg.root_shift, comm); + cfg.persistent, cfg.root_shift, comm); break; case UCC_PT_OP_TYPE_REDUCE: coll = new ucc_pt_coll_reduce(cfg.dt, cfg.mt, cfg.op, cfg.inplace, - cfg.root_shift, comm); + cfg.persistent, cfg.root_shift, comm); break; case UCC_PT_OP_TYPE_REDUCE_SCATTER: coll = new ucc_pt_coll_reduce_scatter(cfg.dt, cfg.mt, cfg.op, - cfg.inplace, comm); + cfg.inplace, + cfg.persistent, comm); break; case UCC_PT_OP_TYPE_REDUCE_SCATTERV: coll = new ucc_pt_coll_reduce_scatterv(cfg.dt, cfg.mt, cfg.op, - cfg.inplace, comm); + cfg.inplace, cfg.persistent, + comm); break; case UCC_PT_OP_TYPE_SCATTER: coll = new ucc_pt_coll_scatter(cfg.dt, cfg.mt, cfg.inplace, - cfg.root_shift, comm); + cfg.persistent, cfg.root_shift, comm); break; case UCC_PT_OP_TYPE_SCATTERV: coll = new ucc_pt_coll_scatterv(cfg.dt, cfg.mt, cfg.inplace, - cfg.root_shift, comm); + cfg.persistent, cfg.root_shift, comm); break; case UCC_PT_OP_TYPE_MEMCPY: coll = new ucc_pt_op_memcpy(cfg.dt, cfg.mt, cfg.n_bufs, comm); @@ -137,10 +144,11 @@ ucc_status_t ucc_pt_benchmark::run_single_coll_test(ucc_coll_args_t args, double &time) noexcept { - const bool triggered = config.triggered; - ucc_team_h team = comm->get_team(); - ucc_context_h ctx = comm->get_context(); - ucc_status_t st = UCC_OK; + const bool triggered = config.triggered; + const bool persistent = config.persistent; + ucc_team_h team = comm->get_team(); + ucc_context_h ctx = comm->get_context(); + ucc_status_t st = UCC_OK; ucc_coll_req_h req; ucc_ee_h ee; ucc_ev_t comp_ev, *post_ev; @@ -161,10 +169,18 @@ ucc_status_t ucc_pt_benchmark::run_single_coll_test(ucc_coll_args_t args, comp_ev.ev_context_size = 0; } + if (persistent) { + UCCCHECK_GOTO(ucc_collective_init(&args, &req, team), exit_err, st); + } + args.root = config.root % comm->get_size(); for (int i = 0; i < nwarmup + niter; i++) { double s = get_time_us(); - UCCCHECK_GOTO(ucc_collective_init(&args, &req, team), exit_err, st); + + if (!persistent) { + UCCCHECK_GOTO(ucc_collective_init(&args, &req, team), exit_err, st); + } + if (triggered) { comp_ev.req = req; UCCCHECK_GOTO(ucc_collective_triggered_post(ee, &comp_ev), @@ -175,12 +191,16 @@ ucc_status_t ucc_pt_benchmark::run_single_coll_test(ucc_coll_args_t args, } else { UCCCHECK_GOTO(ucc_collective_post(req), free_req, st); } + st = ucc_collective_test(req); while (st > 0) { UCCCHECK_GOTO(ucc_context_progress(ctx), 
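/*
 * The effect of -p on the measured region in run_single_coll_test(): with a
 * persistent request, ucc_collective_init()/finalize() happen once outside
 * the loop, so each iteration times only post + completion. Skeleton of one
 * timed iteration (error handling elided; get_time_us() as used above):
 */
static double time_persistent_iter(ucc_coll_req_h req, ucc_context_h ctx)
{
    double start = get_time_us();
    ucc_collective_post(req);            /* re-post the same request */
    while (ucc_collective_test(req) > 0) {
        ucc_context_progress(ctx);       /* drive UCC until completion */
    }
    return get_time_us() - start;
}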
free_req, st); st = ucc_collective_test(req); } - ucc_collective_finalize(req); + + if (!persistent) { + ucc_collective_finalize(req); + } double f = get_time_us(); if (st != UCC_OK) { goto exit_err; @@ -191,6 +211,11 @@ ucc_status_t ucc_pt_benchmark::run_single_coll_test(ucc_coll_args_t args, args.root = (args.root + config.root_shift) % comm->get_size(); UCCCHECK_GOTO(comm->barrier(), exit_err, st); } + + if (persistent) { + ucc_collective_finalize(req); + } + if (niter != 0) { time /= niter; } diff --git a/tools/perf/ucc_pt_coll.cc b/tools/perf/ucc_pt_coll.cc index a561ea73b4..e013615dd8 100644 --- a/tools/perf/ucc_pt_coll.cc +++ b/tools/perf/ucc_pt_coll.cc @@ -5,11 +5,56 @@ */ #include "ucc_pt_coll.h" +#include "ucc_pt_cuda.h" +#include "utils/ucc_malloc.h" ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_memory_type_t mem_type) { ucc_status_t status; + int cuda_st; + + switch (mem_type) { + case UCC_MEMORY_TYPE_CUDA: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA; + cuda_st = ucc_pt_cudaMalloc(&((*h_ptr)->addr), len); + if (cuda_st != 0) { + return UCC_ERR_NO_MEMORY; + } + cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); + if (cuda_st != 0) { + ucc_pt_cudaFree((*h_ptr)->addr); + delete *h_ptr; + return UCC_ERR_NO_MEMORY; + } + return UCC_OK; + case UCC_MEMORY_TYPE_CUDA_MANAGED: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA_MANAGED; + cuda_st = ucc_pt_cudaMallocManaged(&((*h_ptr)->addr), len); + if (cuda_st != 0) { + return UCC_ERR_NO_MEMORY; + } + cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); + if (cuda_st != 0) { + ucc_pt_cudaFree((*h_ptr)->addr); + delete *h_ptr; + return UCC_ERR_NO_MEMORY; + } + return UCC_OK; + case UCC_MEMORY_TYPE_HOST: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_HOST; + (*h_ptr)->addr = ucc_malloc(len, "perftest data"); + if (!((*h_ptr)->addr)) { + return UCC_ERR_NO_MEMORY; + } + memset((*h_ptr)->addr, 0, len); + return UCC_OK; + default: + break; + } status = ucc_mc_alloc(h_ptr, len, mem_type); if (status != UCC_OK) { @@ -26,6 +71,20 @@ ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_status_t ucc_pt_free(ucc_mc_buffer_header_t *h_ptr) { + switch (h_ptr->mt) { + case UCC_MEMORY_TYPE_CUDA: + case UCC_MEMORY_TYPE_CUDA_MANAGED: + ucc_pt_cudaFree(h_ptr->addr); + delete h_ptr; + return UCC_OK; + case UCC_MEMORY_TYPE_HOST: + ucc_free(h_ptr->addr); + delete h_ptr; + return UCC_OK; + default: + break; + } + return ucc_mc_free(h_ptr); } diff --git a/tools/perf/ucc_pt_coll.h b/tools/perf/ucc_pt_coll.h index 63afc9bd9e..0b92039fab 100644 --- a/tools/perf/ucc_pt_coll.h +++ b/tools/perf/ucc_pt_coll.h @@ -58,7 +58,8 @@ class ucc_pt_coll { class ucc_pt_coll_allgather: public ucc_pt_coll { public: ucc_pt_coll_allgather(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, ucc_pt_comm *communicator); + bool is_inplace, bool is_persistent, + ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -67,7 +68,8 @@ class ucc_pt_coll_allgather: public ucc_pt_coll { class ucc_pt_coll_allgatherv: public ucc_pt_coll { public: ucc_pt_coll_allgatherv(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, ucc_pt_comm *communicator); + bool is_inplace, bool is_persistent, + ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) 
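/*
 * ucc_pt_alloc()/ucc_pt_free() above now handle CUDA, CUDA-managed, and host
 * memory directly and fall back to ucc_mc_alloc()/ucc_mc_free() only for
 * other memory types. Every branch follows one shape; condensed sketch of
 * the CUDA case (this sketch also deletes the header when the allocation
 * itself fails, which the patch leaves to the caller):
 */
static ucc_status_t pt_alloc_cuda(ucc_mc_buffer_header_t **h, size_t len)
{
    *h = new ucc_mc_buffer_header_t;
    (*h)->mt = UCC_MEMORY_TYPE_CUDA;
    if (ucc_pt_cudaMalloc(&(*h)->addr, len) != 0) {
        delete *h;
        return UCC_ERR_NO_MEMORY;
    }
    if (ucc_pt_cudaMemset((*h)->addr, 0, len) != 0) {
        ucc_pt_cudaFree((*h)->addr); /* roll back the device allocation */
        delete *h;
        return UCC_ERR_NO_MEMORY;
    }
    return UCC_OK;
}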
override; void free_args(ucc_pt_test_args_t &args) override; }; @@ -76,7 +78,7 @@ class ucc_pt_coll_allreduce: public ucc_pt_coll { public: ucc_pt_coll_allreduce(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, bool is_inplace, - ucc_pt_comm *communicator); + bool is_persistent, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -85,7 +87,8 @@ class ucc_pt_coll_allreduce: public ucc_pt_coll { class ucc_pt_coll_alltoall: public ucc_pt_coll { public: ucc_pt_coll_alltoall(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, ucc_pt_comm *communicator); + bool is_inplace, bool is_persistent, + ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -94,7 +97,8 @@ class ucc_pt_coll_alltoall: public ucc_pt_coll { class ucc_pt_coll_alltoallv: public ucc_pt_coll { public: ucc_pt_coll_alltoallv(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, ucc_pt_comm *communicator); + bool is_inplace, bool is_persistent, + ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; }; @@ -109,7 +113,7 @@ class ucc_pt_coll_barrier: public ucc_pt_coll { class ucc_pt_coll_bcast: public ucc_pt_coll { public: ucc_pt_coll_bcast(ucc_datatype_t dt, ucc_memory_type mt, int root_shift, - ucc_pt_comm *communicator); + bool is_persistent, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -118,7 +122,7 @@ class ucc_pt_coll_bcast: public ucc_pt_coll { class ucc_pt_coll_gather: public ucc_pt_coll { public: ucc_pt_coll_gather(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, int root_shift, + bool is_inplace, bool is_persistent, int root_shift, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; @@ -128,7 +132,7 @@ class ucc_pt_coll_gather: public ucc_pt_coll { class ucc_pt_coll_gatherv: public ucc_pt_coll { public: ucc_pt_coll_gatherv(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, int root_shift, + bool is_inplace, bool is_persistent, int root_shift, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; @@ -137,8 +141,8 @@ class ucc_pt_coll_gatherv: public ucc_pt_coll { class ucc_pt_coll_reduce: public ucc_pt_coll { public: ucc_pt_coll_reduce(ucc_datatype_t dt, ucc_memory_type mt, - ucc_reduction_op_t op, bool is_inplace, int root_shift, - ucc_pt_comm *communicator); + ucc_reduction_op_t op, bool is_inplace, bool is_persistent, + int root_shift, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -148,7 +152,7 @@ class ucc_pt_coll_reduce_scatter: public ucc_pt_coll { public: ucc_pt_coll_reduce_scatter(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, bool is_inplace, - ucc_pt_comm 
*communicator); + bool is_persistent, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; float get_bw(float time_ms, int grsize, ucc_pt_test_args_t args) override; @@ -158,7 +162,7 @@ class ucc_pt_coll_reduce_scatterv: public ucc_pt_coll { public: ucc_pt_coll_reduce_scatterv(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, bool is_inplace, - ucc_pt_comm *communicator); + bool is_persistent, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; }; @@ -166,7 +170,7 @@ class ucc_pt_coll_reduce_scatterv: public ucc_pt_coll { class ucc_pt_coll_scatter: public ucc_pt_coll { public: ucc_pt_coll_scatter(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, int root_shift, + bool is_inplace, bool is_persistent, int root_shift, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; @@ -176,7 +180,7 @@ class ucc_pt_coll_scatter: public ucc_pt_coll { class ucc_pt_coll_scatterv: public ucc_pt_coll { public: ucc_pt_coll_scatterv(ucc_datatype_t dt, ucc_memory_type mt, - bool is_inplace, int root_shift, + bool is_inplace, bool is_persistent, int root_shift, ucc_pt_comm *communicator); ucc_status_t init_args(size_t count, ucc_pt_test_args_t &args) override; void free_args(ucc_pt_test_args_t &args) override; diff --git a/tools/perf/ucc_pt_coll_allgather.cc b/tools/perf/ucc_pt_coll_allgather.cc index 76e6084032..b8185dd9e8 100644 --- a/tools/perf/ucc_pt_coll_allgather.cc +++ b/tools/perf/ucc_pt_coll_allgather.cc @@ -12,6 +12,7 @@ ucc_pt_coll_allgather::ucc_pt_coll_allgather(ucc_datatype_t dt, ucc_memory_type mt, bool is_inplace, + bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { @@ -21,16 +22,23 @@ ucc_pt_coll_allgather::ucc_pt_coll_allgather(ucc_datatype_t dt, has_bw_ = true; root_shift_ = 0; - coll_args.mask = 0; - coll_args.coll_type = UCC_COLL_TYPE_ALLGATHER; + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_ALLGATHER; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_allgather::init_args(size_t single_rank_count, diff --git a/tools/perf/ucc_pt_coll_allgatherv.cc b/tools/perf/ucc_pt_coll_allgatherv.cc index 8642322c64..c6c18a7c5a 100644 --- a/tools/perf/ucc_pt_coll_allgatherv.cc +++ b/tools/perf/ucc_pt_coll_allgatherv.cc @@ -12,6 +12,7 @@ ucc_pt_coll_allgatherv::ucc_pt_coll_allgatherv(ucc_datatype_t dt, ucc_memory_type mt, bool is_inplace, + bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -20,16 +21,23 @@ ucc_pt_coll_allgatherv::ucc_pt_coll_allgatherv(ucc_datatype_t dt, has_bw_ = false; root_shift_ = 0; - coll_args.mask = 0; - coll_args.coll_type = UCC_COLL_TYPE_ALLGATHERV; - coll_args.src.info.datatype = dt; - coll_args.src.info.mem_type = mt; + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_ALLGATHERV; + coll_args.src.info.datatype = dt; + 
coll_args.src.info.mem_type = mt; coll_args.dst.info_v.datatype = dt; coll_args.dst.info_v.mem_type = mt; + if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_allgatherv::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_allreduce.cc b/tools/perf/ucc_pt_coll_allreduce.cc index 8234f26dd7..3159dc3a9f 100644 --- a/tools/perf/ucc_pt_coll_allreduce.cc +++ b/tools/perf/ucc_pt_coll_allreduce.cc @@ -12,7 +12,7 @@ ucc_pt_coll_allreduce::ucc_pt_coll_allreduce(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, - bool is_inplace, + bool is_inplace, bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,17 +21,25 @@ ucc_pt_coll_allreduce::ucc_pt_coll_allreduce(ucc_datatype_t dt, has_bw_ = true; root_shift_ = 0; - coll_args.coll_type = UCC_COLL_TYPE_ALLREDUCE; - coll_args.mask = 0; - if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; - coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; - } + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_ALLREDUCE; coll_args.op = op; coll_args.src.info.datatype = dt; coll_args.dst.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.mem_type = mt; + + if (is_inplace) { + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; + } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + + } } ucc_status_t ucc_pt_coll_allreduce::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_alltoall.cc b/tools/perf/ucc_pt_coll_alltoall.cc index f4e9cf57b5..77a2608f7f 100644 --- a/tools/perf/ucc_pt_coll_alltoall.cc +++ b/tools/perf/ucc_pt_coll_alltoall.cc @@ -12,6 +12,7 @@ ucc_pt_coll_alltoall::ucc_pt_coll_alltoall(ucc_datatype_t dt, ucc_memory_type mt, bool is_inplace, + bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -20,16 +21,23 @@ ucc_pt_coll_alltoall::ucc_pt_coll_alltoall(ucc_datatype_t dt, has_bw_ = true; root_shift_ = 0; - coll_args.mask = 0; - coll_args.coll_type = UCC_COLL_TYPE_ALLTOALL; + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_ALLTOALL; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_alltoall::init_args(size_t single_rank_count, diff --git a/tools/perf/ucc_pt_coll_alltoallv.cc b/tools/perf/ucc_pt_coll_alltoallv.cc index 4ba88ec123..6ce68ed032 100644 --- a/tools/perf/ucc_pt_coll_alltoallv.cc +++ b/tools/perf/ucc_pt_coll_alltoallv.cc @@ -12,6 +12,7 @@ ucc_pt_coll_alltoallv::ucc_pt_coll_alltoallv(ucc_datatype_t dt, ucc_memory_type mt, bool is_inplace, + bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -31,6 +32,11 @@ ucc_pt_coll_alltoallv::ucc_pt_coll_alltoallv(ucc_datatype_t dt, if (is_inplace) { coll_args.flags |= 
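/*
 * All of these constructors rely on the same accumulation idiom: zero
 * mask/flags first, then only OR bits in, so UCC_COLL_ARGS_FLAG_IN_PLACE and
 * UCC_COLL_ARGS_FLAG_PERSISTENT compose instead of overwriting each other.
 * Condensed into one helper (illustrative, not part of the patch):
 */
static void set_common_flags(ucc_coll_args_t *args, bool inplace,
                             bool persistent)
{
    args->mask  = 0;
    args->flags = 0;
    if (inplace) {
        args->mask  |= UCC_COLL_ARGS_FIELD_FLAGS;
        args->flags |= UCC_COLL_ARGS_FLAG_IN_PLACE;
    }
    if (persistent) {
        args->mask  |= UCC_COLL_ARGS_FIELD_FLAGS; /* |=, never =, after the first bit */
        args->flags |= UCC_COLL_ARGS_FLAG_PERSISTENT;
    }
}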
UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } + } ucc_status_t ucc_pt_coll_alltoallv::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_bcast.cc b/tools/perf/ucc_pt_coll_bcast.cc index b389228c38..b869c902c1 100644 --- a/tools/perf/ucc_pt_coll_bcast.cc +++ b/tools/perf/ucc_pt_coll_bcast.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_bcast::ucc_pt_coll_bcast(ucc_datatype_t dt, ucc_memory_type mt, - int root_shift, ucc_pt_comm *communicator) + int root_shift, bool is_persistent, + ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = false; @@ -20,10 +21,16 @@ ucc_pt_coll_bcast::ucc_pt_coll_bcast(ucc_datatype_t dt, ucc_memory_type mt, has_bw_ = true; root_shift_ = root_shift; - coll_args.mask = 0; - coll_args.coll_type = UCC_COLL_TYPE_BCAST; + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_BCAST; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_bcast::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_gather.cc b/tools/perf/ucc_pt_coll_gather.cc index e189164484..660356bee8 100644 --- a/tools/perf/ucc_pt_coll_gather.cc +++ b/tools/perf/ucc_pt_coll_gather.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_gather::ucc_pt_coll_gather(ucc_datatype_t dt, - ucc_memory_type mt, bool is_inplace, int root_shift, + ucc_memory_type mt, bool is_inplace, + bool is_persistent, int root_shift, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,15 +22,22 @@ ucc_pt_coll_gather::ucc_pt_coll_gather(ucc_datatype_t dt, root_shift_ = root_shift; coll_args.mask = 0; + coll_args.flags = 0; coll_args.coll_type = UCC_COLL_TYPE_GATHER; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + if (is_inplace) { coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_gather::init_args(size_t single_rank_count, diff --git a/tools/perf/ucc_pt_coll_gatherv.cc b/tools/perf/ucc_pt_coll_gatherv.cc index 6739f241d6..ab8715b3cc 100644 --- a/tools/perf/ucc_pt_coll_gatherv.cc +++ b/tools/perf/ucc_pt_coll_gatherv.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_gatherv::ucc_pt_coll_gatherv(ucc_datatype_t dt, - ucc_memory_type mt, bool is_inplace, int root_shift, + ucc_memory_type mt, bool is_inplace, + bool is_persistent, int root_shift, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,15 +22,22 @@ ucc_pt_coll_gatherv::ucc_pt_coll_gatherv(ucc_datatype_t dt, root_shift_ = root_shift; coll_args.mask = 0; + coll_args.flags = 0; coll_args.coll_type = UCC_COLL_TYPE_GATHERV; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info_v.datatype = dt; coll_args.dst.info_v.mem_type = mt; + if (is_inplace) { coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_gatherv::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_reduce.cc b/tools/perf/ucc_pt_coll_reduce.cc index ad013bab67..47610bb68c 
100644 --- a/tools/perf/ucc_pt_coll_reduce.cc +++ b/tools/perf/ucc_pt_coll_reduce.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_reduce::ucc_pt_coll_reduce(ucc_datatype_t dt, ucc_memory_type mt, - ucc_reduction_op_t op, bool is_inplace, int root_shift, + ucc_reduction_op_t op, bool is_inplace, + bool is_persistent, int root_shift, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -20,18 +21,24 @@ ucc_pt_coll_reduce::ucc_pt_coll_reduce(ucc_datatype_t dt, ucc_memory_type mt, has_bw_ = true; root_shift_ = root_shift; - coll_args.coll_type = UCC_COLL_TYPE_REDUCE; - coll_args.mask = 0; + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_REDUCE; + coll_args.op = op; + coll_args.src.info.datatype = dt; + coll_args.src.info.mem_type = mt; + coll_args.dst.info.datatype = dt; + coll_args.dst.info.mem_type = mt; + if (is_inplace) { coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } - coll_args.op = op; - coll_args.src.info.datatype = dt; - coll_args.src.info.mem_type = mt; - coll_args.dst.info.datatype = dt; - coll_args.dst.info.mem_type = mt; + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_reduce::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_reduce_scatter.cc b/tools/perf/ucc_pt_coll_reduce_scatter.cc index 8c51a5ffbd..e15bf80bcb 100644 --- a/tools/perf/ucc_pt_coll_reduce_scatter.cc +++ b/tools/perf/ucc_pt_coll_reduce_scatter.cc @@ -12,7 +12,7 @@ ucc_pt_coll_reduce_scatter::ucc_pt_coll_reduce_scatter(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, - bool is_inplace, + bool is_inplace, bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,18 +21,24 @@ ucc_pt_coll_reduce_scatter::ucc_pt_coll_reduce_scatter(ucc_datatype_t dt, has_bw_ = true; root_shift_ = 0; - coll_args.coll_type = UCC_COLL_TYPE_REDUCE_SCATTER; - coll_args.mask = 0; - if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; - coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; - } - + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_REDUCE_SCATTER; coll_args.op = op; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + + if (is_inplace) { + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; + } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_reduce_scatter::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_reduce_scatterv.cc b/tools/perf/ucc_pt_coll_reduce_scatterv.cc index 84f55a2132..932ad600d9 100644 --- a/tools/perf/ucc_pt_coll_reduce_scatterv.cc +++ b/tools/perf/ucc_pt_coll_reduce_scatterv.cc @@ -12,7 +12,7 @@ ucc_pt_coll_reduce_scatterv::ucc_pt_coll_reduce_scatterv(ucc_datatype_t dt, ucc_memory_type mt, ucc_reduction_op_t op, - bool is_inplace, + bool is_inplace, bool is_persistent, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,18 +21,24 @@ ucc_pt_coll_reduce_scatterv::ucc_pt_coll_reduce_scatterv(ucc_datatype_t dt, has_bw_ = false; root_shift_ = 0; - coll_args.coll_type = UCC_COLL_TYPE_REDUCE_SCATTERV; - coll_args.mask = 0; - if (is_inplace) { - coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; - coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; - } 
- + coll_args.mask = 0; + coll_args.flags = 0; + coll_args.coll_type = UCC_COLL_TYPE_REDUCE_SCATTERV; coll_args.op = op; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info_v.datatype = dt; coll_args.dst.info_v.mem_type = mt; + + if (is_inplace) { + coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; + } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_reduce_scatterv::init_args(size_t count, diff --git a/tools/perf/ucc_pt_coll_scatter.cc b/tools/perf/ucc_pt_coll_scatter.cc index 4d66f51d99..ac414dd2ed 100644 --- a/tools/perf/ucc_pt_coll_scatter.cc +++ b/tools/perf/ucc_pt_coll_scatter.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_scatter::ucc_pt_coll_scatter(ucc_datatype_t dt, - ucc_memory_type mt, bool is_inplace, int root_shift, + ucc_memory_type mt, bool is_inplace, + bool is_persistent, int root_shift, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,15 +22,22 @@ ucc_pt_coll_scatter::ucc_pt_coll_scatter(ucc_datatype_t dt, root_shift_ = root_shift; coll_args.mask = 0; + coll_args.flags = 0; coll_args.coll_type = UCC_COLL_TYPE_SCATTER; coll_args.src.info.datatype = dt; coll_args.src.info.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + if (is_inplace) { coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_scatter::init_args(size_t single_rank_count, diff --git a/tools/perf/ucc_pt_coll_scatterv.cc b/tools/perf/ucc_pt_coll_scatterv.cc index 328752022c..1dc9bf7db9 100644 --- a/tools/perf/ucc_pt_coll_scatterv.cc +++ b/tools/perf/ucc_pt_coll_scatterv.cc @@ -11,7 +11,8 @@ #include ucc_pt_coll_scatterv::ucc_pt_coll_scatterv(ucc_datatype_t dt, - ucc_memory_type mt, bool is_inplace, int root_shift, + ucc_memory_type mt, bool is_inplace, + bool is_persistent, int root_shift, ucc_pt_comm *communicator) : ucc_pt_coll(communicator) { has_inplace_ = true; @@ -21,15 +22,22 @@ ucc_pt_coll_scatterv::ucc_pt_coll_scatterv(ucc_datatype_t dt, root_shift_ = root_shift; coll_args.mask = 0; + coll_args.flags = 0; coll_args.coll_type = UCC_COLL_TYPE_SCATTERV; coll_args.src.info_v.datatype = dt; coll_args.src.info_v.mem_type = mt; coll_args.dst.info.datatype = dt; coll_args.dst.info.mem_type = mt; + if (is_inplace) { coll_args.mask = UCC_COLL_ARGS_FIELD_FLAGS; coll_args.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; } + + if (is_persistent) { + coll_args.mask |= UCC_COLL_ARGS_FIELD_FLAGS; + coll_args.flags |= UCC_COLL_ARGS_FLAG_PERSISTENT; + } } ucc_status_t ucc_pt_coll_scatterv::init_args(size_t count, diff --git a/tools/perf/ucc_pt_config.cc b/tools/perf/ucc_pt_config.cc index 3fcb2b01c9..e59b62ce26 100644 --- a/tools/perf/ucc_pt_config.cc +++ b/tools/perf/ucc_pt_config.cc @@ -18,6 +18,7 @@ ucc_pt_config::ucc_pt_config() { bench.mt = UCC_MEMORY_TYPE_HOST; bench.op = UCC_OP_SUM; bench.inplace = false; + bench.persistent = false; bench.triggered = false; bench.n_iter_small = 1000; bench.n_warmup_small = 100; @@ -89,7 +90,7 @@ ucc_status_t ucc_pt_config::process_args(int argc, char *argv[]) int c; ucc_status_t st; - while ((c = getopt(argc, argv, "c:b:e:d:m:n:w:o:N:r:S:ihFT")) != -1) { + while ((c = getopt(argc, argv, "c:b:e:d:m:n:w:o:N:r:S:iphFT")) != -1) { switch (c) { case 'c': if 
(ucc_pt_op_map.count(optarg) == 0) { @@ -158,6 +159,9 @@ ucc_status_t ucc_pt_config::process_args(int argc, char *argv[]) case 'i': bench.inplace = true; break; + case 'p': + bench.persistent = true; + break; case 'T': bench.triggered = true; break; @@ -180,6 +184,7 @@ void ucc_pt_config::print_help() std::cout << " -b : Min number of elements"<: Max number of elements"<: datatype"<: reduction operation type"<: root for rooted collectives"<
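Taken together, the perftest changes make persistent collectives directly measurable: the new 'p' entry in the getopt string sets bench.persistent, every ucc_pt_coll_* constructor ORs UCC_COLL_ARGS_FLAG_PERSISTENT into coll_args, and run_single_coll_test() hoists init/finalize out of the timed loop. As a usage sketch (the -p flag comes from this patch; the binary name and the other options are assumptions based on the surrounding help text), an invocation such as `ucc_perftest -c allreduce -b 8 -e 1048576 -p` would time repeated posts of a single persistent allreduce request rather than a fresh init/finalize per iteration.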