diff --git a/.ci/Dockerfile.centos8 b/.ci/Dockerfile.centos8
index 05859cb55f..fc20124ef3 100644
--- a/.ci/Dockerfile.centos8
+++ b/.ci/Dockerfile.centos8
@@ -4,13 +4,20 @@ FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base
 RUN rm -rf ${SRC_DIR}/ucc
 COPY . ${SRC_DIR}/ucc
+RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
+    sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
+RUN yum install -y sudo && \
+    echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
 #==============================================================================
 # Build UCC
 RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
 #==============================================================================
 # Install torch_ucc (UCC version) python module and build a wheel package
 RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch_ucc.sh
+RUN chown -R 6213:11429 /opt/nvidia
 #==============================================================================
 RUN groupadd -g 11429 swx-jenkins
 RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
 #==============================================================================
+USER swx-jenkins
+
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
index ed5fa2528b..af23b10578 100644
--- a/.ci/job_matrix.yaml
+++ b/.ci/job_matrix.yaml
@@ -20,13 +20,13 @@ volumes:
   }
 
 env:
-  CUDA_VER: "11.4.2"
+  CUDA_VER: '11.4.2'
   UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
   UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}"
   NVIDIA_ROOT_DIR: "/opt/nvidia"
   SRC_DIR: "${NVIDIA_ROOT_DIR}/src"
   BIN_DIR: "${NVIDIA_ROOT_DIR}/bin"
-  DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all --user root"
+  DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all"
 
 docker_opt: "$DOCKER_OPT"
 
@@ -54,6 +54,7 @@ runs_on_dockers:
 # bare metal
 runs_on_agents:
   - nodeLabel: "swx-clx01"
+  - nodeLabel: "ml-test-node-gpu"
 
 timeout_minutes: 360
 
@@ -72,7 +73,7 @@ steps:
   #============================================================================
   - name: Run Coverity
     credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0"
-    agentSelector: "{nodeLabel: 'swx-clx01'}"
+    agentSelector: "{nodeLabel: 'ml-test-node-gpu'}"
     run: |
       export UCC_PASSWORD=$UCC_PASSWORD
       export UCC_USERNAME=$UCC_USERNAME
diff --git a/.ci/scripts/coverity.sh b/.ci/scripts/coverity.sh
index 61f124eccb..63122fbbd5 100755
--- a/.ci/scripts/coverity.sh
+++ b/.ci/scripts/coverity.sh
@@ -24,9 +24,6 @@ module load hpcx-gcc
 module load dev/cuda12.1.1
 module load dev/nccl_2.18.3-1_cuda12.1.1_"$(uname -i)"
 module load tools/cov-2021.12
-previous_date=$(date -d "yesterday" +'%Y-%m-%d')
-HPCX_UCX_DIR=/hpc/local/benchmarks/daily/next/$previous_date/hpcx-gcc-redhat7/ucx
-HPCX_SHARP_DIR=/hpc/local/benchmarks/daily/next/$previous_date/hpcx-gcc-redhat7/sharp
 ./autogen.sh
 ./configure --with-nccl --with-tls=cuda,nccl,self,sharp,shm,ucp,mlx5 --with-ucx="${HPCX_UCX_DIR}" --with-sharp="${HPCX_SHARP_DIR}"
 make_opt="-j$(($(nproc) / 2 + 1))"
diff --git a/.ci/scripts/run_dlrm.sh b/.ci/scripts/run_dlrm.sh
index 46e71542bc..da9ebc2e8b 100755
--- a/.ci/scripts/run_dlrm.sh
+++ b/.ci/scripts/run_dlrm.sh
@@ -32,7 +32,6 @@ mpirun \
     -np $NP \
     --hostfile ${HOSTFILE} \
     --map-by node \
-    --allow-run-as-root \
     --mca plm_rsh_args '-p 12345' \
     -x PATH \
    -x LD_LIBRARY_PATH \
@@ -43,7 +42,6 @@ mpirun \
     -np $NP \
     --hostfile ${HOSTFILE} \
     --map-by node \
-    --allow-run-as-root \
     --mca plm_rsh_args '-p 12345' \
     -x PATH \
     -x LD_LIBRARY_PATH \
@@ -54,7 +52,6 @@ mpirun \
     -np $NP \
     --hostfile ${HOSTFILE} \
     --map-by node \
-    --allow-run-as-root \
     --mca plm_rsh_args '-p 12345' \
     -x PATH \
     -x LD_LIBRARY_PATH \
diff --git a/.ci/scripts/run_dlrm_docker.sh b/.ci/scripts/run_dlrm_docker.sh
index 7936a3c509..39fb74e92b 100755
--- a/.ci/scripts/run_dlrm_docker.sh
+++ b/.ci/scripts/run_dlrm_docker.sh
@@ -19,4 +19,4 @@ HEAD_NODE=$(head -1 "$HOSTFILE")
 export HEAD_NODE
 
 #sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh cpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
-sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh gpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
+ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh gpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
diff --git a/.ci/scripts/run_docker.sh b/.ci/scripts/run_docker.sh
index 10f16bc352..7f141d65c9 100755
--- a/.ci/scripts/run_docker.sh
+++ b/.ci/scripts/run_docker.sh
@@ -41,13 +41,11 @@ DOCKER_RUN_ARGS="\
 --cap-add=SYS_ADMIN \
 --device=/dev/infiniband/ \
 --gpus all \
---user root \
 -it \
 -d \
 --rm \
 --name=${DOCKER_CONTAINER_NAME} \
 -v /labhome:/labhome \
--v /root/.ssh:/root/.ssh \
 "
 
 # shellcheck disable=SC2013
@@ -78,16 +76,16 @@ pdsh -w "${HOST_LIST}" -R ssh docker pull "${DOCKER_IMAGE_NAME}"
 
 for HOST in $(cat "$HOSTFILE"); do
     echo "INFO: start docker container on $HOST ..."
     # shellcheck disable=SC2029
-    sudo ssh "$HOST" "docker run \
+    ssh "$HOST" "docker run \
         ${DOCKER_RUN_ARGS} \
         ${DOCKER_IMAGE_NAME} \
-        bash -c '/usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'"
+        bash -c 'sudo /usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'"
     echo "INFO: start docker container on $HOST ... DONE"
     sleep 5
 
     echo "INFO: verify docker container on $HOST ..."
-    sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname
-    sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup
+    ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname
+    ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup
     echo "INFO: verify docker container on $HOST ... DONE"
 done
diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh
index 2eb26ea526..4701a7c04e 100755
--- a/.ci/scripts/run_tests_ucc_mpi.sh
+++ b/.ci/scripts/run_tests_ucc_mpi.sh
@@ -46,7 +46,7 @@ function mpi_params {
         nnodes=$NNODES
     fi
     echo "-np $((nnodes*ppn)) --oversubscribe --hostfile ${HOSTFILE} \
---map-by ppr:$ppn:node --bind-to socket --allow-run-as-root \
+--map-by ppr:$ppn:node --bind-to socket \
 -x PATH -x LD_LIBRARY_PATH --mca opal_common_ucx_opal_mem_hooks 1 --mca plm_rsh_args -p12345 \
 --mca coll ^ucc,hcoll \
 -x UCX_NET_DEVICES=$DEV:1"
diff --git a/.ci/scripts/run_tests_ucc_mpi_docker.sh b/.ci/scripts/run_tests_ucc_mpi_docker.sh
index c3e6736c4a..c1929e7cc3 100755
--- a/.ci/scripts/run_tests_ucc_mpi_docker.sh
+++ b/.ci/scripts/run_tests_ucc_mpi_docker.sh
@@ -18,4 +18,4 @@ fi
 
 HEAD_NODE=$(head -1 "$HOSTFILE")
 export HEAD_NODE
-sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_mpi.sh "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
+ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_mpi.sh "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
diff --git a/src/components/tl/mlx5/tl_mlx5_context.c b/src/components/tl/mlx5/tl_mlx5_context.c
index 1a65c1d690..0c56ff9390 100644
--- a/src/components/tl/mlx5/tl_mlx5_context.c
+++ b/src/components/tl/mlx5/tl_mlx5_context.c
@@ -26,6 +26,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t,
     UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_mlx5_config->super,
                               params->context);
     memcpy(&self->cfg, tl_mlx5_config, sizeof(*tl_mlx5_config));
+    self->sock       = 0;
     self->rcache     = NULL;
     self->shared_pd  = NULL;
     self->shared_ctx = NULL;
@@ -73,8 +74,11 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_mlx5_context_t)
         tl_debug(self->super.super.lib, "failed to free ib ctx and pd");
     };
 
+    if (self->sock) {
+        close(self->sock);
+    }
+
     ucc_mpool_cleanup(&self->req_mp, 1);
-    close(self->sock);
 }
 
 UCC_CLASS_DEFINE(ucc_tl_mlx5_context_t, ucc_tl_context_t);
diff --git a/src/components/tl/mlx5/tl_mlx5_pd.c b/src/components/tl/mlx5/tl_mlx5_pd.c
index 2e4e4ff676..a553dbc5f5 100644
--- a/src/components/tl/mlx5/tl_mlx5_pd.c
+++ b/src/components/tl/mlx5/tl_mlx5_pd.c
@@ -127,7 +127,7 @@ ucc_status_t ucc_tl_mlx5_socket_init(ucc_tl_mlx5_context_t *ctx,
 
 static ucc_status_t client_recv_data(int *shared_cmd_fd,
                                      uint32_t *shared_pd_handle,
-                                     const char *sock_path,
+                                     const char *sock_path, int *sock_p,
                                      ucc_tl_mlx5_lib_t *lib)
 {
     struct sockaddr_storage sockaddr = {};
@@ -159,7 +159,8 @@ static ucc_status_t client_recv_data(int *shared_cmd_fd,
         goto out;
     }
 
-    return status;
+    *sock_p = sock;
+    return UCC_OK;
 
 out:
     if (close(sock) == -1) {
@@ -229,7 +230,8 @@ ucc_status_t ucc_tl_mlx5_share_ctx_pd(ucc_tl_mlx5_context_t *ctx,
     ucc_status_t status;
 
     if (!is_ctx_owner) {
-        status = client_recv_data(&ctx_fd, &pd_handle, sock_path, lib);
+        status =
+            client_recv_data(&ctx_fd, &pd_handle, sock_path, &ctx->sock, lib);
         if (UCC_OK != status) {
             tl_debug(lib, "failed to share ctx & pd from client side");
             return status;
diff --git a/src/components/tl/ucp/Makefile.am b/src/components/tl/ucp/Makefile.am
index b7f93b881e..cc0a118c60 100644
--- a/src/components/tl/ucp/Makefile.am
+++ b/src/components/tl/ucp/Makefile.am
@@ -9,10 +9,11 @@ SUBDIRS = .
 include makefile.coll_plugins.am
 endif
 
-allgather =                    \
-	allgather/allgather.h      \
-	allgather/allgather.c      \
-	allgather/allgather_ring.c \
+allgather =                        \
+	allgather/allgather.h          \
+	allgather/allgather.c          \
+	allgather/allgather_ring.c     \
+	allgather/allgather_neighbor.c \
 	allgather/allgather_knomial.c
 
 allgatherv =                 \
diff --git a/src/components/tl/ucp/allgather/allgather.c b/src/components/tl/ucp/allgather/allgather.c
index e0ef976c01..90b06e99ee 100644
--- a/src/components/tl/ucp/allgather/allgather.c
+++ b/src/components/tl/ucp/allgather/allgather.c
@@ -7,6 +7,8 @@
 #include "tl_ucp.h"
 #include "allgather.h"
 
+#define ALLGATHER_MAX_PATTERN_SIZE (sizeof(UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR))
+
 ucc_base_coll_alg_info_t
     ucc_tl_ucp_allgather_algs[UCC_TL_UCP_ALLGATHER_ALG_LAST + 1] = {
         [UCC_TL_UCP_ALLGATHER_ALG_KNOMIAL] =
@@ -17,6 +19,10 @@ ucc_base_coll_alg_info_t
             {.id   = UCC_TL_UCP_ALLGATHER_ALG_RING,
              .name = "ring",
              .desc = "O(N) Ring"},
+        [UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR] =
+            {.id   = UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR,
+             .name = "neighbor",
+             .desc = "O(N) Neighbor Exchange N/2 steps"},
         [UCC_TL_UCP_ALLGATHER_ALG_LAST] = {
             .id = 0, .name = NULL, .desc = NULL}};
 
@@ -24,3 +30,16 @@ ucc_status_t ucc_tl_ucp_allgather_init(ucc_tl_ucp_task_t *task)
 {
     return ucc_tl_ucp_allgather_ring_init_common(task);
 }
+
+char *ucc_tl_ucp_allgather_score_str_get(ucc_tl_ucp_team_t *team)
+{
+    int   max_size = ALLGATHER_MAX_PATTERN_SIZE;
+    int   algo_num = UCC_TL_TEAM_SIZE(team) % 2
+                         ? UCC_TL_UCP_ALLGATHER_ALG_RING
+                         : UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR;
+    char *str      = ucc_malloc(max_size * sizeof(char));
+
+    ucc_snprintf_safe(str, max_size,
+                      UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR, algo_num);
+    return str;
+}
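Reviewer note: ALLGATHER_MAX_PATTERN_SIZE reuses sizeof() of the template string, which is sufficient because the "%d" placeholder (two characters) is replaced by a single-digit algorithm id (knomial = 0, ring = 1, neighbor = 2, per the enum in allgather.h). A standalone sketch of the expansion that ucc_tl_ucp_allgather_score_str_get() performs, with the team size assumed for the example:

```c
#include <stdio.h>

#define SELECT_STR "allgather:0-4k:@0#allgather:4k-inf:@%d"

int main(void)
{
    char str[sizeof(SELECT_STR)]; /* "%d" shrinks to one digit, so it fits */
    int  team_size = 8;           /* assumed even team */
    int  algo_num  = team_size % 2 ? 1 /* ring */ : 2 /* neighbor */;

    snprintf(str, sizeof(str), SELECT_STR, algo_num);
    puts(str); /* -> allgather:0-4k:@0#allgather:4k-inf:@2 */
    return 0;
}
```

The parity check mirrors the init-time fallback: odd teams keep the ring default, even teams get neighbor exchange for messages above 4k.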
diff --git a/src/components/tl/ucp/allgather/allgather.h b/src/components/tl/ucp/allgather/allgather.h
index d7d9c2ea8d..b68ab00e95 100644
--- a/src/components/tl/ucp/allgather/allgather.h
+++ b/src/components/tl/ucp/allgather/allgather.h
@@ -11,14 +11,17 @@
 enum {
     UCC_TL_UCP_ALLGATHER_ALG_KNOMIAL,
     UCC_TL_UCP_ALLGATHER_ALG_RING,
+    UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR,
     UCC_TL_UCP_ALLGATHER_ALG_LAST
 };
 
 extern ucc_base_coll_alg_info_t
-             ucc_tl_ucp_allgather_algs[UCC_TL_UCP_ALLGATHER_ALG_LAST + 1];
+    ucc_tl_ucp_allgather_algs[UCC_TL_UCP_ALLGATHER_ALG_LAST + 1];
 
 #define UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR \
-    "allgather:0-4k:@0#allgather:4k-inf:@1"
+    "allgather:0-4k:@0#allgather:4k-inf:@%d"
+
+char *ucc_tl_ucp_allgather_score_str_get(ucc_tl_ucp_team_t *team);
 
 static inline int ucc_tl_ucp_allgather_alg_from_str(const char *str)
 {
@@ -33,20 +36,30 @@ static inline int ucc_tl_ucp_allgather_alg_from_str(const char *str)
 
 ucc_status_t ucc_tl_ucp_allgather_init(ucc_tl_ucp_task_t *task);
 
+/* Ring */
 ucc_status_t ucc_tl_ucp_allgather_ring_init(ucc_base_coll_args_t *coll_args,
-                                            ucc_base_team_t *     team,
-                                            ucc_coll_task_t **    task_h);
+                                            ucc_base_team_t      *team,
+                                            ucc_coll_task_t     **task_h);
 
 ucc_status_t ucc_tl_ucp_allgather_ring_init_common(ucc_tl_ucp_task_t *task);
 
-void         ucc_tl_ucp_allgather_ring_progress(ucc_coll_task_t *task);
+void ucc_tl_ucp_allgather_ring_progress(ucc_coll_task_t *task);
 
 ucc_status_t ucc_tl_ucp_allgather_ring_start(ucc_coll_task_t *task);
 
+/* Neighbor Exchange */
+ucc_status_t ucc_tl_ucp_allgather_neighbor_init(ucc_base_coll_args_t *coll_args,
+                                                ucc_base_team_t      *team,
+                                                ucc_coll_task_t     **task_h);
+
+void ucc_tl_ucp_allgather_neighbor_progress(ucc_coll_task_t *task);
+
+ucc_status_t ucc_tl_ucp_allgather_neighbor_start(ucc_coll_task_t *task);
+
 /* Uses allgather_kn_radix from config */
 ucc_status_t
 ucc_tl_ucp_allgather_knomial_init(ucc_base_coll_args_t *coll_args,
-                                  ucc_base_team_t *     team,
-                                  ucc_coll_task_t **    task_h);
+                                  ucc_base_team_t      *team,
+                                  ucc_coll_task_t     **task_h);
 
 /* Internal interface with custom radix */
 ucc_status_t ucc_tl_ucp_allgather_knomial_init_r(
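Reviewer note on the schedule implemented in the new file below: ranks first pair up with an adjacent partner (0 with 1, 2 with 3, and so on) and trade single blocks; on each later step a rank alternates between its two neighbors and forwards the two contiguous blocks it received on the previous step, so the whole exchange finishes in tsize/2 steps. For example, with tsize = 6, rank 0 exchanges with rank 1 (blocks 0 and 1), then with rank 5 (receiving blocks 4 and 5), then with rank 1 again (receiving blocks 2 and 3). A simulation sketch that checks this bookkeeping follows the new file's diff.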
diff --git a/src/components/tl/ucp/allgather/allgather_neighbor.c b/src/components/tl/ucp/allgather/allgather_neighbor.c
new file mode 100644
index 0000000000..771ba2d3b8
--- /dev/null
+++ b/src/components/tl/ucp/allgather/allgather_neighbor.c
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+#include "config.h"
+#include "tl_ucp.h"
+#include "allgather.h"
+#include "core/ucc_progress_queue.h"
+#include "tl_ucp_sendrecv.h"
+#include "utils/ucc_math.h"
+#include "utils/ucc_coll_utils.h"
+#include "components/mc/ucc_mc.h"
+
+static ucc_rank_t get_recv_from_rank(ucc_rank_t rank, ucc_rank_t size, int i)
+{
+    const int  i_parity = i % 2;
+    ucc_rank_t offset_at_step[2], recv_data_from;
+
+    if (rank % 2) {
+        recv_data_from    = (rank - 1 + size) % size;
+        offset_at_step[0] = (-2);
+        offset_at_step[1] = (+2);
+    } else {
+        recv_data_from    = rank;
+        offset_at_step[0] = (+2);
+        offset_at_step[1] = (-2);
+    }
+
+    return (recv_data_from + offset_at_step[i_parity] * ucc_div_round_up(i, 2) +
+            size) %
+           size;
+}
+
+ucc_status_t ucc_tl_ucp_allgather_neighbor_init(ucc_base_coll_args_t *coll_args,
+                                                ucc_base_team_t      *team,
+                                                ucc_coll_task_t     **task_h)
+{
+    ucc_status_t       status = UCC_OK;
+    ucc_tl_ucp_task_t *task;
+    ucc_tl_ucp_team_t *ucp_team;
+
+    task     = ucc_tl_ucp_init_task(coll_args, team);
+    ucp_team = TASK_TEAM(task);
+
+    if (!ucc_coll_args_is_predefined_dt(&TASK_ARGS(task), UCC_RANK_INVALID)) {
+        tl_error(UCC_TASK_LIB(task), "user defined datatype is not supported");
+        status = UCC_ERR_NOT_SUPPORTED;
+        goto out;
+    }
+
+    if (UCC_TL_TEAM_SIZE(ucp_team) % 2) {
+        tl_debug(UCC_TASK_LIB(task),
+                 "odd team size is not supported, switching to ring");
+        status = ucc_tl_ucp_allgather_ring_init_common(task);
+    } else {
+        task->super.post     = ucc_tl_ucp_allgather_neighbor_start;
+        task->super.progress = ucc_tl_ucp_allgather_neighbor_progress;
+    }
+
+out:
+    if (status != UCC_OK) {
+        ucc_tl_ucp_put_task(task);
+        return status;
+    }
+
+    *task_h = &task->super;
+    return status;
+}
+
+/* Original implementation: https://github.com/open-mpi/ompi/blob/main/ompi/mca/coll/base/coll_base_allgather.c */
+void ucc_tl_ucp_allgather_neighbor_progress(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_ucp_task_t *task  = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
+    ucc_tl_ucp_team_t *team  = TASK_TEAM(task);
+    ucc_rank_t         trank = UCC_TL_TEAM_RANK(team);
+    ucc_rank_t         tsize = UCC_TL_TEAM_SIZE(team);
+    void              *rbuf  = TASK_ARGS(task).dst.info.buffer;
+    ucc_memory_type_t  rmem  = TASK_ARGS(task).dst.info.mem_type;
+    ucc_datatype_t     dt    = TASK_ARGS(task).dst.info.datatype;
+    size_t             count = TASK_ARGS(task).dst.info.count;
+    size_t             data_size = (count / tsize) * ucc_dt_size(dt);
+    ucc_rank_t         neighbors[2], i;
+    int                i_parity, even_rank;
+    void              *tmprecv, *tmpsend;
+
+    if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) {
+        return;
+    }
+
+    even_rank = !(trank % 2);
+    if (even_rank) {
+        neighbors[0] = (trank + 1) % tsize;
+        neighbors[1] = (trank - 1 + tsize) % tsize;
+    } else {
+        neighbors[0] = (trank - 1 + tsize) % tsize;
+        neighbors[1] = (trank + 1) % tsize;
+    }
+
+    while (task->tagged.send_posted < (tsize / 2)) {
+        i        = task->tagged.send_posted;
+        i_parity = i % 2;
+
+        tmprecv =
+            PTR_OFFSET(rbuf, get_recv_from_rank(trank, tsize, i) * data_size);
+        tmpsend = PTR_OFFSET(rbuf, get_recv_from_rank(trank, tsize, i - 1) *
+                                       data_size);
+
+        /* Sendreceive */
+        UCPCHECK_GOTO(ucc_tl_ucp_send_nb(tmpsend, 2 * data_size, rmem,
+                                         neighbors[i_parity], team, task),
+                      task, out);
+        UCPCHECK_GOTO(ucc_tl_ucp_recv_nb(tmprecv, 2 * data_size, rmem,
+                                         neighbors[i_parity], team, task),
+                      task, out);
+
+        if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) {
+            return;
+        }
+    }
+
+    ucc_assert(UCC_TL_UCP_TASK_P2P_COMPLETE(task));
+    task->super.status = UCC_OK;
+
+out:
+    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_neighbor_done",
+                                     0);
+}
+
+ucc_status_t ucc_tl_ucp_allgather_neighbor_start(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_ucp_task_t *task  = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
+    ucc_tl_ucp_team_t *team  = TASK_TEAM(task);
+    size_t             count = TASK_ARGS(task).dst.info.count;
+    void              *sbuf  = TASK_ARGS(task).src.info.buffer;
+    void              *rbuf  = TASK_ARGS(task).dst.info.buffer;
+    ucc_memory_type_t  smem  = TASK_ARGS(task).src.info.mem_type;
+    ucc_memory_type_t  rmem  = TASK_ARGS(task).dst.info.mem_type;
+    ucc_datatype_t     dt    = TASK_ARGS(task).dst.info.datatype;
+    ucc_rank_t         trank = UCC_TL_TEAM_RANK(team);
+    ucc_rank_t         tsize = UCC_TL_TEAM_SIZE(team);
+    size_t             data_size = (count / tsize) * ucc_dt_size(dt);
+    ucc_status_t       status;
+    ucc_rank_t         neighbor;
+    void              *tmprecv, *tmpsend;
+
+    UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_neighbor_start",
+                                     0);
+    ucc_tl_ucp_task_reset(task, UCC_INPROGRESS);
+
+    if (!UCC_IS_INPLACE(TASK_ARGS(task))) {
+        status = ucc_mc_memcpy(PTR_OFFSET(rbuf, data_size * trank), sbuf,
+                               data_size, rmem, smem);
+        if (ucc_unlikely(UCC_OK != status)) {
+            return status;
+        }
+    }
+
+    if (trank % 2) {
+        neighbor = (trank - 1 + tsize) % tsize;
+    } else {
+        neighbor = (trank + 1) % tsize;
+    }
+
+    tmprecv = PTR_OFFSET(rbuf, neighbor * data_size);
+    tmpsend = PTR_OFFSET(rbuf, trank * data_size);
+
+    /* Sendreceive */
+    UCPCHECK_GOTO(
+        ucc_tl_ucp_send_nb(tmpsend, data_size, rmem, neighbor, team, task),
+        task, out);
+    UCPCHECK_GOTO(
+        ucc_tl_ucp_recv_nb(tmprecv, data_size, rmem, neighbor, team, task),
+        task, out);
+out:
+    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
+}
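Reviewer note: a host-only model of the block bookkeeping in allgather_neighbor.c above, useful for convincing oneself the schedule terminates with full coverage. It mirrors get_recv_from_rank() in plain int arithmetic and asserts two invariants the kernel relies on: the partner always sends exactly the region it received on the previous step, and that region starts at an even index, so the 2*data_size transfer never wraps past the end of the buffer. Illustration only; TSIZE and all names are local to this sketch:

```c
#include <assert.h>
#include <stdio.h>

#define TSIZE 8 /* any even team size */

/* Same schedule as get_recv_from_rank() in the patch. */
static int recv_from(int rank, int size, int i)
{
    int base = (rank % 2) ? (rank - 1 + size) % size : rank;
    int dir  = (rank % 2) ? ((i % 2) ? +2 : -2) : ((i % 2) ? -2 : +2);

    return (base + dir * ((i + 1) / 2) + size) % size;
}

int main(void)
{
    int have[TSIZE][TSIZE] = {{0}}; /* have[r][b]: rank r holds block b */
    int r, i, b;

    for (r = 0; r < TSIZE; r++) {
        int first = (r % 2) ? (r - 1 + TSIZE) % TSIZE : (r + 1) % TSIZE;

        have[r][r]     = 1; /* start(): own block copied into rbuf */
        have[r][first] = 1; /* start(): single-block exchange with partner */
    }

    for (i = 1; i < TSIZE / 2; i++) { /* progress(): two blocks per step */
        for (r = 0; r < TSIZE; r++) {
            int nbrs[2], n, b0;

            if (r % 2) {
                nbrs[0] = (r - 1 + TSIZE) % TSIZE;
                nbrs[1] = (r + 1) % TSIZE;
            } else {
                nbrs[0] = (r + 1) % TSIZE;
                nbrs[1] = (r - 1 + TSIZE) % TSIZE;
            }
            n  = nbrs[i % 2];
            b0 = recv_from(r, TSIZE, i);

            /* Partner sends exactly what it received on the previous step,
             * and the 2-block region starts even, so it never wraps. */
            assert(b0 == recv_from(n, TSIZE, i - 1));
            assert(b0 % 2 == 0 && b0 + 1 < TSIZE);
            assert(have[n][b0] && have[n][b0 + 1]);
            have[r][b0] = have[r][b0 + 1] = 1;
        }
    }

    for (r = 0; r < TSIZE; r++) {
        for (b = 0; b < TSIZE; b++) {
            assert(have[r][b]);
        }
    }
    printf("all %d ranks hold all %d blocks after %d exchange steps\n",
           TSIZE, TSIZE, TSIZE / 2);
    return 0;
}
```

Compared to the ring (tsize - 1 steps of one block), this halves the step count at the cost of doubling the per-step message size, which is the usual bandwidth-oriented trade-off for large allgathers.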
diff --git a/src/components/tl/ucp/tl_ucp_coll.c b/src/components/tl/ucp/tl_ucp_coll.c
index b75e47ff4c..bbbac03fc7 100644
--- a/src/components/tl/ucp/tl_ucp_coll.c
+++ b/src/components/tl/ucp/tl_ucp_coll.c
@@ -27,8 +27,8 @@ const ucc_tl_ucp_default_alg_desc_t
     ucc_tl_ucp_default_alg_descs[UCC_TL_UCP_N_DEFAULT_ALG_SELECT_STR] = {
         {
-            .select_str = UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR,
-            .str_get_fn = NULL
+            .select_str = NULL,
+            .str_get_fn = ucc_tl_ucp_allgather_score_str_get
         },
         {
             .select_str = NULL,
@@ -252,6 +252,9 @@ ucc_status_t ucc_tl_ucp_alg_id_to_init(int alg_id, const char *alg_id_str,
         case UCC_TL_UCP_ALLGATHER_ALG_RING:
             *init = ucc_tl_ucp_allgather_ring_init;
             break;
+        case UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR:
+            *init = ucc_tl_ucp_allgather_neighbor_init;
+            break;
         default:
             status = UCC_ERR_INVALID_PARAM;
             break;
diff --git a/test/gtest/coll/test_allgather.cc b/test/gtest/coll/test_allgather.cc
index 1cdf17be77..e1cacac5ac 100644
--- a/test/gtest/coll/test_allgather.cc
+++ b/test/gtest/coll/test_allgather.cc
@@ -296,7 +296,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif
         ::testing::Values(1,3,8192), // count
         ::testing::Values(TEST_INPLACE, TEST_NO_INPLACE),
-        ::testing::Values("knomial", "ring")),
+        ::testing::Values("knomial", "ring", "neighbor")),
     [](const testing::TestParamInfo& info) {
         std::string name;
         name += ucc_datatype_str(std::get<0>(info.param));
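Reviewer note: besides the gtest coverage added above, the new path can be forced for manual runs through the usual TL/UCP tuning string, e.g. UCC_TL_UCP_TUNE=allgather:@neighbor (or by id, @2, matching the enum in allgather.h), assuming the standard UCC score-string syntax; the name lookup is what ucc_tl_ucp_allgather_alg_from_str() provides. With an odd team size ucc_tl_ucp_allgather_neighbor_init() falls back to the ring implementation, so forcing "neighbor" is safe either way.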