diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 8cf15db0e5..e9257f7fc0 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -119,7 +119,7 @@ for MT in "" "-T"; do tlmlx5_args+=" -x UCC_LOG_LEVEL=debug -x UCC_COLL_TRACE=info " echo $CX7_DEV tlmlx5_colls="alltoall" - mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 + mpirun $(mpi_params $PPN) $tlmlx5_args /opt/nvidia/src/ucc/build/test/mpi/ucc_test_mpi --mtypes host -c $tlmlx5_colls -t world -d uint8 -O 0 -m 128 fi echo "INFO: UCC MPI unit tests (TL/MLX5) ... DONE" diff --git a/src/components/tl/mlx5/alltoall/alltoall.c b/src/components/tl/mlx5/alltoall/alltoall.c index 5afc7c7d30..81d18230b7 100644 --- a/src/components/tl/mlx5/alltoall/alltoall.c +++ b/src/components/tl/mlx5/alltoall/alltoall.c @@ -83,7 +83,7 @@ ucc_status_t ucc_tl_mlx5_team_init_alltoall(ucc_tl_mlx5_team_t *team) node_size = node->group_size; nnodes = ucc_topo_nnodes(topo); team_size = UCC_TL_TEAM_SIZE(team); - + // while(1) {;}; if (!ucc_topo_isoppn(topo)) { tl_debug(ctx->super.super.lib, "disabling mlx5 a2a for team with non-uniform ppn, " @@ -93,7 +93,7 @@ ucc_status_t ucc_tl_mlx5_team_init_alltoall(ucc_tl_mlx5_team_t *team) } ppn = ucc_topo_max_ppn(topo); - if (net->status == UCC_SBGP_NOT_EXISTS) { + if (nnodes == 1) { tl_debug(ctx->super.super.lib, "disabling mlx5 a2a for single node team"); goto non_fatal_error;