Skip to content

Commit

Permalink
TL/MLX5: set up ci
Browse files Browse the repository at this point in the history
  • Loading branch information
samnordmann committed Nov 7, 2023
1 parent 814ece7 commit 88623e5
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .ci/scripts/build_ucc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ cd "${UCC_SRC_DIR}"
# Configure UCC out-of-tree in ${UCC_SRC_DIR}/build, then build and install it.
mkdir -p "${UCC_SRC_DIR}/build"
cd "${UCC_SRC_DIR}/build"
# Every argument line except the last ends in a backslash so the whole list
# reaches configure as one command; --prefix is given exactly once.
"${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" \
    --prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi \
    --with-tls=cuda,nccl,self,ucp,mlx5,sharp,rccl
make -j install
# Add the install's lib directory to the dynamic linker configuration and
# refresh the linker cache.
echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf
ldconfig
Expand Down
21 changes: 19 additions & 2 deletions .ci/scripts/run_tests_ucc_mpi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,20 @@ export MASTER_ADDR=${HEAD_NODE}

# Number of lines in the hostfile.
NNODES=$(wc --lines "$HOSTFILE" | awk '{print $1}')
DEV=""
CX7_DEV=""

# Walk the devices that `ibstat -l` lists on the head node and record:
#   DEV     - the first device whose first reported port state is Active
#   CX7_DEV - the first Active device whose CA type is MT4129
# Either variable stays empty when no matching device exists. The loop stops
# once CX7_DEV is found; DEV is always set by that point.
for d in $(ssh "$HEAD_NODE" "ibstat -l"); do
    # One ssh round trip per device; state and type are parsed from the same
    # output. A failed query leaves info empty so the device is simply skipped.
    info=$(ssh "$HEAD_NODE" "ibstat $d") || info=""
    # "State: Active" -> the value is the second field.
    state=$(awk '/State:/ {print $2; exit}' <<<"$info")
    # "CA type: MT4129" splits into three fields, so the model is the third.
    type=$(awk '/CA type:/ {print $3; exit}' <<<"$info")
    # The x-prefix keeps the test well-formed when a value is empty.
    if [ "x$state" == "xActive" ]; then
        if [ "x$DEV" == "x" ]; then
            DEV=$d
        fi
        if [ "x$type" == "xMT4129" ]; then
            CX7_DEV=$d
            break
        fi
    fi
done

Expand Down Expand Up @@ -104,6 +111,16 @@ for MT in "" "-T"; do
mpirun $(mpi_params $PPN 1) $ucx_tls_no_cuda_ipc $tlcuda_args $EXE $MT $TG --mtypes cuda -c $tlcuda_colls
echo "INFO: UCC MPI unit tests (TL/CUDA) ... DONE"

# TL/MLX5 alltoall run; it needs the device recorded in CX7_DEV.
echo "INFO: UCC MPI unit tests (TL/MLX5) ..."
# shellcheck disable=SC2086
if [ "x$CX7_DEV" == "x" ]; then
    # No device was recorded, so the mpirun below is not attempted.
    echo "No active CX7 devices found on $HEAD_NODE"
else
    # Environment settings passed to mpirun via -x. The device variable uses
    # the same TL_MLX5 prefix as UCC_TL_MLX5_TUNE.
    tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLX5_NET_DEVICES=$CX7_DEV -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 "
    tlmlx5_colls="alltoall"
    # Word splitting of the unquoted expansions is intentional here (SC2086 is
    # disabled above): each variable carries several mpirun/test arguments.
    mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128
fi
echo "INFO: UCC MPI unit tests (TL/MLX5) ... DONE"

echo "INFO: UCC MPI unit tests (CL/HIER) ..."
# shellcheck disable=SC2086
Expand Down

0 comments on commit 88623e5

Please sign in to comment.