Skip to content

Commit

Permalink
Merge branch 'master' into topic/fix_onesided_tests
Browse files Browse the repository at this point in the history
  • Loading branch information
manjugv authored Sep 6, 2023
2 parents 1bfdf76 + 2929235 commit ff2bfc9
Show file tree
Hide file tree
Showing 16 changed files with 254 additions and 36 deletions.
7 changes: 7 additions & 0 deletions .ci/Dockerfile.centos8
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@ FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base
RUN rm -rf ${SRC_DIR}/ucc
COPY . ${SRC_DIR}/ucc

RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y sudo && \
echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
#==============================================================================
# Build UCC
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
#==============================================================================
# Install torch_ucc (UCC version) python module and build a wheel package
RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch_ucc.sh
RUN chown -R 6213:11429 /opt/nvidia
#==============================================================================
RUN groupadd -g 11429 swx-jenkins
RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
#==============================================================================
USER swx-jenkins

7 changes: 4 additions & 3 deletions .ci/job_matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ volumes:
}

env:
CUDA_VER: "11.4.2"
CUDA_VER: '11.4.2'
UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}"
NVIDIA_ROOT_DIR: "/opt/nvidia"
SRC_DIR: "${NVIDIA_ROOT_DIR}/src"
BIN_DIR: "${NVIDIA_ROOT_DIR}/bin"
DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all --user root"
DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all"

docker_opt: "$DOCKER_OPT"

Expand Down Expand Up @@ -54,6 +54,7 @@ runs_on_dockers:
# bare metal
runs_on_agents:
- nodeLabel: "swx-clx01"
- nodeLabel: "ml-test-node-gpu"

timeout_minutes: 360

Expand All @@ -72,7 +73,7 @@ steps:
#============================================================================
- name: Run Coverity
credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0"
agentSelector: "{nodeLabel: 'swx-clx01'}"
agentSelector: "{nodeLabel: 'ml-test-node-gpu'}"
run: |
export UCC_PASSWORD=$UCC_PASSWORD
export UCC_USERNAME=$UCC_USERNAME
Expand Down
3 changes: 0 additions & 3 deletions .ci/scripts/coverity.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@ module load hpcx-gcc
module load dev/cuda12.1.1
module load dev/nccl_2.18.3-1_cuda12.1.1_"$(uname -i)"
module load tools/cov-2021.12
previous_date=$(date -d "yesterday" +'%Y-%m-%d')
HPCX_UCX_DIR=/hpc/local/benchmarks/daily/next/$previous_date/hpcx-gcc-redhat7/ucx
HPCX_SHARP_DIR=/hpc/local/benchmarks/daily/next/$previous_date/hpcx-gcc-redhat7/sharp
./autogen.sh
./configure --with-nccl --with-tls=cuda,nccl,self,sharp,shm,ucp,mlx5 --with-ucx="${HPCX_UCX_DIR}" --with-sharp="${HPCX_SHARP_DIR}"
make_opt="-j$(($(nproc) / 2 + 1))"
Expand Down
3 changes: 0 additions & 3 deletions .ci/scripts/run_dlrm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ mpirun \
-np $NP \
--hostfile ${HOSTFILE} \
--map-by node \
--allow-run-as-root \
--mca plm_rsh_args '-p 12345' \
-x PATH \
-x LD_LIBRARY_PATH \
Expand All @@ -43,7 +42,6 @@ mpirun \
-np $NP \
--hostfile ${HOSTFILE} \
--map-by node \
--allow-run-as-root \
--mca plm_rsh_args '-p 12345' \
-x PATH \
-x LD_LIBRARY_PATH \
Expand All @@ -54,7 +52,6 @@ mpirun \
-np $NP \
--hostfile ${HOSTFILE} \
--map-by node \
--allow-run-as-root \
--mca plm_rsh_args '-p 12345' \
-x PATH \
-x LD_LIBRARY_PATH \
Expand Down
2 changes: 1 addition & 1 deletion .ci/scripts/run_dlrm_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ HEAD_NODE=$(head -1 "$HOSTFILE")
export HEAD_NODE

#sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh cpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh gpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh gpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
10 changes: 4 additions & 6 deletions .ci/scripts/run_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,11 @@ DOCKER_RUN_ARGS="\
--cap-add=SYS_ADMIN \
--device=/dev/infiniband/ \
--gpus all \
--user root \
-it \
-d \
--rm \
--name=${DOCKER_CONTAINER_NAME} \
-v /labhome:/labhome \
-v /root/.ssh:/root/.ssh \
"

# shellcheck disable=SC2013
Expand Down Expand Up @@ -78,16 +76,16 @@ pdsh -w "${HOST_LIST}" -R ssh docker pull "${DOCKER_IMAGE_NAME}"
for HOST in $(cat "$HOSTFILE"); do
echo "INFO: start docker container on $HOST ..."
# shellcheck disable=SC2029
sudo ssh "$HOST" "docker run \
ssh "$HOST" "docker run \
${DOCKER_RUN_ARGS} \
${DOCKER_IMAGE_NAME} \
bash -c '/usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'"
bash -c 'sudo /usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'"
echo "INFO: start docker container on $HOST ... DONE"

sleep 5

echo "INFO: verify docker container on $HOST ..."
sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname
sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup
ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname
ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup
echo "INFO: verify docker container on $HOST ... DONE"
done
2 changes: 1 addition & 1 deletion .ci/scripts/run_tests_ucc_mpi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ function mpi_params {
nnodes=$NNODES
fi
echo "-np $((nnodes*ppn)) --oversubscribe --hostfile ${HOSTFILE} \
--map-by ppr:$ppn:node --bind-to socket --allow-run-as-root \
--map-by ppr:$ppn:node --bind-to socket \
-x PATH -x LD_LIBRARY_PATH --mca opal_common_ucx_opal_mem_hooks 1 --mca plm_rsh_args -p12345 \
--mca coll ^ucc,hcoll \
-x UCX_NET_DEVICES=$DEV:1"
Expand Down
2 changes: 1 addition & 1 deletion .ci/scripts/run_tests_ucc_mpi_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ fi
HEAD_NODE=$(head -1 "$HOSTFILE")
export HEAD_NODE

sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_mpi.sh "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_mpi.sh "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
6 changes: 5 additions & 1 deletion src/components/tl/mlx5/tl_mlx5_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t,
UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_mlx5_config->super,
params->context);
memcpy(&self->cfg, tl_mlx5_config, sizeof(*tl_mlx5_config));
self->sock = 0;
self->rcache = NULL;
self->shared_pd = NULL;
self->shared_ctx = NULL;
Expand Down Expand Up @@ -73,8 +74,11 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_mlx5_context_t)
tl_debug(self->super.super.lib, "failed to free ib ctx and pd");
};

/* self->sock is set to 0 when the context is initialised and is only
 * overwritten when a connected socket descriptor is stored in the context.
 * Close it only when it is non-zero: the previous negated test called
 * close(0) when no socket was stored and never closed a real socket,
 * leaking the descriptor. */
if (self->sock) {
    close(self->sock);
}

ucc_mpool_cleanup(&self->req_mp, 1);
close(self->sock);
}

UCC_CLASS_DEFINE(ucc_tl_mlx5_context_t, ucc_tl_context_t);
Expand Down
8 changes: 5 additions & 3 deletions src/components/tl/mlx5/tl_mlx5_pd.c
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ ucc_status_t ucc_tl_mlx5_socket_init(ucc_tl_mlx5_context_t *ctx,

static ucc_status_t client_recv_data(int *shared_cmd_fd,
uint32_t *shared_pd_handle,
const char *sock_path,
const char *sock_path, int *sock_p,
ucc_tl_mlx5_lib_t *lib)
{
struct sockaddr_storage sockaddr = {};
Expand Down Expand Up @@ -159,7 +159,8 @@ static ucc_status_t client_recv_data(int *shared_cmd_fd,
goto out;
}

return status;
*sock_p = sock;
return UCC_OK;

out:
if (close(sock) == -1) {
Expand Down Expand Up @@ -229,7 +230,8 @@ ucc_status_t ucc_tl_mlx5_share_ctx_pd(ucc_tl_mlx5_context_t *ctx,
ucc_status_t status;

if (!is_ctx_owner) {
status = client_recv_data(&ctx_fd, &pd_handle, sock_path, lib);
status =
client_recv_data(&ctx_fd, &pd_handle, sock_path, &ctx->sock, lib);
if (UCC_OK != status) {
tl_debug(lib, "failed to share ctx & pd from client side");
return status;
Expand Down
9 changes: 5 additions & 4 deletions src/components/tl/ucp/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ SUBDIRS = .
include makefile.coll_plugins.am
endif

allgather = \
allgather/allgather.h \
allgather/allgather.c \
allgather/allgather_ring.c \
allgather = \
allgather/allgather.h \
allgather/allgather.c \
allgather/allgather_ring.c \
allgather/allgather_neighbor.c \
allgather/allgather_knomial.c

allgatherv = \
Expand Down
19 changes: 19 additions & 0 deletions src/components/tl/ucp/allgather/allgather.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include "tl_ucp.h"
#include "allgather.h"

#define ALLGATHER_MAX_PATTERN_SIZE (sizeof(UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR))

ucc_base_coll_alg_info_t
ucc_tl_ucp_allgather_algs[UCC_TL_UCP_ALLGATHER_ALG_LAST + 1] = {
[UCC_TL_UCP_ALLGATHER_ALG_KNOMIAL] =
Expand All @@ -17,10 +19,27 @@ ucc_base_coll_alg_info_t
{.id = UCC_TL_UCP_ALLGATHER_ALG_RING,
.name = "ring",
.desc = "O(N) Ring"},
[UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR] =
{.id = UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR,
.name = "neighbor",
.desc = "O(N) Neighbor Exchange N/2 steps"},
[UCC_TL_UCP_ALLGATHER_ALG_LAST] = {
.id = 0, .name = NULL, .desc = NULL}};

/* Default allgather task initialisation: hands the task to the ring
 * algorithm's common init routine and returns its status unchanged. */
ucc_status_t ucc_tl_ucp_allgather_init(ucc_tl_ucp_task_t *task)
{
    ucc_status_t status = ucc_tl_ucp_allgather_ring_init_common(task);

    return status;
}

/**
 * Build the default allgather algorithm-selection string for a team.
 *
 * The string is produced from UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR,
 * whose "%d" placeholder is filled with the algorithm id chosen from the
 * team size: ring for an odd number of ranks, neighbor exchange for an
 * even number.
 *
 * @param team  team whose size selects the algorithm
 *
 * @return newly allocated, NUL-terminated string owned by the caller
 *         (presumably released with ucc_free -- confirm against callers),
 *         or NULL if the allocation fails.
 */
char *ucc_tl_ucp_allgather_score_str_get(ucc_tl_ucp_team_t *team)
{
    /* Buffer is sized from the format string itself (including its NUL). */
    int   max_size = ALLGATHER_MAX_PATTERN_SIZE;
    int   algo_num = UCC_TL_TEAM_SIZE(team) % 2
                         ? UCC_TL_UCP_ALLGATHER_ALG_RING
                         : UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR;
    char *str      = ucc_malloc(max_size * sizeof(char));

    /* Do not format into a failed allocation; report it to the caller. */
    if (!str) {
        return NULL;
    }

    ucc_snprintf_safe(str, max_size,
                      UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR, algo_num);
    return str;
}
27 changes: 20 additions & 7 deletions src/components/tl/ucp/allgather/allgather.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,17 @@
enum {
UCC_TL_UCP_ALLGATHER_ALG_KNOMIAL,
UCC_TL_UCP_ALLGATHER_ALG_RING,
UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR,
UCC_TL_UCP_ALLGATHER_ALG_LAST
};

extern ucc_base_coll_alg_info_t
ucc_tl_ucp_allgather_algs[UCC_TL_UCP_ALLGATHER_ALG_LAST + 1];
ucc_tl_ucp_allgather_algs[UCC_TL_UCP_ALLGATHER_ALG_LAST + 1];

#define UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR \
"allgather:0-4k:@0#allgather:4k-inf:@1"
"allgather:0-4k:@0#allgather:4k-inf:@%d"

char *ucc_tl_ucp_allgather_score_str_get(ucc_tl_ucp_team_t *team);

static inline int ucc_tl_ucp_allgather_alg_from_str(const char *str)
{
Expand All @@ -33,20 +36,30 @@ static inline int ucc_tl_ucp_allgather_alg_from_str(const char *str)

ucc_status_t ucc_tl_ucp_allgather_init(ucc_tl_ucp_task_t *task);

/* Ring */
ucc_status_t ucc_tl_ucp_allgather_ring_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t * team,
ucc_coll_task_t ** task_h);
ucc_base_team_t *team,
ucc_coll_task_t **task_h);

ucc_status_t ucc_tl_ucp_allgather_ring_init_common(ucc_tl_ucp_task_t *task);

void ucc_tl_ucp_allgather_ring_progress(ucc_coll_task_t *task);
void ucc_tl_ucp_allgather_ring_progress(ucc_coll_task_t *task);

ucc_status_t ucc_tl_ucp_allgather_ring_start(ucc_coll_task_t *task);

/* Neighbor Exchange */
ucc_status_t ucc_tl_ucp_allgather_neighbor_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *team,
ucc_coll_task_t **task_h);

void ucc_tl_ucp_allgather_neighbor_progress(ucc_coll_task_t *task);

ucc_status_t ucc_tl_ucp_allgather_neighbor_start(ucc_coll_task_t *task);

/* Uses allgather_kn_radix from config */
ucc_status_t ucc_tl_ucp_allgather_knomial_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t * team,
ucc_coll_task_t ** task_h);
ucc_base_team_t *team,
ucc_coll_task_t **task_h);

/* Internal interface with custom radix */
ucc_status_t ucc_tl_ucp_allgather_knomial_init_r(
Expand Down
Loading

0 comments on commit ff2bfc9

Please sign in to comment.