Skip to content

Commit

Permalink
CI: NVIDIA hardening removed root (#816)
Browse files (browse the repository at this point in the history)
  • Loading branch information
B-a-S authored Sep 5, 2023
1 parent 25a3d95 commit 2929235
Show file tree
Hide file tree
Showing 7 changed files with 16 additions and 14 deletions.
7 changes: 7 additions & 0 deletions .ci/Dockerfile.centos8
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@ FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base
RUN rm -rf ${SRC_DIR}/ucc
COPY . ${SRC_DIR}/ucc

RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
RUN yum install -y sudo && \
echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
#==============================================================================
# Build UCC
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
#==============================================================================
# Install torch_ucc (UCC version) python module and build a wheel package
RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch_ucc.sh
RUN chown -R 6213:11429 /opt/nvidia
#==============================================================================
RUN groupadd -g 11429 swx-jenkins
RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
#==============================================================================
USER swx-jenkins

4 changes: 2 additions & 2 deletions .ci/job_matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ volumes:
}

env:
CUDA_VER: "11.4.2"
CUDA_VER: '11.4.2'
UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}"
NVIDIA_ROOT_DIR: "/opt/nvidia"
SRC_DIR: "${NVIDIA_ROOT_DIR}/src"
BIN_DIR: "${NVIDIA_ROOT_DIR}/bin"
DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all --user root"
DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all"

docker_opt: "$DOCKER_OPT"

Expand Down
3 changes: 0 additions & 3 deletions .ci/scripts/run_dlrm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ mpirun \
-np $NP \
--hostfile ${HOSTFILE} \
--map-by node \
--allow-run-as-root \
--mca plm_rsh_args '-p 12345' \
-x PATH \
-x LD_LIBRARY_PATH \
Expand All @@ -43,7 +42,6 @@ mpirun \
-np $NP \
--hostfile ${HOSTFILE} \
--map-by node \
--allow-run-as-root \
--mca plm_rsh_args '-p 12345' \
-x PATH \
-x LD_LIBRARY_PATH \
Expand All @@ -54,7 +52,6 @@ mpirun \
-np $NP \
--hostfile ${HOSTFILE} \
--map-by node \
--allow-run-as-root \
--mca plm_rsh_args '-p 12345' \
-x PATH \
-x LD_LIBRARY_PATH \
Expand Down
2 changes: 1 addition & 1 deletion .ci/scripts/run_dlrm_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ HEAD_NODE=$(head -1 "$HOSTFILE")
export HEAD_NODE

#sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh cpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh gpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_dlrm.sh gpu "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
10 changes: 4 additions & 6 deletions .ci/scripts/run_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,11 @@ DOCKER_RUN_ARGS="\
--cap-add=SYS_ADMIN \
--device=/dev/infiniband/ \
--gpus all \
--user root \
-it \
-d \
--rm \
--name=${DOCKER_CONTAINER_NAME} \
-v /labhome:/labhome \
-v /root/.ssh:/root/.ssh \
"

# shellcheck disable=SC2013
Expand Down Expand Up @@ -78,16 +76,16 @@ pdsh -w "${HOST_LIST}" -R ssh docker pull "${DOCKER_IMAGE_NAME}"
for HOST in $(cat "$HOSTFILE"); do
echo "INFO: start docker container on $HOST ..."
# shellcheck disable=SC2029
sudo ssh "$HOST" "docker run \
ssh "$HOST" "docker run \
${DOCKER_RUN_ARGS} \
${DOCKER_IMAGE_NAME} \
bash -c '/usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'"
bash -c 'sudo /usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'"
echo "INFO: start docker container on $HOST ... DONE"

sleep 5

echo "INFO: verify docker container on $HOST ..."
sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname
sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup
ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname
ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup
echo "INFO: verify docker container on $HOST ... DONE"
done
2 changes: 1 addition & 1 deletion .ci/scripts/run_tests_ucc_mpi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ function mpi_params {
nnodes=$NNODES
fi
echo "-np $((nnodes*ppn)) --oversubscribe --hostfile ${HOSTFILE} \
--map-by ppr:$ppn:node --bind-to socket --allow-run-as-root \
--map-by ppr:$ppn:node --bind-to socket \
-x PATH -x LD_LIBRARY_PATH --mca opal_common_ucx_opal_mem_hooks 1 --mca plm_rsh_args -p12345 \
--mca coll ^ucc,hcoll \
-x UCX_NET_DEVICES=$DEV:1"
Expand Down
2 changes: 1 addition & 1 deletion .ci/scripts/run_tests_ucc_mpi_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ fi
HEAD_NODE=$(head -1 "$HOSTFILE")
export HEAD_NODE

sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_mpi.sh "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"
ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_mpi.sh "/opt/nvidia/src/ucc/.ci/configs/${HEAD_NODE}/hostfile.txt"

0 comments on commit 2929235

Please sign in to comment.