-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
14 changed files
with
258 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# CI image for UCC: copies the build context over the UCC sources of a prebuilt
# base image, builds UCC, and switches to the swx-jenkins user.
# CUDA_VER is declared before FROM, so it is only usable in the FROM line.
ARG CUDA_VER='11.4.2'
# NOTE(review): the base tag says centos8 while the steps below use apt-get;
# confirm this is the intended base (an NGC PyTorch base is commented out below).
FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base
#FROM nvcr.io/nvidia/pytorch:23.10-py3

# SRC_DIR is not defined in this file; presumably it is exported by the base
# image -- TODO confirm.
# Drop any UCC sources already present and replace them with the build context.
RUN rm -rf ${SRC_DIR}/ucc
COPY . ${SRC_DIR}/ucc

# Install sudo and grant the swx-jenkins user passwordless sudo.
# apt-get is used instead of apt: apt's CLI is not meant for scripted use.
RUN apt-get update && apt-get install -y sudo && \
    echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir 'protobuf<=3.19.0'
#==============================================================================
# Build UCC
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
#==============================================================================
# Hand /opt/nvidia over to the swx-jenkins uid/gid (numeric ids, so this works
# before the user and group are created below)
RUN chown -R 6213:11429 /opt/nvidia
#==============================================================================
# Create the swx-jenkins group and user with fixed ids
RUN groupadd -g 11429 swx-jenkins
RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
#==============================================================================
USER swx-jenkins
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# CI image for UCC on the UBI8 base: copies the build context over the UCC
# sources of the base image, builds UCC and torch_ucc, and switches to the
# swx-jenkins user.
# NOTE(review): CUDA_VER is declared but not referenced by the FROM line below.
ARG CUDA_VER='12.2'
# NOTE(review): ':latest' is an unpinned tag; consider a fixed tag for
# reproducible builds once one is published for ucc_ubi8.
FROM ucc_ubi8:latest

# SRC_DIR is not defined in this file; presumably it is exported by the base
# image -- TODO confirm.
# Drop any UCC sources already present and replace them with the build context.
RUN rm -rf ${SRC_DIR}/ucc
COPY . ${SRC_DIR}/ucc

#RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
#    sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
# Install sudo and grant the swx-jenkins user passwordless sudo.
# 'yum clean all' runs in the same layer so the package cache is not kept.
RUN yum install -y sudo && \
    yum clean all && \
    echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
#==============================================================================
# Build UCC
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
#==============================================================================
# Install torch_ucc (UCC version) python module and build a wheel package
RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch_ucc.sh
# Hand /opt/nvidia over to the swx-jenkins uid/gid (numeric ids, so this works
# before the user and group are created below)
RUN chown -R 6213:11429 /opt/nvidia
#==============================================================================
# Create the swx-jenkins group and user with fixed ids
RUN groupadd -g 11429 swx-jenkins
RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
#==============================================================================
USER swx-jenkins
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# Base image for UCC CI on top of the NGC PyTorch container: installs the
# user-space MLNX_OFED stack, clones and builds UCX from source, configures
# SSH, and installs the DLRM and PARAM workloads.
# NOTE(review): CUDA_VER is declared but not referenced by the FROM line below.
ARG CUDA_VER='12.1.1'
FROM nvcr.io/nvidia/pytorch:23.11-py3
#==============================================================================
ARG NVIDIA_ROOT_DIR=/opt/nvidia
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
ENV SRC_DIR=${NVIDIA_ROOT_DIR}/src
ENV PKG_DIR=${NVIDIA_ROOT_DIR}/pkg
ENV BIN_DIR=${NVIDIA_ROOT_DIR}/bin
ENV WORKLOADS_DIR=${NVIDIA_ROOT_DIR}/workloads
ENV TORCH_UCC_GITHUB_URL=https://github.com/facebookresearch/torch_ucc.git
ENV TORCH_UCC_BRANCH=main
ENV CUDA_HOME=/usr/local/cuda
ENV UCX_GITHUB_URL=https://github.com/openucx/ucx.git
ENV UCX_BRANCH=master
ENV UCX_BUILD_TYPE=release-mt
ENV UCX_INSTALL_DIR=${BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
ENV UCC_INSTALL_DIR=${BIN_DIR}/ucc/build
ENV OFED_PKG='lsof kmod udev swig libelf1 libfuse2 pciutils tk gfortran libpci3 libusb-1.0-0 libltdl-dev libmnl0 bison tcl flex chrpath debhelper ethtool graphviz'
ENV PACKAGES='numactl openssh-server protobuf-compiler rdma-core vim libevent-dev build-essential git make autoconf libtool'
ENV OS_VERSION=ubuntu22.04
ENV PLATFORM=x86_64
ENV MOFED_VERSION=23.10-0.5.5.0
ENV MOFED_URL="https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${OS_VERSION}-${PLATFORM}.tgz"
ENV OMPI_PATH="/opt/hpcx/ompi"
#==============================================================================
# Install the MLNX_OFED prerequisites, then download, unpack and run the
# user-space-only OFED installer; the unpacked tree is removed in the same layer.
RUN apt-get update && apt-get install -y ${OFED_PKG} && \
    mkdir -p /tmp/ofed && wget --quiet -O /tmp/ofed/ofed.tgz ${MOFED_URL} && \
    tar -xvf /tmp/ofed/ofed.tgz --strip-components=2 -C /tmp/ofed && \
    /tmp/ofed/mlnxofedinstall --user-space-only --without-fw-update -q --distro ${OS_VERSION} --basic && \
    rm -rf /tmp/ofed

# Refresh the package index in this layer as well so the install does not
# depend on index files left behind by an earlier (possibly cached) layer.
RUN apt-get update && apt-get install -y ${PACKAGES}

# Remove old UCX
RUN rm -rf /opt/hpcx/uc?
ENV PATH=${OMPI_PATH}/bin:$PATH
# Append the Open MPI paths to /etc/bashrc. Both lines must go through echo;
# a bare 'export ... >> file' only sets the variable in the build shell and
# appends nothing to the file.
# NOTE(review): the appended lines expand $OMPI_PATH when the file is sourced,
# and Ubuntu's bash normally reads /etc/bash.bashrc -- confirm /etc/bashrc is
# actually sourced in this image.
RUN echo "export PATH=\"\$OMPI_PATH/bin:\$PATH\"" >> /etc/bashrc && \
    echo "export LD_LIBRARY_PATH=\"\$OMPI_PATH/lib64:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc
#==============================================================================
# Configure SSH: rewrite ssh_config with "StrictHostKeyChecking no", generate
# host keys and remove /run/nologin
RUN mkdir -p /var/run/sshd && \
    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
    ssh-keygen -A && \
    rm -f /run/nologin
#==============================================================================

#==============================================================================
# Create the directory layout and fetch the UCX sources at UCX_BRANCH
RUN mkdir -p ${SRC_DIR} ${PKG_DIR} ${BIN_DIR} ${WORKLOADS_DIR} && \
    cd ${SRC_DIR} && \
    mkdir -p ${SRC_DIR}/ucx && \
    git clone --recursive ${UCX_GITHUB_URL} ${SRC_DIR}/ucx && \
    cd ${SRC_DIR}/ucx && \
    git checkout ${UCX_BRANCH}

# The build context provides the .ci/scripts helpers used below.
COPY . ${SRC_DIR}/ucc
#==============================================================================
# Build UCX
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucx.sh
ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
#==============================================================================
# Install workloads
WORKDIR ${WORKLOADS_DIR}
# --no-cache-dir keeps pip's download cache out of the image layers.
RUN git clone https://github.com/facebookresearch/dlrm.git && \
    cd ${WORKLOADS_DIR}/dlrm && \
    pip3 install --no-cache-dir -r ${WORKLOADS_DIR}/dlrm/requirements.txt && \
    pip3 install --no-cache-dir tensorboard
RUN git clone https://github.com/facebookresearch/param.git && \
    pip3 install --no-cache-dir -r ${WORKLOADS_DIR}/param/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
# Base image for UCC CI on the NVIDIA CUDA UBI8 devel image: installs the
# user-space MLNX_OFED stack, Open MPI, CMake, clones torch_ucc and UCX,
# builds UCX, installs PyTorch and the DLRM and PARAM workloads.
# CUDA_VER is declared before FROM, so it is only usable in the FROM line.
ARG CUDA_VER='12.1.1'
FROM nvidia/cuda:${CUDA_VER}-devel-ubi8
#==============================================================================
ARG NVIDIA_ROOT_DIR=/opt/nvidia
ENV SRC_DIR=${NVIDIA_ROOT_DIR}/src
ENV PKG_DIR=${NVIDIA_ROOT_DIR}/pkg
ENV BIN_DIR=${NVIDIA_ROOT_DIR}/bin
ENV WORKLOADS_DIR=${NVIDIA_ROOT_DIR}/workloads
ENV TORCH_UCC_GITHUB_URL=https://github.com/facebookresearch/torch_ucc.git
ENV TORCH_UCC_BRANCH=main
ENV CUDA_HOME=/usr/local/cuda
ENV UCX_GITHUB_URL=https://github.com/openucx/ucx.git
ENV UCX_BRANCH=master
ENV UCX_BUILD_TYPE=release-mt
ENV UCX_INSTALL_DIR=${BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
ENV UCC_INSTALL_DIR=${BIN_DIR}/ucc/build
# key=value form throughout; the space-separated 'ENV key value' form is legacy.
ENV OFED_PKG='python36 tk pciutils-libs fuse-libs kernel-modules-extra libmnl wget numactl-libs gcc-gfortran'
ENV PACKAGES='numactl numactl-devel openssh-server protobuf-compiler protobuf-devel python3.8 python38-devel vim openmpi openmpi-devel hostname'
ENV OS_VERSION=rhel8.0
ENV PLATFORM=x86_64
ENV MOFED_VERSION=23.10-0.5.5.0
ENV MOFED_URL="https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${OS_VERSION}-${PLATFORM}.tgz"
ENV OMPI_PATH="/usr/lib64/openmpi"
#==============================================================================
# Repository definitions used by the yum steps below.
COPY .ci/build_base_docker/local.repo /etc/yum.repos.d/local.repo
# Install the toolchain and MLNX_OFED prerequisites, then download, unpack and
# run the user-space-only OFED installer. The unpacked tree and the yum cache
# are removed in the same layer.
RUN yum groupinstall -y \
    'Development Tools' && \
    yum install -y ${OFED_PKG} && \
    mkdir -p /tmp/ofed && wget --quiet -O /tmp/ofed/ofed.tgz ${MOFED_URL} && \
    tar -xvf /tmp/ofed/ofed.tgz --strip-components=2 -C /tmp/ofed && \
    /tmp/ofed/mlnxofedinstall --user-space-only --without-fw-update --basic -q --distro ${OS_VERSION} && \
    rm -rf /tmp/ofed && \
    yum clean all

# Install the remaining packages and make python3 resolve to python3.8.
RUN yum install -y ${PACKAGES} && \
    yum clean all && \
    update-alternatives --set python3 /usr/bin/python3.8

# Remove old UCX
RUN rpm -e --nodeps ucx
#ENV PATH=/usr/lib64/openmpi/bin:$PATH
ENV PATH=${OMPI_PATH}/bin:$PATH
# Append the Open MPI paths to /etc/bashrc. Both lines must go through echo;
# a bare 'export ... >> file' only sets the variable in the build shell and
# appends nothing to the file.
# NOTE(review): the appended lines expand $OMPI_PATH when the file is sourced;
# confirm OMPI_PATH is set in the shells that read /etc/bashrc.
RUN echo "export PATH=\"\$OMPI_PATH/bin:\$PATH\"" >> /etc/bashrc && \
    echo "export LD_LIBRARY_PATH=\"\$OMPI_PATH/lib:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc
# Install CMake 3.20.4 from the upstream self-extracting installer into /usr.
RUN cd /tmp && wget https://github.com/Kitware/CMake/releases/download/v3.20.4/cmake-3.20.4-linux-x86_64.sh && \
    chmod +x /tmp/cmake-3.20.4-linux-x86_64.sh && /tmp/cmake-3.20.4-linux-x86_64.sh --skip-license --prefix=/usr && \
    rm -f /tmp/cmake-3.20.4-linux-x86_64.sh
#==============================================================================
# Configure SSH: rewrite ssh_config with "StrictHostKeyChecking no", generate
# host keys and remove /run/nologin
RUN mkdir -p /var/run/sshd && \
    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
    ssh-keygen -A && \
    rm -f /run/nologin
#==============================================================================

#==============================================================================
# Create the directory layout, clone torch_ucc into SRC_DIR (still empty at
# this point, so git accepts it as a clone target) and UCX into SRC_DIR/ucx
RUN mkdir -p ${SRC_DIR} ${PKG_DIR} ${BIN_DIR} ${WORKLOADS_DIR} && \
    git clone ${TORCH_UCC_GITHUB_URL} ${SRC_DIR} && \
    cd ${SRC_DIR} && \
    git checkout ${TORCH_UCC_BRANCH} && \
    mkdir -p ${SRC_DIR}/ucx && \
    git clone --recursive ${UCX_GITHUB_URL} ${SRC_DIR}/ucx && \
    cd ${SRC_DIR}/ucx && \
    git checkout ${UCX_BRANCH}

# The build context provides the .ci/scripts helpers used below.
COPY . ${SRC_DIR}/ucc
#==============================================================================
# Build UCX
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucx.sh
ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
#==============================================================================
# Configure Python
RUN ${SRC_DIR}/ucc/.ci/scripts/configure_python.sh
#==============================================================================
# Install PyTorch
RUN ${SRC_DIR}/ucc/.ci/scripts/install_torch.sh
#==============================================================================
# Install workloads
WORKDIR ${WORKLOADS_DIR}
# --no-cache-dir keeps pip's download cache out of the image layers.
RUN git clone https://github.com/facebookresearch/dlrm.git && \
    cd ${WORKLOADS_DIR}/dlrm && \
    pip3 install --no-cache-dir -r ${WORKLOADS_DIR}/dlrm/requirements.txt && \
    pip3 install --no-cache-dir tensorboard
RUN git clone https://github.com/facebookresearch/param.git && \
    pip3 install --no-cache-dir -r ${WORKLOADS_DIR}/param/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# yum/dnf repository definitions pointing at the 8-upstream mirrors on the
# 'webrepo' host (appstream, baseos, codeready-builder, highavailability).
# NOTE(review): gpgcheck=0 disables package signature verification and the
# mirrors are reached over plain HTTP; confirm this is acceptable for the
# network these images are built on.
[Local_appStream]
name=Local_appStream
baseurl=http://webrepo/RH/rh-mirrors/8-upstream/appstream/x86_64/
enabled=1
gpgcheck=0

[Local_BaseOs]
name=Local_BaseOs
baseurl=http://webrepo/RH/rh-mirrors/8-upstream/baseos/x86_64/
enabled=1
gpgcheck=0

[Local_Builder]
name=Local_Builder
baseurl=http://webrepo/RH/rh-mirrors/8-upstream/codeready-builder/x86_64/
enabled=1
gpgcheck=0

[Local_High]
name=Local_High
baseurl=http://webrepo/RH/rh-mirrors/8-upstream/highavailability/x86_64/
enabled=1
gpgcheck=0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,10 @@ set -o pipefail | |
#conda uninstall -y pytorch torchvision | ||
#conda install pytorch torchvision cudatoolkit=11.0 -c pytorch-nightly | ||
#conda install pytorch cudatoolkit=11.0 -c pytorch-nightly | ||
|
||
ls /usr/local/lib64/python3.8/dist-packages/torch/lib -la | ||
ls -la /usr/local/lib64/python3.8/ | ||
pip3 install --default-timeout=900 numpy | ||
pip3 install --default-timeout=900 --pre torch -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html | ||
#pip3 install torch torchvision torchaudio | ||
pip3 install --default-timeout=900 --pre torch -f https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html | ||
#pip3 install --default-timeout=900 --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 | ||
pip3 install "git+https://github.com/mlperf/[email protected]" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.