Skip to content

Commit

Permalink
Merge branch 'master' into nsarka/sliding-window-allreduce-stubs
Browse files Browse the repository at this point in the history
  • Loading branch information
nsarka authored Jan 23, 2024
2 parents 4772b81 + 75ecf74 commit 0935db0
Show file tree
Hide file tree
Showing 131 changed files with 4,706 additions and 1,201 deletions.
20 changes: 20 additions & 0 deletions .ci/Dockerfile.ngc_pytorch
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# CI image for UCC: copies the build context over the prebuilt base image,
# builds UCC, and finishes as the unprivileged swx-jenkins user.
# CUDA_VER is interpolated into the base image reference in FROM.
ARG CUDA_VER='12.1.1'
FROM harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda${CUDA_VER}:base
# Replace any UCC sources already present with the current build context.
# NOTE(review): SRC_DIR is not set in this file; presumably inherited from the base image - confirm.
RUN rm -rf ${SRC_DIR}/ucc
COPY . ${SRC_DIR}/ucc

# Install sudo and grant swx-jenkins passwordless sudo.
# apt-get is used instead of apt because the apt command line is not stable for scripted use.
RUN apt-get update && apt-get install -y sudo && \
    echo "swx-jenkins ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
RUN pip install 'protobuf<=3.19.0'
#==============================================================================
# Build UCC
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh
#==============================================================================
# Hand /opt/nvidia to the CI account; numeric IDs are used because the
# swx-jenkins user and group are only created in the following steps.
RUN chown -R 6213:11429 /opt/nvidia
#==============================================================================
# Create the swx-jenkins group and user with the same GID/UID as the chown above.
RUN groupadd -g 11429 swx-jenkins
RUN adduser --no-create-home --uid 6213 --gid 11429 --home /labhome/swx-jenkins swx-jenkins
#==============================================================================
USER swx-jenkins

69 changes: 69 additions & 0 deletions .ci/build_base_docker/Dockerfile.ngc_pytorch.base
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Base image for UCC CI, built on top of the NGC PyTorch container.
# NOTE(review): CUDA_VER is declared but not referenced in the visible part of this file - confirm it is still needed.
ARG CUDA_VER='12.1.1'
FROM nvcr.io/nvidia/pytorch:23.11-py3
#==============================================================================
# Root of the directory layout; the SRC/PKG/BIN/WORKLOADS directories below all derive from it.
ARG NVIDIA_ROOT_DIR=/opt/nvidia
# Set as ENV, so these also persist into containers started from the image.
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
ENV SRC_DIR=${NVIDIA_ROOT_DIR}/src
ENV PKG_DIR=${NVIDIA_ROOT_DIR}/pkg
ENV BIN_DIR=${NVIDIA_ROOT_DIR}/bin
ENV WORKLOADS_DIR=${NVIDIA_ROOT_DIR}/workloads
ENV CUDA_HOME=/usr/local/cuda
# UCX source location, branch to check out, and build flavour; the install
# directory name embeds the build type.
ENV UCX_GITHUB_URL=https://github.com/openucx/ucx.git
ENV UCX_BRANCH=master
ENV UCX_BUILD_TYPE=release-mt
ENV UCX_INSTALL_DIR=${BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
ENV UCC_INSTALL_DIR=${BIN_DIR}/ucc/build
# Package lists consumed by the apt install steps further down:
# OFED_PKG is installed before the MOFED installer runs, PACKAGES afterwards.
ENV OFED_PKG='lsof kmod udev swig libelf1 libfuse2 pciutils tk gfortran libpci3 libusb-1.0-0 libltdl-dev libmnl0 bison tcl flex chrpath debhelper ethtool graphviz'
ENV PACKAGES='numactl openssh-server protobuf-compiler rdma-core vim libevent-dev build-essential git make autoconf libtool'
# MOFED_URL is assembled from the version, OS and platform values below.
ENV OS_VERSION=ubuntu22.04
ENV PLATFORM=x86_64
ENV MOFED_VERSION=23.10-0.5.5.0
ENV MOFED_URL="https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${OS_VERSION}-${PLATFORM}.tgz"
# Open MPI prefix; its bin directory is prepended to PATH later in this file.
ENV OMPI_PATH="/opt/hpcx/ompi"
#==============================================================================
# Install the MOFED prerequisites, download the MLNX_OFED bundle, and install
# only its user-space components. The unpacked installer is removed in the same
# layer so it does not stay in the image.
# apt-get is used instead of apt because the apt command line is not stable for scripted use.
RUN apt-get update && apt-get install -y ${OFED_PKG} && \
    mkdir -p /tmp/ofed && wget --quiet -O /tmp/ofed/ofed.tgz ${MOFED_URL} && \
    tar -xvf /tmp/ofed/ofed.tgz --strip-components=2 -C /tmp/ofed && \
    /tmp/ofed/mlnxofedinstall --user-space-only --without-fw-update -q --distro ${OS_VERSION} --basic && \
    rm -rf /tmp/ofed

# Additional tooling; this step reuses the package index fetched by the RUN above.
RUN apt-get install -y ${PACKAGES}

# Remove old UCX
# (the uc? glob removes every three-character uc* entry under /opt/hpcx, e.g. ucx and ucc)
RUN rm -rf /opt/hpcx/uc?
ENV PATH=${OMPI_PATH}/bin:$PATH
# Record the Open MPI paths in /etc/bashrc. Both lines must go through echo:
# a bare `export ... >> file` only sets the variable in the build shell and
# appends nothing to the file.
# NOTE(review): the PATH entry written here is $OMPI_PATH without /bin, and the
# library directory is lib64 - confirm both against the actual Open MPI layout.
RUN echo "export PATH=\"\$OMPI_PATH:\$PATH\"" >> /etc/bashrc && \
    echo "export LD_LIBRARY_PATH=\"\$OMPI_PATH/lib64:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc
#==============================================================================
# SSH setup: create the sshd runtime directory, rewrite the client config so
# that its only StrictHostKeyChecking line is "no", generate the host keys,
# and remove /run/nologin.
RUN mkdir -p /var/run/sshd && \
    grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
    ssh-keygen -A && \
    rm -f /run/nologin
#==============================================================================

#==============================================================================
# Create the directory layout and fetch UCX (with submodules) into
# ${SRC_DIR}/ucx, then switch the clone to the configured branch.
RUN mkdir -p ${SRC_DIR}/ucx ${PKG_DIR} ${BIN_DIR} ${WORKLOADS_DIR} && \
    git clone --recursive ${UCX_GITHUB_URL} ${SRC_DIR}/ucx && \
    git -C ${SRC_DIR}/ucx checkout ${UCX_BRANCH}

# The UCC tree is copied in because the UCX build script lives under its .ci directory.
COPY . ${SRC_DIR}/ucc
#==============================================================================
# Build UCX
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucx.sh
ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
#==============================================================================
# Install workloads
# Each repository is cloned under WORKLOADS_DIR (the working directory set
# here) and its Python requirements are installed with pip3.
WORKDIR ${WORKLOADS_DIR}
# DLRM: install its requirements from inside the checkout, plus tensorboard.
RUN git clone https://github.com/facebookresearch/dlrm.git && \
cd ${WORKLOADS_DIR}/dlrm && \
pip3 install -r ${WORKLOADS_DIR}/dlrm/requirements.txt && \
pip3 install tensorboard
# PARAM: install its requirements; unlike DLRM this runs from WORKLOADS_DIR itself.
RUN git clone https://github.com/facebookresearch/param.git && \
pip3 install -r ${WORKLOADS_DIR}/param/requirements.txt
11 changes: 3 additions & 8 deletions .ci/job_matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ volumes:
}

env:
CUDA_VER: '11.4.2'
CUDA_VER: '12.1.1'
UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}"
NVIDIA_ROOT_DIR: "/opt/nvidia"
Expand All @@ -42,8 +42,8 @@ credentials:

runs_on_dockers:
- {
file: ".ci/Dockerfile.centos8",
name: "centos8",
file: ".ci/Dockerfile.ngc_pytorch",
name: "ngc_pytorch",
tag: "${BUILD_NUMBER}",
arch: "x86_64",
uri: "${UCC_URI_SUFFIX}",
Expand All @@ -69,7 +69,6 @@ steps:
docker pull ${DOCKER_IMAGE_NAME}
docker create -ti --rm $DOCKER_OPT ${DOCKER_IMAGE_NAME} /bin/bash > ${WORKSPACE}/ucc_docker.id
docker start $(cat ${WORKSPACE}/ucc_docker.id)
#============================================================================
- name: Run Coverity
credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0"
Expand All @@ -80,17 +79,13 @@ steps:
echo "Running coverity"
${WORKSPACE}/.ci/scripts/coverity.sh
archiveArtifacts: .ci/scripts/cov-build/*

#============================================================================
- name: Run UCC / Torch-UCC tests
agentSelector: "{nodeLabel: 'swx-clx01'}"
run: |
echo "INFO: Run UCC tests"
hostname
docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_ucc.sh"
echo "INFO: Run Torch-UCC tests (UCC)"
docker exec $(cat ${WORKSPACE}/ucc_docker.id) bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_torch_ucc.sh"
always: |
docker rm --force $(cat ${WORKSPACE}/ucc_docker.id)
#============================================================================
Expand Down
4 changes: 4 additions & 0 deletions .ci/scripts/env.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#!/bin/bash -eEx

export PATH="/opt/hpcx/ompi/bin:$PATH"
export LD_LIBRARY_PATH="/opt/hpcx/ompi/lib:${LD_LIBRARY_PATH}"
export OPAL_PREFIX=/opt/hpcx/ompi

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"

# shellcheck disable=SC2034
Expand Down
2 changes: 1 addition & 1 deletion .ci/scripts/run_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ DOCKER_RUN_ARGS="\
-d \
--rm \
--name=${DOCKER_CONTAINER_NAME} \
-v /labhome:/labhome \
-v /labhome/swx-jenkins:/labhome/swx-jenkins \
"

# shellcheck disable=SC2013
Expand Down
3 changes: 0 additions & 3 deletions .ci/scripts/run_tests_ucc_mpi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ if [ -z "$HOSTFILE" ]; then
exit 1
fi

export PATH="/usr/lib64/openmpi/bin:$PATH"
export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}"

HEAD_NODE=$(head -1 "$HOSTFILE")
export HEAD_NODE
export MASTER_ADDR=${HEAD_NODE}
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/clang-tidy-nvidia.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on: [push, pull_request]
env:
OPEN_UCX_LINK: https://github.com/openucx/ucx
OPEN_UCX_BRANCH: master
HPCX_LINK: http://content.mellanox.com/hpc/hpc-x/v2.13/hpcx-v2.13-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.12-x86_64.tbz
HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.17.1rc2/hpcx-v2.17.1-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64.tbz
CLANG_VER: 12
MLNX_OFED_VER: 5.9-0.5.6.0
CUDA_VER: 11-4
Expand Down Expand Up @@ -33,7 +33,7 @@ jobs:
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
sudo dpkg -i cuda-keyring_1.0-1_all.deb
sudo apt-get update
sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER}
sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER}
- name: Get UCX
run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
- name: Build UCX
Expand All @@ -45,8 +45,8 @@ jobs:
run: |
cd /tmp
wget ${HPCX_LINK}
tar xjf hpcx-v2.13-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.12-x86_64.tbz
mv hpcx-v2.13-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.12-x86_64 hpcx
tar xjf hpcx-v2.17.1-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64.tbz
mv hpcx-v2.17.1-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64 hpcx
- uses: actions/checkout@v1
- name: Build UCC
run: |
Expand Down
25 changes: 25 additions & 0 deletions .github/workflows/hpcsdk.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Build-only workflow: configures and installs UCC with the nvc/nvc++ compilers
# inside the nvhpc container on every push and pull request.
name: HPC_SDK

on: [push, pull_request]

# Locations passed to ./configure below.
# NOTE(review): presumably these are paths inside the nvhpc container image - confirm they match the image tag.
env:
HPCXDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/hpcx/latest/
NCCLDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/nccl/
CUDADIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/cuda/12.2/

jobs:
build:
runs-on: ubuntu-20.04
container:
image: nvcr.io/nvidia/nvhpc:23.9-devel-cuda12.2-ubuntu22.04
steps:
- name: Install dependencies
run: |
apt-get update
apt-get install -y --no-install-recommends libiberty-dev
- uses: actions/checkout@v1
# Configure with MPI, SHARP and UCX taken from HPCXDIR, CUDA from CUDADIR and
# NCCL from NCCLDIR; device code is generated for sm_80 only.
- name: Build UCC
run: |
./autogen.sh
CC=nvc CXX=nvc++ ./configure --with-tls=ucp,mlx5,cuda,self,nccl,sharp --with-mpi=${HPCXDIR}/ompi --with-sharp=${HPCXDIR}/sharp --with-ucx=${HPCXDIR}/ucx --with-cuda=${CUDADIR} --with-nccl=${NCCLDIR} --with-nvcc-gencode="-gencode=arch=compute_80,code=sm_80"
make -j`nproc` install
1 change: 1 addition & 0 deletions config/m4/sharp.m4
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ AS_IF([test "x$with_sharp" != "xno"],
AC_SUBST(SHARP_LDFLAGS, "-lsharp_coll -L$check_sharp_dir/lib")
AC_CHECK_DECLS([SHARP_COLL_HIDE_ERRORS], [], [], [[#include <sharp/api/sharp_coll.h>]])
AC_CHECK_DECLS([SHARP_COLL_DISABLE_LAZY_GROUP_RESOURCE_ALLOC], [], [], [[#include <sharp/api/sharp_coll.h>]])
AC_CHECK_DECLS([sharp_coll_do_reduce_scatter], [], [], [[#include <sharp/api/sharp_coll.h>]])
],
[
AS_IF([test "x$with_sharp" != "xguess"],
Expand Down
21 changes: 20 additions & 1 deletion config/m4/ucx.m4
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2001-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2001-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# See file LICENSE for terms.
#

Expand Down Expand Up @@ -113,6 +113,25 @@ AS_IF([test "x$ucx_checked" != "xyes"],[
[AC_DEFINE([UCS_HAVE_CONFIG_GLOBAL_LIST_ENTRY_FLAGS], [1], [flags for config table])],
[],
[#include <ucs/config/parser.h>])
AC_CHECK_MEMBER(ucs_rcache_region_t.alignment,
[AC_DEFINE([UCS_HAVE_RCACHE_REGION_ALIGNMENT], [1], [flags for ucs_rcache_get])],
[],
[#include <ucs/memory/rcache.h>])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([[#include <ucs/config/parser.h>
int main(int argc, char** argv) {
ucs_config_parser_set_value(NULL, NULL, NULL, NULL, NULL);
return 0;
} ]])],
[AC_DEFINE([UCS_HAVE_PARSER_SET_VALUE_TABLE_PREFIX], [1], [flags for ucs_rcache_get])],
[])
AC_CHECK_MEMBER(ucs_config_parser_t.doc,
[AC_DEFINE([UCS_HAVE_PARSER_CONFIG_DOC], [1], [flags for ucs_rcache_get])],
[],
[#include <ucs/memory/rcache.h>])
],
[
AS_IF([test "x$with_ucx" != "xguess"],
Expand Down
113 changes: 57 additions & 56 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -36,62 +36,63 @@ nobase_dist_libucc_la_HEADERS = \
ucc/api/ucc_version.h \
ucc/api/ucc_status.h

noinst_HEADERS = \
core/ucc_global_opts.h \
core/ucc_lib.h \
core/ucc_context.h \
core/ucc_team.h \
core/ucc_ee.h \
core/ucc_progress_queue.h \
core/ucc_service_coll.h \
core/ucc_dt.h \
schedule/ucc_schedule.h \
schedule/ucc_schedule_pipelined.h \
coll_score/ucc_coll_score.h \
utils/arch/aarch64/cpu.h \
utils/arch/ppc64/cpu.h \
utils/arch/riscv64/cpu.h \
utils/arch/x86_64/cpu.h \
utils/arch/cpu.h \
utils/arch/cuda_def.h \
utils/ucc_compiler_def.h \
utils/ucc_log.h \
utils/ucc_parser.h \
utils/ucc_component.h \
utils/ucc_datastruct.h \
utils/ucc_math.h \
utils/ucc_coll_utils.h \
utils/ucc_list.h \
utils/ucc_string.h \
utils/ucc_queue.h \
utils/ucc_proc_info.h \
utils/khash.h \
utils/ini.h \
utils/ucc_spinlock.h \
utils/ucc_mpool.h \
utils/ucc_rcache.h \
utils/profile/ucc_profile.h \
utils/profile/ucc_profile_on.h \
utils/profile/ucc_profile_off.h \
utils/ucc_time.h \
utils/ucc_sys.h \
utils/ucc_assert.h \
components/base/ucc_base_iface.h \
components/cl/ucc_cl.h \
components/cl/ucc_cl_log.h \
components/cl/ucc_cl_type.h \
components/tl/ucc_tl.h \
components/tl/ucc_tl_log.h \
components/mc/ucc_mc.h \
components/mc/base/ucc_mc_base.h \
components/mc/ucc_mc_log.h \
components/ec/ucc_ec.h \
components/ec/base/ucc_ec_base.h \
components/ec/ucc_ec_log.h \
coll_patterns/recursive_knomial.h \
coll_patterns/sra_knomial.h \
coll_patterns/bruck_alltoall.h \
components/topo/ucc_topo.h \
noinst_HEADERS = \
core/ucc_global_opts.h \
core/ucc_lib.h \
core/ucc_context.h \
core/ucc_team.h \
core/ucc_ee.h \
core/ucc_progress_queue.h \
core/ucc_service_coll.h \
core/ucc_dt.h \
schedule/ucc_schedule.h \
schedule/ucc_schedule_pipelined.h \
coll_score/ucc_coll_score.h \
utils/arch/aarch64/cpu.h \
utils/arch/ppc64/cpu.h \
utils/arch/riscv64/cpu.h \
utils/arch/x86_64/cpu.h \
utils/arch/cpu.h \
utils/arch/cuda_def.h \
utils/ucc_compiler_def.h \
utils/ucc_log.h \
utils/ucc_parser.h \
utils/ucc_component.h \
utils/ucc_datastruct.h \
utils/ucc_math.h \
utils/ucc_coll_utils.h \
utils/ucc_list.h \
utils/ucc_string.h \
utils/ucc_queue.h \
utils/ucc_proc_info.h \
utils/khash.h \
utils/ini.h \
utils/ucc_spinlock.h \
utils/ucc_mpool.h \
utils/ucc_rcache.h \
utils/profile/ucc_profile.h \
utils/profile/ucc_profile_on.h \
utils/profile/ucc_profile_off.h \
utils/ucc_time.h \
utils/ucc_sys.h \
utils/ucc_assert.h \
components/base/ucc_base_iface.h \
components/cl/ucc_cl.h \
components/cl/ucc_cl_log.h \
components/cl/ucc_cl_type.h \
components/tl/ucc_tl.h \
components/tl/ucc_tl_log.h \
components/mc/ucc_mc.h \
components/mc/base/ucc_mc_base.h \
components/mc/ucc_mc_log.h \
components/ec/ucc_ec.h \
components/ec/base/ucc_ec_base.h \
components/ec/ucc_ec_log.h \
coll_patterns/recursive_knomial.h \
coll_patterns/sra_knomial.h \
coll_patterns/bruck_alltoall.h \
coll_patterns/double_binary_tree.h \
components/topo/ucc_topo.h \
components/topo/ucc_sbgp.h

libucc_la_SOURCES = \
Expand Down
Loading

0 comments on commit 0935db0

Please sign in to comment.