Skip to content

Commit

Permalink
Merge branch 'main' into ko3n1g/ci/multi-tenancy-for-tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ko3n1g authored May 16, 2024
2 parents 226245e + 526b6ad commit 32950d4
Show file tree
Hide file tree
Showing 63 changed files with 2,757 additions and 1,082 deletions.
260 changes: 108 additions & 152 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ name: "CICD NeMo"

on:
pull_request:
branches: [ "main" ]
branches:
- 'main'
- 'r**'
types: [ labeled ]

concurrency:
Expand Down Expand Up @@ -73,92 +75,45 @@ jobs:
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}

- name: Container setup
run: |
# Pull base PyTorch container
docker pull nvcr.io/nvidia/pytorch:24.02-py3
docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.02-py3 /bin/bash -c '
set -x
# PyTorch version
python -c "import torch; print(torch.__version__)"
python -c "import torchvision; print(torchvision.__version__)"
# Install test requirements
apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt && pip install -r requirements/requirements_lightning.txt
# Code formatting checks
python setup.py style
# Copyright Headers check
python tests/check_copyright_header.py --dir .
# NeMo Installation
./reinstall.sh release
# Transformer Engine installation
git clone https://github.com/NVIDIA/TransformerEngine.git && \
pushd TransformerEngine && \
git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . && \
popd
# Apex installation
git clone https://github.com/NVIDIA/apex.git && \
pushd apex && \
git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \
cp -R apex /usr/local/lib/python3.10/dist-packages && \
popd
# pip package should be working with main, if not we can update the commit here
# until the pip package is updated
# Megatron Core installation
git clone https://github.com/NVIDIA/Megatron-LM.git && \
pushd Megatron-LM && \
git checkout c90aa1671fc0b97f80fa6c3bb892ce6f8e88e7c9 && \
pip install . && \
pushd megatron/core/datasets && \
make && \
popd && \
popd
export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
# Install only for test: L2: Segmentation Tool
pushd tools/ctc_segmentation && \
pip install -r requirements.txt && \
apt-get update && apt-get install libsox-fmt-all -y && \
popd
# AMMO installation
pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir
# PyTorch Lightning version
python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"
# PyTorch Lightning DDP Checks
CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"
# Basic Import Checks
python -c "import nemo.collections.asr as nemo_asr"
python -c "import nemo.collections.nlp as nemo_nlp"
python -c "import nemo.collections.tts as nemo_tts"
# set permission
chmod 777 -R /workspace
'
### \'\'
- name: Push container to registry for future use

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
# We use `docker` driver as this speeds things up for
# trivial (non-multi-stage) builds.
driver: docker

- name: Build and push
uses: docker/build-push-action@v5
with:
file: Dockerfile.ci
push: true
cache-from: nemoci.azurecr.io/nemo_container:latest
cache-to: type=inline
tags: |
nemoci.azurecr.io/nemo_container_${{ github.run_id }}
nemoci.azurecr.io/nemo_container:latest
- name: Run some checks
run: |
# Push container
echo "Docker: List containers" && docker ps -a
DOCKER_COMMIT=$(docker ps --latest --quiet) # latest container
docker commit $DOCKER_COMMIT nemoci.azurecr.io/nemo_container_${{ github.run_id }}
docker tag nemoci.azurecr.io/nemo_container_${{ github.run_id }} nemoci.azurecr.io/nemo_container_${{ github.run_id }}
docker push nemoci.azurecr.io/nemo_container_${{ github.run_id }}
docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '\
# PyTorch Lightning version
python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"
# PyTorch Lightning DDP Checks
CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"
# Basic Import Checks
python -c "import nemo.collections.asr as nemo_asr"
python -c "import nemo.collections.nlp as nemo_nlp"
python -c "import nemo.collections.tts as nemo_tts"
python setup.py style
python tests/check_copyright_header.py --dir .
# These checks are not crucial
exit 0
'
# - name: Build and push to local registry
# uses: docker/build-push-action@v5
Expand Down Expand Up @@ -399,7 +354,7 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_llama_quantization.py \
python examples/nlp/language_modeling/megatron_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.algorithm=null \
model_save=/home/TestData/nlp/megatron_llama/ci_baseline
Expand All @@ -408,69 +363,70 @@ jobs:
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

# L2_PTQ_Llama2_FP8:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_llama_quantization.py \
# model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# tensor_model_parallel_size=2 \
# trainer.devices=2 \
# quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
# quantization.algorithm=fp8 \
# quantization.num_calib_size=8 \
# inference.batch_size=2 \
# export.inference_tensor_parallel=2 \
# model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo

# rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

# L2_PTQ_Llama2_INT8_SQ:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_llama_quantization.py \
# model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
# quantization.algorithm=int8_sq \
# quantization.num_calib_size=8 \
# inference.batch_size=2 \
# model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo

# rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
# - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

# FP8 post-training quantization smoke test: runs megatron_quantization.py on
# the CI Llama checkpoint with tensor parallelism 2 and exports a .qnemo file.
L2_PTQ_Llama2_FP8:
  needs: [cicd-test-container-setup]
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    # Image pushed by the container-setup job for this workflow run.
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    options:
      # --user 0:128
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - run: |
        # The export lands on the host-mounted TestData volume. Register the
        # cleanup as an EXIT trap so it also runs when quantization fails;
        # a trailing `rm -rf` would be skipped because the step shell aborts
        # on the first failing command.
        trap 'rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' EXIT
        python examples/nlp/language_modeling/megatron_quantization.py \
          model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
          tensor_model_parallel_size=2 \
          trainer.devices=2 \
          quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
          quantization.algorithm=fp8 \
          quantization.num_calib_size=8 \
          inference.batch_size=2 \
          export.inference_tensor_parallel=2 \
          model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
    # Cancel the rest of the workflow as soon as this job fails.
    - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
      if: "failure()"

# INT8 SmoothQuant (int8_sq) post-training quantization smoke test: runs
# megatron_quantization.py on the CI Llama checkpoint and exports a .qnemo file.
L2_PTQ_Llama2_INT8_SQ:
  needs: [cicd-test-container-setup]
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    # Image pushed by the container-setup job for this workflow run.
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    options:
      # --user 0:128
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - run: |
        # The export lands on the host-mounted TestData volume. Register the
        # cleanup as an EXIT trap so it also runs when quantization fails;
        # a trailing `rm -rf` would be skipped because the step shell aborts
        # on the first failing command.
        trap 'rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' EXIT
        python examples/nlp/language_modeling/megatron_quantization.py \
          model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
          quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
          quantization.algorithm=int8_sq \
          quantization.num_calib_size=8 \
          inference.batch_size=2 \
          model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
    # Cancel the rest of the workflow as soon as this job fails.
    - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
      if: "failure()"

# TODO: investigate int4_awq stuck issues and restore the test
#L2_PTQ_Llama2_INT4_AWQ:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
Expand All @@ -489,7 +445,7 @@ jobs:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_llama_quantization.py \
# python examples/nlp/language_modeling/megatron_quantization.py \
# model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# tensor_model_parallel_size=1 \
# trainer.devices=1 \
Expand Down Expand Up @@ -6489,7 +6445,7 @@ jobs:
Speech_Checkpoints_tests:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
timeout-minutes: 20
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
Expand Down
2 changes: 0 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,6 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec
RUN pip install flash-attn
# install numba for latest containers
# The requirement specifier is quoted: in shell-form RUN an unquoted
# `numba>=0.57.1` is parsed as `pip install numba` with stdout redirected to a
# file named `=0.57.1`, which silently drops the minimum-version constraint.
RUN pip install "numba>=0.57.1"
# install ammo
RUN pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir

# copy nemo source into a scratch image
FROM scratch as nemo-src
Expand Down
74 changes: 74 additions & 0 deletions Dockerfile.ci
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# syntax=docker/dockerfile:1-labs

# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Base image for the CI container; override with `--build-arg BASE_IMAGE=...`.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
FROM ${BASE_IMAGE}

# Environment defaults baked into the image for every process run in it.
ENV TRANSFORMERS_OFFLINE=0 \
    HYDRA_FULL_ERROR=1 \
    PYTHONUNBUFFERED=1

# APT packages
# Installs bc and libsox-fmt-all in a single layer. The script runs under
# `bash -ex`, so any failing command aborts the build and every command is
# echoed to the build log.
RUN <<"EOF" bash -ex
apt-get update
apt-get install -y bc libsox-fmt-all
apt-get clean
EOF

# All following relative paths (pip installs, git clone, COPY) resolve here.
WORKDIR /workspace

# Install NeMo requirements
# Pinned revisions/versions of the third-party dependencies; each one can be
# overridden at build time with `--build-arg`.
ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e
ARG MODELOPT_VERSION=0.11.0
ARG MCORE_TAG=c90aa1671fc0b97f80fa6c3bb892ce6f8e88e7c9
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
# Only the bind-mounted paths below are visible to this step; the full source
# tree is copied in a later instruction. Presumably this keeps the dependency
# layer cacheable independently of ordinary source changes -- TODO confirm.
RUN \
  --mount=type=bind,source=requirements,target=requirements \
  --mount=type=bind,source=tools,target=tools \
  --mount=type=bind,source=setup.py,target=setup.py \
  --mount=type=bind,source=nemo/package_info.py,target=nemo/package_info.py \
  --mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \
  "transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \
  "megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
  "nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \
  "apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \
  -r tools/ctc_segmentation/requirements.txt \
  ".[all]"

# Megatron Core installation
# Check out the pinned revision and run `make` in megatron/core/datasets.
# Each command is a separate statement on purpose: under `bash -e`, a failure
# inside an `a && b && c` chain (anything but the last command) does NOT abort
# the script, so a broken checkout or `make` would otherwise go unnoticed.
# PYTHONPATH is not exported here because it would not outlive this RUN step;
# the ENV instruction at the end of this file makes the checkout importable.
git clone https://github.com/NVIDIA/Megatron-LM.git
git -C Megatron-LM checkout "${MCORE_TAG}"
make -C Megatron-LM/megatron/core/datasets
EOF

# Bring the complete build context (the NeMo sources) into the working directory.
COPY . .
# Install NeMo with all extras from the copied tree, then make /workspace
# readable, writable and executable for every user.
RUN <<"EOF" bash -ex
pip install --no-cache-dir --no-build-isolation ".[all]"

# open up permissions on the whole workspace
chmod -R 777 /workspace
EOF

# Append the Megatron-LM checkout (cloned into /workspace during the dependency
# install step) to PYTHONPATH for every later layer and for containers started
# from this image.
ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

Loading

0 comments on commit 32950d4

Please sign in to comment.