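# Text captured from the GitHub Actions web page, kept as comments so that
# the workflow below parses as YAML:
#   Skip to content
#   add v100 on 1424
#   add v100 on 1424 #6
#   Workflow file for this run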

# Workflow identity, triggers and run de-duplication.
name: runs on 1424

# Runs on manual dispatch and on pull requests, except pull requests that
# only touch the ignored paths listed below.
on:
  workflow_dispatch:
  pull_request:
    paths-ignore:
      - "**.md"
      - ".github/ISSUE_TEMPLATE/**"
      - ".git*"
      - "CODE_OF_CONDUCT**"

# One active run per PR head ref (or per ref when there is no PR); a newer
# run in the same group cancels the one still in progress.
concurrency:
  group: 1424-${{ github.head_ref || github.ref }}
  cancel-in-progress: true
# Workflow-level environment shared by every job.
env:
  # Per-repository working root used by the jobs on the cluster.
  DEEPLINK_PATH: '/mnt/cache/share/deeplinkci/github/${{ github.repository }}'
  NFS_PATH: '/mnt/cache/share/parrotsci/github/cibuild/${{ github.repository }}'
  # Slurm partition passed to srun; falls back to 'pat_dev' when the
  # repository variable is unset or empty.
  # NOTE(review): the variable is named SH1988_* while the jobs target
  # SH1424 - confirm this is the intended partition source.
  CUDA_PARTATION: ${{ vars.SH1988_SLURM_PAR != '' && vars.SH1988_SLURM_PAR || 'pat_dev' }}
  # ssh host name used by every job to reach the cluster.
  CLUSTER_V100: SH1424
  REPO: ${{ github.event.repository.name }}
  # 'ON' when the ref contains 'main' or is a branch starting with 'v' or
  # 'dev'; 'OFF' otherwise.
  ALL_COVERAGE: ${{ (contains( github.ref, 'main') || startsWith(github.ref, 'refs/heads/v') || startsWith(github.ref, 'refs/heads/dev')) && 'ON' || 'OFF' }}
  # Argument handed to increment_coverage.sh; defaults to '0' when the
  # repository variable is unset or empty.
  REQUIRE_COVERAGE: ${{ vars.REQUIRE_COVERAGE != '' && vars.REQUIRE_COVERAGE || '0' }}
# Job graph:
#   Rsync -> Build-Cuda -> Tidy-Cuda -> { Test-Cuda, Test-One-Iter_Cuda,
#                                         Build-Cuda-Latest-Target -> Test-Cuda-Latest-Target }
#
# Quoting convention for every `ssh ${CLUSTER_V100} "..."` step: the remote
# script is ONE double-quoted string built on the runner. `${VAR}` expands on
# the runner before ssh is invoked; `\${VAR}` and `\$(...)` are expanded on the
# cluster. Any double quote that must reach the cluster has to be written as
# `\"`, otherwise it terminates the string on the runner.
jobs:
  # Checks the code out on the runner and mirrors it to the cluster as
  # `source` (code under test) and `source-main` (same code, but with DIOPI
  # replaced by a fresh clone of the DIOPI repository).
  Rsync:
    name: Rsync code
    runs-on: github-poc-ci
    steps:
      - name: clone repo
        # Event-context values are passed through env instead of being
        # interpolated into the script text, so a crafted ref or repository
        # name is handled as data and is never parsed as shell code.
        env:
          PR_HEAD_REPO: ${{ github.event.pull_request.head.repo.full_name }}
          PR_HEAD_SSH_URL: ${{ github.event.pull_request.head.repo.ssh_url }}
          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
          BASE_REF: ${{ github.base_ref }}
          COMMIT_SHA: ${{ github.sha }}
        run: |
          set -ex
          cd ${GITHUB_WORKSPACE} && rm -rf DIPU ${REPO}_DIOPI ${REPO} ${REPO}.dev
          if [ -n "${PR_HEAD_REPO}" ] && [[ ! "${PR_HEAD_REPO}" == "${GITHUB_REPOSITORY}" ]]; then
            # Pull request from a fork: check out the fork's head commit and
            # merge the base branch of the main repository into it.
            git clone "${PR_HEAD_SSH_URL}" ${REPO}
            cd ${GITHUB_WORKSPACE}/${REPO} && git checkout "${PR_HEAD_SHA}"
            git remote add mainrepo git@github.com:${GITHUB_REPOSITORY}.git
            git fetch mainrepo && git merge --no-edit "mainrepo/${BASE_REF}"
          else
            git clone https://github.com/DeepLink-org/${REPO}.git && cd ${REPO}
            if [ "${GITHUB_EVENT_NAME}" = "pull_request" ]; then
              echo "${BASE_REF} "
              git checkout "${PR_HEAD_SHA}" && git merge --no-edit "origin/${BASE_REF}"
            else
              echo $GITHUB_EVENT_NAME
              git checkout "${COMMIT_SHA}"
            fi
          fi
          cd ${GITHUB_WORKSPACE}/${REPO}/dipu && rm -rf third_party/kineto
          git clone --reference /home/autolink/rsync/sourcecode/DeepLink-org/kineto https://github.com/DeepLink-org/kineto.git third_party/kineto
          git submodule update --init && cd third_party/kineto && git submodule update --init
          cd ${GITHUB_WORKSPACE} && cp -R ${REPO} ${REPO}_DIOPI
          cd ${REPO}/dipu && bash /home/autolink/rsync/sourcecode/update_code.sh
          rsync -a /home/autolink/rsync/sourcecode/mmlab_pack . && cd mmlab_pack
          bash ../scripts/ci/ci_one_iter.sh clone
          # dipu_diopi depends on the latest DIOPI target branch, not on the
          # DIOPI submodule; this assumes DIOPI and DIPU use the same target
          # branch name (the pull request's base ref).
          cd ${GITHUB_WORKSPACE}/${REPO}_DIOPI/dipu/third_party && rm -rf DIOPI && git clone https://github.com/DeepLink-org/DIOPI.git
          if [ "${GITHUB_EVENT_NAME}" = "pull_request" ]; then
            cd ./DIOPI && git checkout "${BASE_REF}"
          fi
      - name: Rsync to Server
        # NOTE(review): the trailing `|| echo` makes this step succeed even
        # when ssh/rsync fails, so a sync failure only surfaces in the jobs
        # that depend on it - confirm that this best-effort is intended.
        run: |
          ssh ${CLUSTER_V100} "mkdir -p ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main" \
          && rsync -a --delete ${GITHUB_WORKSPACE}/${REPO}/ ${CLUSTER_V100}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source/ \
          && rsync -a --delete ${GITHUB_WORKSPACE}/${REPO}_DIOPI/ ${CLUSTER_V100}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main/ || echo "failure to connect to cuda"

  # Copies `source` into a directory named after this job and builds it
  # under srun; the copy is removed again when the build fails.
  Build-Cuda:
    name: Build-dipu-cuda
    needs: [Rsync]
    runs-on: github-poc-ci
    env:
      GPU_REQUESTS: 1
    steps:
      - name: Build dipu
        run: |
          ssh ${CLUSTER_V100} "
          set -e
          export USE_COVERAGE=ON
          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER} && rm -rf ${GITHUB_JOB} && cp -R source ${GITHUB_JOB} && cd ${GITHUB_JOB}/dipu
          source scripts/ci/nv/ci_nv_env.sh
          rsync -a /mnt/lustre/share_data/PAT/datasets/huggingface mmlab_pack/
          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:${GPU_REQUESTS} --time=30 bash scripts/ci/nv/ci_nv_script.sh build_dipu \
          || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf ${GITHUB_JOB} && exit 1 )
          "

  # Fails when the build modified dipu/SupportedDiopiFunctions.txt, then runs
  # ci_nv_tidy.sh from the Build-Cuda tree under srun.
  Tidy-Cuda:
    name: Run tidy (cuda)
    needs: [Build-Cuda]
    runs-on: github-poc-ci
    steps:
      - name: clang-tidy
        # The inner double quotes are escaped so that they reach the cluster.
        # Unescaped, they split the ssh argument and the remote command
        # degrades to `bash -c bash <path>`, which starts a shell reading
        # stdin with <path> as $0 and never executes the tidy script.
        run: |
          ssh ${CLUSTER_V100} "
          set -eo pipefail
          source ~/.bashrc
          cd $DEEPLINK_PATH/$GITHUB_RUN_NUMBER/Build-Cuda &&
          git diff -s --exit-code dipu/SupportedDiopiFunctions.txt ||
          { echo \"::error file=dipu/SupportedDiopiFunctions.txt,title=File Not Match::Please commit your compiled SupportedDiopiFunctions.txt\" && exit 1; }
          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} bash -c \"bash $DEEPLINK_PATH/$GITHUB_RUN_NUMBER/Build-Cuda/dipu/scripts/ci/nv/ci_nv_tidy.sh\"
          "

  # Runs tests/run_nv_tests.sh in the Build-Cuda tree and, depending on the
  # ref, the coverage scripts.
  Test-Cuda:
    name: Test-dipu-cuda
    needs: [Build-Cuda, Tidy-Cuda]
    runs-on: github-poc-ci
    env:
      GPU_REQUESTS: 1
    steps:
      - name: Run-test
        # The coverage script runs only when ALL_COVERAGE is ON, and its
        # failure is reported but does not fail the step.
        run: |
          ssh ${CLUSTER_V100} "
          set -ex
          export USE_COVERAGE=ON
          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda/dipu
          source scripts/ci/nv/ci_nv_env.sh
          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:${GPU_REQUESTS} --cpus-per-task=5 --mem=16G --time=70 sh tests/run_nv_tests.sh
          if [ \"${ALL_COVERAGE}\" = \"ON\" ]; then
          bash /mnt/cache/share/platform/dep/sonar/coverage_DIPU_nv.sh ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda ${GITHUB_RUN_NUMBER} || echo \"get coverage fail\"
          fi
          "
      - name: increment coverage check
        # Only for pull requests whose base ref contains 'main'. Uses the
        # scripts directory of the freshly cloned DIOPI in source-main.
        if: ${{ contains( github.event_name, 'pull_request' ) && contains( github.base_ref, 'main' ) }}
        run: |
          ssh ${CLUSTER_V100} "
          set -e
          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/
          rm -rf scripts
          ln -s ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main/dipu/third_party/DIOPI/scripts scripts
          source /mnt/cache/share/platform/env/pt2.0_diopi
          bash scripts/increment_coverage.sh ${REQUIRE_COVERAGE}
          "

  # Builds the mmlab_pack environment and runs the one-iteration checks
  # (default set, then the "llm" set) from the Build-Cuda tree.
  Test-One-Iter_Cuda:
    name: Test-one-iter-cuda
    needs: [Build-Cuda, Tidy-Cuda]
    runs-on: github-poc-ci
    env:
      GPU_REQUESTS: 1
    steps:
      - name: build some env
        # basic_path is expanded on the runner; the PYTHONPATH additions are
        # evaluated on the cluster (escaped `\$`).
        run: |
          ssh ${CLUSTER_V100} "
          set -ex
          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda/dipu
          source scripts/ci/nv/ci_nv_env.sh
          basic_path=${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/mmlab_pack
          export PYTHONPATH=\${basic_path}/mmengine:\$PYTHONPATH
          export PYTHONPATH=\${basic_path}/mmcv:\$PYTHONPATH
          export PYTHONPATH=\$(pwd):\$PYTHONPATH
          cd mmlab_pack
          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:${GPU_REQUESTS} --time=20 bash ../scripts/ci/ci_one_iter.sh build_cuda
          "
      - name: run-one-iter-for-tradition
        # one_iter_data is removed before the run and again afterwards,
        # whether the run succeeded or failed.
        run: |
          ssh ${CLUSTER_V100} "
          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda/dipu
          source scripts/ci/nv/ci_nv_env.sh
          basic_path=${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/mmlab_pack
          source scripts/ci/ci_one_iter.sh export_pythonpath_cuda \${basic_path}
          export PYTHONPATH=\$(pwd):\$PYTHONPATH
          cd mmlab_pack
          rm -rf one_iter_data
          python ../scripts/ci/ci_run_one_iter.py cuda ${GITHUB_RUN_NUMBER}_${GITHUB_JOB} \"gpu:${GPU_REQUESTS}\" \"${CUDA_PARTATION}\" && rm -rf one_iter_data || (rm -rf one_iter_data && exit 1)
          "
      - name: run-one-iter-for-llm
        # Same as the previous step, with the extra "llm" argument.
        run: |
          ssh ${CLUSTER_V100} "
          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda/dipu
          source scripts/ci/nv/ci_nv_env.sh
          basic_path=${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/mmlab_pack
          source scripts/ci/ci_one_iter.sh export_pythonpath_cuda \${basic_path}
          export PYTHONPATH=\$(pwd):\$PYTHONPATH
          cd mmlab_pack
          rm -rf one_iter_data
          python ../scripts/ci/ci_run_one_iter.py cuda ${GITHUB_RUN_NUMBER}_${GITHUB_JOB} \"gpu:${GPU_REQUESTS}\" \"${CUDA_PARTATION}\" \"llm\" && rm -rf one_iter_data || (rm -rf one_iter_data && exit 1)
          "
      - name: Perform cleanup one iter data
        # Always runs: cancels Slurm jobs carrying this job's name and
        # replaces the one_iter_data directory with a plain file.
        if: always()
        run: |
          ssh ${CLUSTER_V100} "
          set -ex
          echo \"${GITHUB_RUN_NUMBER}_${GITHUB_JOB}\"
          scancel -n \"${GITHUB_RUN_NUMBER}_${GITHUB_JOB}\"
          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/mmlab_pack
          rm -rf one_iter_data
          touch one_iter_data # placeholder file that keeps a new one_iter_data directory from being created
          "
      - name: Check for failure
        if: ${{ failure() }}
        run: exit 1

  # Same build as Build-Cuda, but from `source-main` (fresh DIOPI clone).
  Build-Cuda-Latest-Target:
    name: Build-dipu-cuda-latest-target
    needs: [Tidy-Cuda]
    runs-on: github-poc-ci
    env:
      GPU_REQUESTS: 1
    steps:
      - name: Build dipu diopi-latest-target
        run: |
          ssh ${CLUSTER_V100} "
          set -ex
          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER} && rm -rf ${GITHUB_JOB} && cp -R source-main ${GITHUB_JOB} && cd ${GITHUB_JOB}/dipu
          source scripts/ci/nv/ci_nv_env.sh
          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:${GPU_REQUESTS} --cpus-per-task=5 --mem=16G --time=30 bash scripts/ci/nv/ci_nv_script.sh build_dipu \
          || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf ${GITHUB_JOB} && exit 1 )
          "

  # Runs the tests in the Build-Cuda-Latest-Target tree and removes that tree
  # afterwards, on success and on failure alike.
  Test-Cuda-Latest-Target:
    name: Test-dipu-cuda-latest-target
    needs: [Build-Cuda-Latest-Target]
    runs-on: github-poc-ci
    env:
      GPU_REQUESTS: 1
    steps:
      - name: Run-test
        # The failure branch names the build directory explicitly:
        # ${GITHUB_JOB} is this job's id (Test-Cuda-Latest-Target), which is
        # not the directory the tests run in.
        run: |
          ssh ${CLUSTER_V100} "
          set -ex
          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda-Latest-Target/dipu
          source scripts/ci/nv/ci_nv_env.sh
          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:${GPU_REQUESTS} --cpus-per-task=5 --mem=16G --time=60 sh tests/run_nv_tests.sh && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target \
          || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target && exit 1 )
          "