Merge branch 'add_perf_benchmark_ci' of https://github.com/DeepLink-org/deeplink.framework.dev into add_perf_benchmark_ci
Wrench-Git committed Jul 31, 2024
2 parents bca3c3a + caba5e5 commit 8a474a3
Showing 9 changed files with 77 additions and 42 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/_runs-on-nv-step2.yml
@@ -44,7 +44,7 @@ jobs:
run: |
if [[ "${GETRUNNER}" == *sco* ]];then
set -e
srun --job-name=${GITHUB_JOB} bash -c "export USE_COVERAGE=ON && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu \
srun --job-name=need_two_gpus bash -c "export USE_COVERAGE=ON && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu \
&& source ${ENV_PATH}/dipu_env \
&& bash tests/run_nv_tests.sh"
if [ "${ALL_COVERAGE}" = "ON" ]; then
@@ -56,7 +56,7 @@ jobs:
export USE_COVERAGE=ON
cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda/dipu
source ${ENV_PATH}/dipu_env
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:1 --cpus-per-task=5 --mem=16G --time=70 sh tests/run_nv_tests.sh
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:2 --cpus-per-task=5 --mem=16G --time=70 sh tests/run_nv_tests.sh
if [ "${ALL_COVERAGE}" = "ON" ]; then
bash /mnt/cache/share/platform/dep/sonar/coverage_DIPU_nv.sh ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda ${GITHUB_RUN_NUMBER} || echo "get coverage fail"
fi
@@ -216,15 +216,15 @@ jobs:
run: |
if [[ "${GETRUNNER}" == *sco* ]];then
set -e
srun --job-name=${GITHUB_JOB} bash -c "cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Latest-Target/dipu \
srun --job-name=need_two_gpus bash -c "cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Latest-Target/dipu \
&& source ${ENV_PATH}/dipu_env \
&& bash tests/run_nv_tests.sh" && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target
else
ssh SH1424 """
set -ex
cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda-Latest-Target/dipu
source ${ENV_PATH}/dipu_env
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:1 --cpus-per-task=5 --mem=16G --time=60 sh tests/run_nv_tests.sh && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target \
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:2 --cpus-per-task=5 --mem=16G --time=60 sh tests/run_nv_tests.sh && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target \
|| ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target && exit 1 )
"""
fi
@@ -271,15 +271,15 @@ jobs:
run: |
if [[ "${GETRUNNER}" == *sco* ]];then
set -e
srun --job-name=${GITHUB_JOB} bash -c "cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Pt211/dipu \
srun --job-name=need_two_gpus bash -c "cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Pt211/dipu \
&& source ${ENV_PATH}/dipu_env 2.1.1 \
&& bash tests/run_nv_tests.sh" && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Pt211
else
ssh SH1424 """
set -ex
cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Pt211/dipu
source ${ENV_PATH}/dipu_env 2.1.1
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:1 --cpus-per-task=5 bash tests/run_nv_tests.sh \
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:2 --cpus-per-task=5 bash tests/run_nv_tests.sh \
|| ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Pt211 && exit 1 )
"""
fi
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
@@ -99,7 +99,7 @@ jobs:
TAG=${GITHUB_REF#refs/tags/} # Extract the tag name
echo $TAG
bash /mnt/cache/share/platform/dep/Deploy_DIPU_trigger.sh "$TAG"
Build-Camb:
name: Build-dipu-camb
needs: [Runs-On-Nv-Step1]
@@ -307,7 +307,7 @@ jobs:
needs: [Build-Camb-Latest-Target]
runs-on: github-poc-ci
env:
MLU_REQUESTS: 1
MLU_REQUESTS: 4
steps:
- name: Run-test
run: |
72 changes: 43 additions & 29 deletions dipu/tests/python/individual_scripts/test_rt_ddp.py
@@ -3,6 +3,7 @@
import random
from torch import nn
import os
import subprocess
import torch
import torch.distributed as dist
import torch.nn as nn
@@ -140,7 +141,8 @@ def demo_allreduce(rank, world_size, port):
expected_tensor = torch.tensor([True, False, True], dtype=torch.bool).to(
dev1
)
assert torch.allclose(te_result, expected_tensor)
print(f"te_result:{te_result}, expected_tensor: {expected_tensor}")
assert torch.allclose(te_result.cpu(), expected_tensor.cpu())

# byte
for op in [dist.reduce_op.SUM, dist.reduce_op.MAX, dist.reduce_op.MIN]:
@@ -217,13 +219,13 @@ def demo_bcast(rank, world_size, port):
src1 = torch.ones((2, 4)).to(rank)
dst = torch.empty((2, 4)).to(rank)
# print(dst)
for i in range(1, 3):
for i in range(world_size):
if rank == 0:
dist.broadcast(src1, 0)
else:
dist.broadcast(dst, 0)
assert torch.allclose(src1, dst)
print(dst)
if rank != 0:
assert torch.allclose(src1, dst), str(dst)
cleanup()


@@ -238,7 +240,7 @@ def demo_gather(rank, world_size, port):
gather_list = [torch.empty((2, 4)).to(rank) for _ in range(world_size)]
else:
gather_list = None
for i in range(1, 3):
for i in range(world_size):
dist.gather(src, gather_list, dst=root_rank)
if rank == root_rank:
for i in range(world_size):
@@ -319,10 +321,11 @@ def demo_reducescatter(rank, world_size, port):

dst = torch.zeros((2, 4)).to(rank)
# print(dst)
for i in range(1, 3):
for i in range(world_size):
dist.reduce_scatter(dst, srcs, op=dist.reduce_op.SUM)

assert torch.allclose(srcs[0], dst)
print(f"src:{srcs[0]}")
print(f"dst:{dst}")
assert torch.allclose(srcs[0].cpu() * world_size, dst.cpu())
print(dst)
cleanup()

@@ -334,9 +337,9 @@ def demo_reducescatter_base(rank, world_size, port):

src1 = torch.ones((world_size * 2, 4)).to(rank)
dst = torch.zeros((2, 4)).to(rank)
for i in range(1, 3):
for i in range(world_size):
dist.reduce_scatter_tensor(dst, src1, op=dist.reduce_op.SUM)
assert torch.allclose(torch.ones((2, 4)), dst.cpu())
assert torch.allclose(torch.ones((2, 4)) * world_size, dst.cpu())
print(dst)
cleanup()

@@ -443,26 +446,27 @@ def demo_alltoall(rank, world_size, port):


def demo_model_parallel(rank, world_size, port):
import torch_dipu

print(f"Running DDP with model parallel example on rank {rank}.")
backend = "nccl"
dev1 = rank

# debugat(rank)
setup(backend, rank, world_size)
setup(rank, world_size, port)

# setup mp_model and devices for this process
dev0 = (rank * 2) % world_size
dev1 = (rank * 2 + 1) % world_size
mp_model = ToyMpModel(dev0, dev1)
dev0 = rank
mp_model = ToyModel().to(dev0)
ddp_mp_model = DDP(mp_model)

loss_fn = nn.MSELoss()
optimizer = optim.SGD(ddp_mp_model.parameters(), lr=0.001)

optimizer.zero_grad()
# outputs will be on dev1
outputs = ddp_mp_model(torch.randn(20, 10))
labels = torch.randn(20, 5).to(dev1)
outputs = ddp_mp_model(torch.randn(20, 10).to(dev0))
labels = torch.randn(20, 5).to(dev0)
loss_fn(outputs, labels).backward()
optimizer.step()

@@ -500,12 +504,12 @@ def demo_allgather_gloo(rank, world_size, port):
cleanup()


def test_special_group_stuck(rank, world_size):
def test_special_group_stuck(rank, world_size, port):
import torch_dipu

print(f"test special group stuck on rank {rank} ")

setup(rank, world_size)
setup(rank, world_size, port)

# ranks check require len(ranks) <= world_size
if world_size >= 2:
@@ -519,9 +523,11 @@ def test_special_group_stuck(rank, world_size):
cleanup()


def test_new_group(rank, world_size):
def test_new_group(rank, world_size, port):
import torch_dipu

print(f"test group on rank {rank} ws: {world_size}")
setup(rank, world_size)
setup(rank, world_size, port)
for op in [
dist.reduce_op.SUM,
]:
@@ -577,11 +583,17 @@ def test_get_comm_name(rank, world_size, port):


if __name__ == "__main__":
n_gpus = torch.cuda.device_count()

port = random.randint(10000, 60000)

world_size = 1
# get device_count without "import torch_dipu"
sub_process = subprocess.run(
[
"python",
"-c",
"import torch;import torch_dipu;exit(torch.cuda.device_count())",
]
)
world_size = sub_process.returncode
print(f"world_size: {world_size}")
run_demo(demo_basic_ddp, world_size, port)
run_demo(demo_allreduce, world_size, port)
run_demo(demo_allgather, world_size, port)
@@ -599,12 +611,14 @@ def test_get_comm_name(rank, world_size, port):
run_demo(test_get_comm_name, world_size, port)

# need 2 card to run
# run_demo(demo_p2p, world_size, port)
# run_demo(demo_bcast, world_size, port)
if world_size >= 2:
run_demo(demo_p2p, world_size, port)
run_demo(demo_bcast, world_size, port)

# run_demo(demo_model_parallel, world_size)
run_demo(demo_model_parallel, world_size, port)

# run_demo(test_special_group_stuck, world_size)
run_demo(test_special_group_stuck, world_size, port)

# need 4 card to run
# run_demo(test_new_group, world_size)
if world_size >= 4:
run_demo(test_new_group, world_size, port)
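
Note on the __main__ changes above: the world size is now taken from a child process's exit code, presumably so that the parent process never has to run "import torch_dipu" before the worker processes are spawned. Below is a minimal sketch of the same pattern, assuming torch and the torch_dipu plugin are importable; the probe_device_count helper name is hypothetical and not part of the commit.

import subprocess
import sys


def probe_device_count() -> int:
    # The child process exits with the device count; exit codes are truncated
    # to 0-255, which is more than enough for a GPU/MLU count. An import
    # failure in the child also surfaces as a nonzero code, so treat the
    # result as best-effort.
    completed = subprocess.run(
        [
            sys.executable,
            "-c",
            "import torch; import torch_dipu; raise SystemExit(torch.cuda.device_count())",
        ]
    )
    return completed.returncode


if __name__ == "__main__":
    print(f"world_size: {probe_device_count()}")
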
17 changes: 14 additions & 3 deletions dipu/tests/python/unittests/test_copy.py
@@ -2,7 +2,12 @@
import itertools
import torch
import torch_dipu
from torch_dipu.testing._internal.common_utils import TestCase, run_tests, skipOn
from torch_dipu.testing._internal.common_utils import (
TestCase,
run_tests,
skipOn,
skipIfDevcieCountLessThan,
)


class TestCopy(TestCase):
@@ -79,9 +84,12 @@ def test_hollow_device_copy_(self):
dst1.copy_(src)
self.assertEqual(dst1.cpu(), src.cpu())

@skipIfDevcieCountLessThan(2)
def test_d2d_peer_copy_(self):
if torch.cuda.device_count() < 2:
return
assert (
False
), "At least two cards are required for copying between multiple cards"
dst = torch.rand((6400, 4000), device="cuda:0")
src = torch.rand((6400, 4000), device="cuda:1")
dst.copy_(src)
@@ -96,9 +104,12 @@ def test_d2d_peer_copy_(self):
self.assertEqual(dst.device.index, 1)
self.assertEqual(src.device.index, 0)

@skipIfDevcieCountLessThan(2)
def test_d2d_peer_copy_no_contiguous(self):
if torch.cuda.device_count() < 2:
return
assert (
False
), "At least two cards are required for copying between multiple cards"
src = torch.rand((6400, 9900), device="cuda:1")[::2, ::3]
dst = src.to("cuda:0")
self.assertEqual(dst.cpu(), src.cpu())
1 change: 1 addition & 0 deletions dipu/tests/run_ascend_tests.sh
@@ -7,6 +7,7 @@ function run_dipu_tests {
# TODO: Add PyTorch tests
# run_test tests/test_ops/archived/test_tensor_add.py
python tests/python/individual_scripts/test_rt_ddp.py
python tests/python/unittests/test_copy.py
}

if [ "$LOGFILE" != "" ]; then
2 changes: 1 addition & 1 deletion dipu/third_party/DIOPI
Submodule DIOPI updated 95 files
+47 −0 diopi_test/python/configs/diopi_configs.py
+14 −1 diopi_test/python/conformance/customized_test.py
+44 −10 diopi_test/python/conformance/diopi_functions.py
+2 −1 impl/ascend/aclnn/acl_scalar.hpp
+191 −43 impl/ascend/aclnn/adaptor.hpp
+2 −1 impl/ascend/ascend_tensor.cpp
+12 −0 impl/ascend/ascend_tensor.hpp
+40 −44 impl/ascend/convert_config.yaml
+107 −41 impl/ascend/device_configs.py
+23 −0 impl/ascend/functions/abs.cpp
+77 −55 impl/ascend/functions/activation.cpp
+13 −14 impl/ascend/functions/addcdiv.cpp
+12 −6 impl/ascend/functions/addcmul.cpp
+6 −24 impl/ascend/functions/addmm.cpp
+3 −47 impl/ascend/functions/arange.cpp
+13 −8 impl/ascend/functions/argmax.cpp
+23 −0 impl/ascend/functions/atan.cpp
+14 −54 impl/ascend/functions/baddbmm.cpp
+28 −126 impl/ascend/functions/batch_norm.cpp
+47 −129 impl/ascend/functions/binary.cpp
+19 −87 impl/ascend/functions/bitwise.cpp
+8 −10 impl/ascend/functions/bmm.cpp
+6 −55 impl/ascend/functions/cast.cpp
+8 −12 impl/ascend/functions/cat.cpp
+24 −0 impl/ascend/functions/ceil.cpp
+120 −141 impl/ascend/functions/clamp.cpp
+19 −0 impl/ascend/functions/col2im.cpp
+98 −103 impl/ascend/functions/conv2d.cpp
+4 −28 impl/ascend/functions/copy.cpp
+2 −13 impl/ascend/functions/cos.cpp
+9 −17 impl/ascend/functions/cumsum.cpp
+95 −59 impl/ascend/functions/dropout.cpp
+3 −12 impl/ascend/functions/embedding.cpp
+27 −0 impl/ascend/functions/equal.cpp
+11 −3 impl/ascend/functions/expand.cpp
+3 −15 impl/ascend/functions/fill.cpp
+3 −2 impl/ascend/functions/flip.cpp
+6 −3 impl/ascend/functions/floor.cpp
+55 −5 impl/ascend/functions/gather.cpp
+42 −15 impl/ascend/functions/group_norm.cpp
+28 −0 impl/ascend/functions/index_put.cpp
+7 −14 impl/ascend/functions/index_select.cpp
+47 −32 impl/ascend/functions/interpolate.cpp
+3 −2 impl/ascend/functions/isnan.cpp
+32 −24 impl/ascend/functions/layer_norm.cpp
+26 −0 impl/ascend/functions/lerp.cpp
+27 −0 impl/ascend/functions/linalg_vec_norm.cpp
+44 −86 impl/ascend/functions/linear.cpp
+3 −7 impl/ascend/functions/linspace.cpp
+68 −61 impl/ascend/functions/logic.cpp
+0 −18 impl/ascend/functions/loss.cpp
+29 −48 impl/ascend/functions/masked_fill.cpp
+112 −0 impl/ascend/functions/masked_select.cpp
+18 −0 impl/ascend/functions/matmul.cpp
+156 −0 impl/ascend/functions/max_pool2d.cpp
+18 −17 impl/ascend/functions/minmax.cpp
+11 −14 impl/ascend/functions/mm.cpp
+44 −0 impl/ascend/functions/mse_loss.cpp
+54 −0 impl/ascend/functions/mul.cpp
+3 −9 impl/ascend/functions/multinomial.cpp
+32 −29 impl/ascend/functions/nonzero.cpp
+8 −19 impl/ascend/functions/norm.cpp
+50 −59 impl/ascend/functions/normal.cpp
+32 −12 impl/ascend/functions/one_hot.cpp
+18 −0 impl/ascend/functions/ones.cpp
+7 −27 impl/ascend/functions/pool.cpp
+11 −9 impl/ascend/functions/pow.cpp
+95 −148 impl/ascend/functions/reduce.cpp
+29 −0 impl/ascend/functions/remainder.cpp
+9 −2 impl/ascend/functions/repeat.cpp
+38 −49 impl/ascend/functions/scatter.cpp
+6 −29 impl/ascend/functions/sgn.cpp
+4 −18 impl/ascend/functions/sin.cpp
+3 −47 impl/ascend/functions/sort.cpp
+9 −12 impl/ascend/functions/split.cpp
+11 −4 impl/ascend/functions/stack.cpp
+6 −17 impl/ascend/functions/threshold.cpp
+4 −14 impl/ascend/functions/topk.cpp
+21 −13 impl/ascend/functions/transpose.cpp
+1 −7 impl/ascend/functions/tril.cpp
+2 −13 impl/ascend/functions/triu.cpp
+42 −29 impl/ascend/functions/unary.cpp
+6 −24 impl/ascend/functions/uniform.cpp
+0 −38 impl/ascend/functions/upsample.cpp
+3 −20 impl/ascend/functions/where.cpp
+23 −0 impl/ascend/functions/zeros.cpp
+36 −0 impl/ascend/functions_ext/adamw.cpp
+45 −12 impl/ascend/functions_ext/rms_norm.cpp
+120 −0 impl/ascend/functions_ext/rotary_embedding.cpp
+19 −2 impl/ascend_npu/CMakeLists.txt
+191 −185 impl/ascend_npu/ascend_config.yaml
+24 −0 impl/ascend_npu/diopi_impl/erfinv.cpp
+76 −20 impl/ascend_npu/diopi_impl/functions_ext/flash_attention.cpp
+66 −52 impl/camb/functions_ext/flash_attention_varlen.cpp
+32 −16 proto/include/diopi/functions_ext.h
2 changes: 1 addition & 1 deletion dipu/torch_dipu/csrc_dipu/CMakeLists.txt
@@ -54,7 +54,7 @@ set(GENERATED_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/aten/ops/AutoGenedKernels.cpp"
set(GENERATED_KERNELS_VENDOR "${PROJECT_SOURCE_DIR}/third_party/DIOPI/impl/${UsedVendor}/convert_config.yaml")
set(GENERATED_KERNELS_SCRIPT "${AUTOGEN_DIOPI_WRAPPER_DIR}/autogen_diopi_wrapper.py")
set(GENERATED_KERNELS_CONFIG "${AUTOGEN_DIOPI_WRAPPER_DIR}/diopi_functions.yaml")
set(DEVICE_GUARD_FREE_VENDOR cuda) # vensors that do not need to device guard need to be added to this list.
set(DEVICE_GUARD_FREE_VENDOR "") # Vendors that do not require Device Guard should be included in this list.
if (${UsedVendor} IN_LIST DEVICE_GUARD_FREE_VENDOR)
set(GENERATE_DEVICE_GUARD False)
else()
@@ -138,6 +138,7 @@ class BSCachingAllocator : public CacheAllocator {
}

void empty_resource_pool() const {
std::lock_guard<mutex_t> lk(mutex_);
DIPU_DEBUG_ALLOCATOR(
8, "BSCachingAllocator::empty_resource_pool ,allocator:" << this);
while (!async_mem_pool()->empty()) {
@@ -180,6 +181,7 @@ class BSCachingAllocator : public CacheAllocator {
void release_all_memory() const override { release_all_memory_impl(); }

void flush_mem_pool() const {
std::lock_guard<mutex_t> lk(mutex_);
DIPU_DEBUG_ALLOCATOR(
8, "BSCachingAllocator::flush_mem_pool allocator:" << this);
while (async_mem_pool()->ready()) {
7 changes: 7 additions & 0 deletions dipu/torch_dipu/testing/_internal/common_utils.py
@@ -69,6 +69,13 @@ def skipOn(vendor: str, reason: str):
return unittest.skipIf(torch_dipu.dipu.vendor_type == vendor, reason)


def skipIfDevcieCountLessThan(number_of_devices_required):
return unittest.skipIf(
torch_dipu.dipu.device_count() < number_of_devices_required,
f"available device are less than {number_of_devices_required}",
)


@overload
def onlyOn(vendor: str): ...
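
For reference, a minimal usage sketch of the skipIfDevcieCountLessThan helper added above (the spelling matches the helper as committed). The TestPeerCopy class below is hypothetical and assumes torch, torch_dipu, and these test utilities are available; the point is that a multi-card test is reported as skipped, instead of returning silently, when fewer than two devices are visible.

import torch
import torch_dipu
from torch_dipu.testing._internal.common_utils import (
    TestCase,
    run_tests,
    skipIfDevcieCountLessThan,
)


class TestPeerCopy(TestCase):  # hypothetical example, not part of the commit
    @skipIfDevcieCountLessThan(2)
    def test_cross_device_copy(self):
        # Only runs when at least two devices are visible; otherwise unittest
        # marks it as skipped.
        src = torch.rand((16, 16), device="cuda:1")
        dst = src.to("cuda:0")
        self.assertEqual(dst.cpu(), src.cpu())


if __name__ == "__main__":
    run_tests()
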
