Merge branch 'add_perf_benchmark_ci' of https://github.com/DeepLink-org/deeplink.framework.dev into add_perf_benchmark_ci
Wrench-Git committed Jul 31, 2024
2 parents bca3c3a + caba5e5 commit 8a474a3
Showing 9 changed files with 77 additions and 42 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/_runs-on-nv-step2.yml
@@ -44,7 +44,7 @@ jobs:
run: |
if [[ "${GETRUNNER}" == *sco* ]];then
set -e
srun --job-name=${GITHUB_JOB} bash -c "export USE_COVERAGE=ON && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu \
srun --job-name=need_two_gpus bash -c "export USE_COVERAGE=ON && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu \
&& source ${ENV_PATH}/dipu_env \
&& bash tests/run_nv_tests.sh"
if [ "${ALL_COVERAGE}" = "ON" ]; then
@@ -56,7 +56,7 @@ jobs:
export USE_COVERAGE=ON
cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda/dipu
source ${ENV_PATH}/dipu_env
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:1 --cpus-per-task=5 --mem=16G --time=70 sh tests/run_nv_tests.sh
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:2 --cpus-per-task=5 --mem=16G --time=70 sh tests/run_nv_tests.sh
if [ "${ALL_COVERAGE}" = "ON" ]; then
bash /mnt/cache/share/platform/dep/sonar/coverage_DIPU_nv.sh ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda ${GITHUB_RUN_NUMBER} || echo "get coverage fail"
fi
@@ -216,15 +216,15 @@ jobs:
run: |
if [[ "${GETRUNNER}" == *sco* ]];then
set -e
srun --job-name=${GITHUB_JOB} bash -c "cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Latest-Target/dipu \
srun --job-name=need_two_gpus bash -c "cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Latest-Target/dipu \
&& source ${ENV_PATH}/dipu_env \
&& bash tests/run_nv_tests.sh" && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target
else
ssh SH1424 """
set -ex
cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda-Latest-Target/dipu
source ${ENV_PATH}/dipu_env
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:1 --cpus-per-task=5 --mem=16G --time=60 sh tests/run_nv_tests.sh && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target \
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:2 --cpus-per-task=5 --mem=16G --time=60 sh tests/run_nv_tests.sh && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target \
|| ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target && exit 1 )
"""
fi
@@ -271,15 +271,15 @@ jobs:
run: |
if [[ "${GETRUNNER}" == *sco* ]];then
set -e
srun --job-name=${GITHUB_JOB} bash -c "cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Pt211/dipu \
srun --job-name=need_two_gpus bash -c "cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Pt211/dipu \
&& source ${ENV_PATH}/dipu_env 2.1.1 \
&& bash tests/run_nv_tests.sh" && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Pt211
else
ssh SH1424 """
set -ex
cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Pt211/dipu
source ${ENV_PATH}/dipu_env 2.1.1
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:1 --cpus-per-task=5 bash tests/run_nv_tests.sh \
srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:2 --cpus-per-task=5 bash tests/run_nv_tests.sh \
|| ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Pt211 && exit 1 )
"""
fi
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
@@ -99,7 +99,7 @@ jobs:
TAG=${GITHUB_REF#refs/tags/} # Extract the tag name
echo $TAG
bash /mnt/cache/share/platform/dep/Deploy_DIPU_trigger.sh "$TAG"
Build-Camb:
name: Build-dipu-camb
needs: [Runs-On-Nv-Step1]
@@ -307,7 +307,7 @@ jobs:
needs: [Build-Camb-Latest-Target]
runs-on: github-poc-ci
env:
MLU_REQUESTS: 1
MLU_REQUESTS: 4
steps:
- name: Run-test
run: |
72 changes: 43 additions & 29 deletions dipu/tests/python/individual_scripts/test_rt_ddp.py
@@ -3,6 +3,7 @@
import random
from torch import nn
import os
import subprocess
import torch
import torch.distributed as dist
import torch.nn as nn
@@ -140,7 +141,8 @@ def demo_allreduce(rank, world_size, port):
expected_tensor = torch.tensor([True, False, True], dtype=torch.bool).to(
dev1
)
assert torch.allclose(te_result, expected_tensor)
print(f"te_result:{te_result}, expected_tensor: {expected_tensor}")
assert torch.allclose(te_result.cpu(), expected_tensor.cpu())

# byte
for op in [dist.reduce_op.SUM, dist.reduce_op.MAX, dist.reduce_op.MIN]:
@@ -217,13 +219,13 @@ def demo_bcast(rank, world_size, port):
src1 = torch.ones((2, 4)).to(rank)
dst = torch.empty((2, 4)).to(rank)
# print(dst)
for i in range(1, 3):
for i in range(world_size):
if rank == 0:
dist.broadcast(src1, 0)
else:
dist.broadcast(dst, 0)
assert torch.allclose(src1, dst)
print(dst)
if rank != 0:
assert torch.allclose(src1, dst), str(dst)
cleanup()


@@ -238,7 +240,7 @@ def demo_gather(rank, world_size, port):
gather_list = [torch.empty((2, 4)).to(rank) for _ in range(world_size)]
else:
gather_list = None
for i in range(1, 3):
for i in range(world_size):
dist.gather(src, gather_list, dst=root_rank)
if rank == root_rank:
for i in range(world_size):
@@ -319,10 +321,11 @@ def demo_reducescatter(rank, world_size, port):

dst = torch.zeros((2, 4)).to(rank)
# print(dst)
for i in range(1, 3):
for i in range(world_size):
dist.reduce_scatter(dst, srcs, op=dist.reduce_op.SUM)

assert torch.allclose(srcs[0], dst)
print(f"src:{srcs[0]}")
print(f"dst:{dst}")
assert torch.allclose(srcs[0].cpu() * world_size, dst.cpu())
print(dst)
cleanup()

@@ -334,9 +337,9 @@ def demo_reducescatter_base(rank, world_size, port):

src1 = torch.ones((world_size * 2, 4)).to(rank)
dst = torch.zeros((2, 4)).to(rank)
for i in range(1, 3):
for i in range(world_size):
dist.reduce_scatter_tensor(dst, src1, op=dist.reduce_op.SUM)
assert torch.allclose(torch.ones((2, 4)), dst.cpu())
assert torch.allclose(torch.ones((2, 4)) * world_size, dst.cpu())
print(dst)
cleanup()

@@ -443,26 +446,27 @@ def demo_alltoall(rank, world_size, port):


def demo_model_parallel(rank, world_size, port):
import torch_dipu

print(f"Running DDP with model parallel example on rank {rank}.")
backend = "nccl"
dev1 = rank

# debugat(rank)
setup(backend, rank, world_size)
setup(rank, world_size, port)

# setup mp_model and devices for this process
dev0 = (rank * 2) % world_size
dev1 = (rank * 2 + 1) % world_size
mp_model = ToyMpModel(dev0, dev1)
dev0 = rank
mp_model = ToyModel().to(dev0)
ddp_mp_model = DDP(mp_model)

loss_fn = nn.MSELoss()
optimizer = optim.SGD(ddp_mp_model.parameters(), lr=0.001)

optimizer.zero_grad()
# outputs will be on dev1
outputs = ddp_mp_model(torch.randn(20, 10))
labels = torch.randn(20, 5).to(dev1)
outputs = ddp_mp_model(torch.randn(20, 10).to(dev0))
labels = torch.randn(20, 5).to(dev0)
loss_fn(outputs, labels).backward()
optimizer.step()

@@ -500,12 +504,12 @@ def demo_allgather_gloo(rank, world_size, port):
cleanup()


def test_special_group_stuck(rank, world_size):
def test_special_group_stuck(rank, world_size, port):
import torch_dipu

print(f"test special group stuck on rank {rank} ")

setup(rank, world_size)
setup(rank, world_size, port)

# ranks check require len(ranks) <= world_size
if world_size >= 2:
@@ -519,9 +523,11 @@ def test_special_group_stuck(rank, world_size):
cleanup()


def test_new_group(rank, world_size):
def test_new_group(rank, world_size, port):
import torch_dipu

print(f"test group on rank {rank} ws: {world_size}")
setup(rank, world_size)
setup(rank, world_size, port)
for op in [
dist.reduce_op.SUM,
]:
@@ -577,11 +583,17 @@ def test_get_comm_name(rank, world_size, port):


if __name__ == "__main__":
n_gpus = torch.cuda.device_count()

port = random.randint(10000, 60000)

world_size = 1
# get device_count without "import torch_dipu"
sub_process = subprocess.run(
[
"python",
"-c",
"import torch;import torch_dipu;exit(torch.cuda.device_count())",
]
)
world_size = sub_process.returncode
print(f"world_size: {world_size}")
run_demo(demo_basic_ddp, world_size, port)
run_demo(demo_allreduce, world_size, port)
run_demo(demo_allgather, world_size, port)
@@ -599,12 +611,14 @@ def test_get_comm_name(rank, world_size, port):
run_demo(test_get_comm_name, world_size, port)

# need 2 card to run
# run_demo(demo_p2p, world_size, port)
# run_demo(demo_bcast, world_size, port)
if world_size >= 2:
run_demo(demo_p2p, world_size, port)
run_demo(demo_bcast, world_size, port)

# run_demo(demo_model_parallel, world_size)
run_demo(demo_model_parallel, world_size, port)

# run_demo(test_special_group_stuck, world_size)
run_demo(test_special_group_stuck, world_size, port)

# need 4 card to run
# run_demo(test_new_group, world_size)
if world_size >= 4:
run_demo(test_new_group, world_size, port)
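
Note on the __main__ changes above: the world size is now taken from a child process's exit code, presumably so that the parent process never has to run "import torch_dipu" before the worker processes are spawned. Below is a minimal sketch of the same pattern, assuming torch and the torch_dipu plugin are importable; the probe_device_count helper name is hypothetical and not part of the commit.

import subprocess
import sys


def probe_device_count() -> int:
    # The child process exits with the device count; exit codes are truncated
    # to 0-255, which is more than enough for a GPU/MLU count. An import
    # failure in the child also surfaces as a nonzero code, so treat the
    # result as best-effort.
    completed = subprocess.run(
        [
            sys.executable,
            "-c",
            "import torch; import torch_dipu; raise SystemExit(torch.cuda.device_count())",
        ]
    )
    return completed.returncode


if __name__ == "__main__":
    print(f"world_size: {probe_device_count()}")
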
17 changes: 14 additions & 3 deletions dipu/tests/python/unittests/test_copy.py
@@ -2,7 +2,12 @@
import itertools
import torch
import torch_dipu
from torch_dipu.testing._internal.common_utils import TestCase, run_tests, skipOn
from torch_dipu.testing._internal.common_utils import (
TestCase,
run_tests,
skipOn,
skipIfDevcieCountLessThan,
)


class TestCopy(TestCase):
@@ -79,9 +84,12 @@ def test_hollow_device_copy_(self):
dst1.copy_(src)
self.assertEqual(dst1.cpu(), src.cpu())

@skipIfDevcieCountLessThan(2)
def test_d2d_peer_copy_(self):
if torch.cuda.device_count() < 2:
return
assert (
False
), "At least two cards are required for copying between multiple cards"
dst = torch.rand((6400, 4000), device="cuda:0")
src = torch.rand((6400, 4000), device="cuda:1")
dst.copy_(src)
@@ -96,9 +104,12 @@ def test_d2d_peer_copy_(self):
self.assertEqual(dst.device.index, 1)
self.assertEqual(src.device.index, 0)

@skipIfDevcieCountLessThan(2)
def test_d2d_peer_copy_no_contiguous(self):
if torch.cuda.device_count() < 2:
return
assert (
False
), "At least two cards are required for copying between multiple cards"
src = torch.rand((6400, 9900), device="cuda:1")[::2, ::3]
dst = src.to("cuda:0")
self.assertEqual(dst.cpu(), src.cpu())
1 change: 1 addition & 0 deletions dipu/tests/run_ascend_tests.sh
@@ -7,6 +7,7 @@ function run_dipu_tests {
# TODO: Add PyTorch tests
# run_test tests/test_ops/archived/test_tensor_add.py
python tests/python/individual_scripts/test_rt_ddp.py
python tests/python/unittests/test_copy.py
}

if [ "$LOGFILE" != "" ]; then
2 changes: 1 addition & 1 deletion dipu/third_party/DIOPI
Submodule DIOPI updated 95 files
+47 −0 diopi_test/python/configs/diopi_configs.py
+14 −1 diopi_test/python/conformance/customized_test.py
+44 −10 diopi_test/python/conformance/diopi_functions.py
+2 −1 impl/ascend/aclnn/acl_scalar.hpp
+191 −43 impl/ascend/aclnn/adaptor.hpp
+2 −1 impl/ascend/ascend_tensor.cpp
+12 −0 impl/ascend/ascend_tensor.hpp
+40 −44 impl/ascend/convert_config.yaml
+107 −41 impl/ascend/device_configs.py
+23 −0 impl/ascend/functions/abs.cpp
+77 −55 impl/ascend/functions/activation.cpp
+13 −14 impl/ascend/functions/addcdiv.cpp
+12 −6 impl/ascend/functions/addcmul.cpp
+6 −24 impl/ascend/functions/addmm.cpp
+3 −47 impl/ascend/functions/arange.cpp
+13 −8 impl/ascend/functions/argmax.cpp
+23 −0 impl/ascend/functions/atan.cpp
+14 −54 impl/ascend/functions/baddbmm.cpp
+28 −126 impl/ascend/functions/batch_norm.cpp
+47 −129 impl/ascend/functions/binary.cpp
+19 −87 impl/ascend/functions/bitwise.cpp
+8 −10 impl/ascend/functions/bmm.cpp
+6 −55 impl/ascend/functions/cast.cpp
+8 −12 impl/ascend/functions/cat.cpp
+24 −0 impl/ascend/functions/ceil.cpp
+120 −141 impl/ascend/functions/clamp.cpp
+19 −0 impl/ascend/functions/col2im.cpp
+98 −103 impl/ascend/functions/conv2d.cpp
+4 −28 impl/ascend/functions/copy.cpp
+2 −13 impl/ascend/functions/cos.cpp
+9 −17 impl/ascend/functions/cumsum.cpp
+95 −59 impl/ascend/functions/dropout.cpp
+3 −12 impl/ascend/functions/embedding.cpp
+27 −0 impl/ascend/functions/equal.cpp
+11 −3 impl/ascend/functions/expand.cpp
+3 −15 impl/ascend/functions/fill.cpp
+3 −2 impl/ascend/functions/flip.cpp
+6 −3 impl/ascend/functions/floor.cpp
+55 −5 impl/ascend/functions/gather.cpp
+42 −15 impl/ascend/functions/group_norm.cpp
+28 −0 impl/ascend/functions/index_put.cpp
+7 −14 impl/ascend/functions/index_select.cpp
+47 −32 impl/ascend/functions/interpolate.cpp
+3 −2 impl/ascend/functions/isnan.cpp
+32 −24 impl/ascend/functions/layer_norm.cpp
+26 −0 impl/ascend/functions/lerp.cpp
+27 −0 impl/ascend/functions/linalg_vec_norm.cpp
+44 −86 impl/ascend/functions/linear.cpp
+3 −7 impl/ascend/functions/linspace.cpp
+68 −61 impl/ascend/functions/logic.cpp
+0 −18 impl/ascend/functions/loss.cpp
+29 −48 impl/ascend/functions/masked_fill.cpp
+112 −0 impl/ascend/functions/masked_select.cpp
+18 −0 impl/ascend/functions/matmul.cpp
+156 −0 impl/ascend/functions/max_pool2d.cpp
+18 −17 impl/ascend/functions/minmax.cpp
+11 −14 impl/ascend/functions/mm.cpp
+44 −0 impl/ascend/functions/mse_loss.cpp
+54 −0 impl/ascend/functions/mul.cpp
+3 −9 impl/ascend/functions/multinomial.cpp
+32 −29 impl/ascend/functions/nonzero.cpp
+8 −19 impl/ascend/functions/norm.cpp
+50 −59 impl/ascend/functions/normal.cpp
+32 −12 impl/ascend/functions/one_hot.cpp
+18 −0 impl/ascend/functions/ones.cpp
+7 −27 impl/ascend/functions/pool.cpp
+11 −9 impl/ascend/functions/pow.cpp
+95 −148 impl/ascend/functions/reduce.cpp
+29 −0 impl/ascend/functions/remainder.cpp
+9 −2 impl/ascend/functions/repeat.cpp
+38 −49 impl/ascend/functions/scatter.cpp
+6 −29 impl/ascend/functions/sgn.cpp
+4 −18 impl/ascend/functions/sin.cpp
+3 −47 impl/ascend/functions/sort.cpp
+9 −12 impl/ascend/functions/split.cpp
+11 −4 impl/ascend/functions/stack.cpp
+6 −17 impl/ascend/functions/threshold.cpp
+4 −14 impl/ascend/functions/topk.cpp
+21 −13 impl/ascend/functions/transpose.cpp
+1 −7 impl/ascend/functions/tril.cpp
+2 −13 impl/ascend/functions/triu.cpp
+42 −29 impl/ascend/functions/unary.cpp
+6 −24 impl/ascend/functions/uniform.cpp
+0 −38 impl/ascend/functions/upsample.cpp
+3 −20 impl/ascend/functions/where.cpp
+23 −0 impl/ascend/functions/zeros.cpp
+36 −0 impl/ascend/functions_ext/adamw.cpp
+45 −12 impl/ascend/functions_ext/rms_norm.cpp
+120 −0 impl/ascend/functions_ext/rotary_embedding.cpp
+19 −2 impl/ascend_npu/CMakeLists.txt
+191 −185 impl/ascend_npu/ascend_config.yaml
+24 −0 impl/ascend_npu/diopi_impl/erfinv.cpp
+76 −20 impl/ascend_npu/diopi_impl/functions_ext/flash_attention.cpp
+66 −52 impl/camb/functions_ext/flash_attention_varlen.cpp
+32 −16 proto/include/diopi/functions_ext.h
2 changes: 1 addition & 1 deletion dipu/torch_dipu/csrc_dipu/CMakeLists.txt
@@ -54,7 +54,7 @@ set(GENERATED_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/aten/ops/AutoGenedKernels.cpp"
set(GENERATED_KERNELS_VENDOR "${PROJECT_SOURCE_DIR}/third_party/DIOPI/impl/${UsedVendor}/convert_config.yaml")
set(GENERATED_KERNELS_SCRIPT "${AUTOGEN_DIOPI_WRAPPER_DIR}/autogen_diopi_wrapper.py")
set(GENERATED_KERNELS_CONFIG "${AUTOGEN_DIOPI_WRAPPER_DIR}/diopi_functions.yaml")
set(DEVICE_GUARD_FREE_VENDOR cuda) # vensors that do not need to device guard need to be added to this list.
set(DEVICE_GUARD_FREE_VENDOR "") # Vendors that do not require Device Guard should be included in this list.
if (${UsedVendor} IN_LIST DEVICE_GUARD_FREE_VENDOR)
set(GENERATE_DEVICE_GUARD False)
else()
@@ -138,6 +138,7 @@ class BSCachingAllocator : public CacheAllocator {
}

void empty_resource_pool() const {
std::lock_guard<mutex_t> lk(mutex_);
DIPU_DEBUG_ALLOCATOR(
8, "BSCachingAllocator::empty_resource_pool ,allocator:" << this);
while (!async_mem_pool()->empty()) {
@@ -180,6 +181,7 @@ class BSCachingAllocator : public CacheAllocator {
void release_all_memory() const override { release_all_memory_impl(); }

void flush_mem_pool() const {
std::lock_guard<mutex_t> lk(mutex_);
DIPU_DEBUG_ALLOCATOR(
8, "BSCachingAllocator::flush_mem_pool allocator:" << this);
while (async_mem_pool()->ready()) {
7 changes: 7 additions & 0 deletions dipu/torch_dipu/testing/_internal/common_utils.py
@@ -69,6 +69,13 @@ def skipOn(vendor: str, reason: str):
return unittest.skipIf(torch_dipu.dipu.vendor_type == vendor, reason)


def skipIfDevcieCountLessThan(number_of_devices_required):
return unittest.skipIf(
torch_dipu.dipu.device_count() < number_of_devices_required,
f"available device are less than {number_of_devices_required}",
)


@overload
def onlyOn(vendor: str): ...
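
For reference, a minimal usage sketch of the skipIfDevcieCountLessThan helper added above (the spelling matches the helper as committed). The TestPeerCopy class below is hypothetical and assumes torch, torch_dipu, and these test utilities are available; the point is that a multi-card test is reported as skipped, instead of returning silently, when fewer than two devices are visible.

import torch
import torch_dipu
from torch_dipu.testing._internal.common_utils import (
    TestCase,
    run_tests,
    skipIfDevcieCountLessThan,
)


class TestPeerCopy(TestCase):  # hypothetical example, not part of the commit
    @skipIfDevcieCountLessThan(2)
    def test_cross_device_copy(self):
        # Only runs when at least two devices are visible; otherwise unittest
        # marks it as skipped.
        src = torch.rand((16, 16), device="cuda:1")
        dst = src.to("cuda:0")
        self.assertEqual(dst.cpu(), src.cpu())


if __name__ == "__main__":
    run_tests()
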
