From 8f9e211dd672ad47def6db430c7d8af76f881239 Mon Sep 17 00:00:00 2001 From: Aaron Wei Date: Tue, 28 Nov 2023 17:09:56 +0800 Subject: [PATCH 1/3] [FIX] fix virtual memory of SUPA --- .../csrc_dipu/vendor/supa/deviceimpl.cpp | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp index c04b74e79..f2f298386 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp @@ -184,6 +184,8 @@ DIPU_API void freeHost(void* p) { free(p); } extern "C" { void* br_device_malloc(uint64_t bytes); void br_device_free(void* ptr); +// get physical address from ptr(virtual) +void* get_phy_ptr(const void* ptr); } DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion) { @@ -206,47 +208,60 @@ DIPU_API bool isPinnedPtr(const void* p) { return false; } // (asynchronous) set val DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) { - SUPA_CALL(suMemsetAsync(ptr, val, size, stream)); + auto phy_gpu_addr = get_phy_ptr(ptr); + SUPA_CALL(suMemsetAsync(phy_gpu_addr, val, size, stream)); } // (synchronous) copy from device to a device DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { // SUPA uses Unified Virtual Address - SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToDevice)); + auto phy_src_gpu_addr = get_phy_ptr(src); + auto phy_dst_gpu_addr = get_phy_ptr(dst); + SUPA_CALL(suMemcpy(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes, + suMemcpyDeviceToDevice)); } // (synchronous) copy from host to a device DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src) { - SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyHostToDevice)); + auto phy_dst_gpu_addr = get_phy_ptr(dst); + SUPA_CALL(suMemcpy(phy_dst_gpu_addr, src, nbytes, suMemcpyHostToDevice)); } // (synchronous) copy from a device to host DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src) { - SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToHost)); + auto phy_src_gpu_addr = get_phy_ptr(src); + SUPA_CALL(suMemcpy(dst, phy_src_gpu_addr, nbytes, suMemcpyDeviceToHost)); } // (asynchronous) copy from device to a device DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { - SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToDevice)); + auto phy_src_gpu_addr = get_phy_ptr(src); + auto phy_dst_gpu_addr = get_phy_ptr(dst); + SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes, stream, + suMemcpyDeviceToDevice)); } // (asynchronous) copy from host to a device DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src) { - SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyHostToDevice)); + auto phy_dst_gpu_addr = get_phy_ptr(dst); + SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, src, nbytes, stream, + suMemcpyHostToDevice)); } // (asynchronous) copy from a device to host DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src) { - SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToHost)); + auto phy_src_gpu_addr = get_phy_ptr(src); + SUPA_CALL(suMemcpyAsync(dst, phy_src_gpu_addr, nbytes, stream, + suMemcpyDeviceToHost)); } } // end namespace devapis } // end namespace dipu From b7feb8e2c2dd607e3cf96221d17a0354fd42e938 Mon Sep 17 00:00:00 2001 From: Aaron Wei Date: Fri, 1 Dec 2023 16:31:37 +0800 Subject: [PATCH 2/3] [FIX] fix incorrect copy --- .../csrc_dipu/vendor/supa/copyinplace.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp index 0b84a9e8a..0ce9b44df 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp @@ -24,6 +24,19 @@ class SUPACopyInplace : public DIPUCopyInpOnDIOPI { CopyParamsInfo& info) override { dipu_wrap_diopi_copy_inp(dst, src, non_blocking); } + void run(at::Tensor& dst, const at::Tensor& src, bool non_blocking) override { + auto curStream = dipu::getCurrentDIPUStream(); + ::diopiContext context(curStream.rawstream()); + auto ctx = &context; + auto diopi_src = dipu::diopi_helper::toDiopiTensorHandle(src); + auto diopi_dst = dipu::diopi_helper::toDiopiTensorHandle(dst); + TORCH_CHECK(diopiError_t::diopiSuccess == + diopiCopyInp(ctx, diopi_src, diopi_dst)); + // syncAfterCopy + if (!non_blocking) { + dipu::devapis::syncStream(curStream.rawstream()); + } + } }; static SUPACopyInplace copy_inplace; From 709524903f772b0d6bfe393599923512f3f08220 Mon Sep 17 00:00:00 2001 From: Aaron Wei Date: Mon, 4 Dec 2023 19:28:25 +0800 Subject: [PATCH 3/3] [FIX] remove useless copy and add missing 'supa'in cmakelists.txt --- dipu/CMakeLists.txt | 6 +++--- dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp | 6 ------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/dipu/CMakeLists.txt b/dipu/CMakeLists.txt index d94770c28..24b368a9d 100644 --- a/dipu/CMakeLists.txt +++ b/dipu/CMakeLists.txt @@ -44,7 +44,7 @@ elseif (${DEVICE} IN_LIST DEVICE_TOPSRIDER) elseif (${DEVICE} IN_LIST DEVICE_SUPA) set(USE_SUPA ON) set(UsedVendor supa) - set(DIOPI_IMPL_OPT "") + set(DIOPI_IMPL_OPT "supa") #SUPA DEVICE DOES NOT NEED TO BUILD DIOPI, so set the target to "" to control the workflow. elseif (${DEVICE} IN_LIST DEVICE_DROPLET) set(USE_DROPLET ON) @@ -81,14 +81,14 @@ if(NOT DEFINED DIPU_ABI_V) OUTPUT_VARIABLE DIPU_ABI_V) endif() -if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI) +if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI) execute_process( COMMAND sh -x -c "python -c 'import torch;print(1 if torch.compiled_with_cxx11_abi() else 0)'" OUTPUT_VARIABLE DIPU_COMPILED_WITH_CXX11_ABI) endif() - + if(DIPU_COMPILED_WITH_CXX11_ABI GREATER 0) set(DIPU_COMPILED_WITH_CXX11_ABI 1) else() diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp index 0ce9b44df..9149e8e98 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp @@ -18,12 +18,6 @@ class SUPACopyInplace : public DIPUCopyInpOnDIOPI { SUPACopyInplace() = default; ~SUPACopyInplace() = default; - // assume it can handle between device. - void copyNodirectBetweenDevices(at::Tensor& dst, const at::Tensor& src, - bool non_blocking, - CopyParamsInfo& info) override { - dipu_wrap_diopi_copy_inp(dst, src, non_blocking); - } void run(at::Tensor& dst, const at::Tensor& src, bool non_blocking) override { auto curStream = dipu::getCurrentDIPUStream(); ::diopiContext context(curStream.rawstream());