diff --git a/dipu/CMakeLists.txt b/dipu/CMakeLists.txt
index c676a3142..0f4e09073 100644
--- a/dipu/CMakeLists.txt
+++ b/dipu/CMakeLists.txt
@@ -45,7 +45,7 @@ elseif (${DEVICE} IN_LIST DEVICE_TOPSRIDER)
 elseif (${DEVICE} IN_LIST DEVICE_SUPA)
     set(USE_SUPA ON)
     set(UsedVendor supa)
-    set(DIOPI_IMPL_OPT "")
+    set(DIOPI_IMPL_OPT "supa")
     #SUPA DEVICE DOES NOT NEED TO BUILD DIOPI, so set the target to "" to control the workflow.
 elseif (${DEVICE} IN_LIST DEVICE_DROPLET)
     set(USE_DROPLET ON)
@@ -86,14 +86,14 @@ if(NOT DEFINED DIPU_ABI_V)
     OUTPUT_VARIABLE DIPU_ABI_V)
 endif()
 
-if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI) 
+if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI)
   execute_process(
     COMMAND
       sh -x -c
      "python -c 'import torch;print(1 if torch.compiled_with_cxx11_abi() else 0)'"
    OUTPUT_VARIABLE DIPU_COMPILED_WITH_CXX11_ABI)
 endif()
- 
+
 if(DIPU_COMPILED_WITH_CXX11_ABI GREATER 0)
   set(DIPU_COMPILED_WITH_CXX11_ABI 1)
 else()
diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
index 0b84a9e8a..9149e8e98 100644
--- a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
+++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
@@ -18,11 +18,18 @@ class SUPACopyInplace : public DIPUCopyInpOnDIOPI {
   SUPACopyInplace() = default;
   ~SUPACopyInplace() = default;
 
-  // assume it can handle between device.
-  void copyNodirectBetweenDevices(at::Tensor& dst, const at::Tensor& src,
-                                  bool non_blocking,
-                                  CopyParamsInfo& info) override {
-    dipu_wrap_diopi_copy_inp(dst, src, non_blocking);
+  void run(at::Tensor& dst, const at::Tensor& src, bool non_blocking) override {
+    auto curStream = dipu::getCurrentDIPUStream();
+    ::diopiContext context(curStream.rawstream());
+    auto ctx = &context;
+    auto diopi_src = dipu::diopi_helper::toDiopiTensorHandle(src);
+    auto diopi_dst = dipu::diopi_helper::toDiopiTensorHandle(dst);
+    TORCH_CHECK(diopiError_t::diopiSuccess ==
+                diopiCopyInp(ctx, diopi_src, diopi_dst));
+    // syncAfterCopy
+    if (!non_blocking) {
+      dipu::devapis::syncStream(curStream.rawstream());
+    }
   }
 };
 
diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
index c04b74e79..f2f298386 100644
--- a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
+++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
@@ -184,6 +184,8 @@ DIPU_API void freeHost(void* p) { free(p); }
 extern "C" {
 void* br_device_malloc(uint64_t bytes);
 void br_device_free(void* ptr);
+// get physical address from ptr(virtual)
+void* get_phy_ptr(const void* ptr);
 }
 
 DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion) {
@@ -206,47 +208,60 @@ DIPU_API bool isPinnedPtr(const void* p) { return false; }
 
 // (asynchronous) set val
 DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val,
                           size_t size) {
-  SUPA_CALL(suMemsetAsync(ptr, val, size, stream));
+  auto phy_gpu_addr = get_phy_ptr(ptr);
+  SUPA_CALL(suMemsetAsync(phy_gpu_addr, val, size, stream));
 }
 
 // (synchronous) copy from device to a device
 DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst,
                          deviceId_t srcDevId, const void* src) {
   // SUPA uses Unified Virtual Address
-  SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToDevice));
+  auto phy_src_gpu_addr = get_phy_ptr(src);
+  auto phy_dst_gpu_addr = get_phy_ptr(dst);
+  SUPA_CALL(suMemcpy(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes,
+                     suMemcpyDeviceToDevice));
 }
 
 // (synchronous) copy from host to a device
 DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst,
                          /*Host srcDev,*/ const void* src) {
-  SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyHostToDevice));
+  auto phy_dst_gpu_addr = get_phy_ptr(dst);
+  SUPA_CALL(suMemcpy(phy_dst_gpu_addr, src, nbytes, suMemcpyHostToDevice));
 }
 
 // (synchronous) copy from a device to host
 DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst,
                          /*deviceId_t srcDevId,*/ const void* src) {
-  SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToHost));
+  auto phy_src_gpu_addr = get_phy_ptr(src);
+  SUPA_CALL(suMemcpy(dst, phy_src_gpu_addr, nbytes, suMemcpyDeviceToHost));
 }
 
 // (asynchronous) copy from device to a device
 DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes,
                               deviceId_t dstDevId, void* dst,
                               deviceId_t srcDevId, const void* src) {
-  SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToDevice));
+  auto phy_src_gpu_addr = get_phy_ptr(src);
+  auto phy_dst_gpu_addr = get_phy_ptr(dst);
+  SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes, stream,
+                          suMemcpyDeviceToDevice));
 }
 
 // (asynchronous) copy from host to a device
 DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes,
                               /*deviceId_t dstDevId,*/ void* dst,
                               /*Host srcDev,*/ const void* src) {
-  SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyHostToDevice));
+  auto phy_dst_gpu_addr = get_phy_ptr(dst);
+  SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, src, nbytes, stream,
+                          suMemcpyHostToDevice));
 }
 
 // (asynchronous) copy from a device to host
 DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes,
                               /*Host dstDev,*/ void* dst,
                               /*deviceId_t srcDevId,*/ const void* src) {
-  SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToHost));
+  auto phy_src_gpu_addr = get_phy_ptr(src);
+  SUPA_CALL(suMemcpyAsync(dst, phy_src_gpu_addr, nbytes, stream,
+                          suMemcpyDeviceToHost));
 }
 
 }  // end namespace devapis
 }  // end namespace dipu
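
Reviewer note: the deviceimpl.cpp changes all apply one rule -- device-side pointers are translated from SUPA's virtual address space to physical addresses via the newly declared get_phy_ptr() before being handed to the suMemcpy*/suMemset* runtime calls, while host pointers (e.g. the dst of a D2H copy) pass through unchanged. A minimal sketch of that rule, assuming only the declarations visible in this diff; the helper name toPhyDevicePtr is illustrative and not part of the patch:

```cpp
// Illustrative sketch only, not part of the patch.
extern "C" void* get_phy_ptr(const void* ptr);  // declared in deviceimpl.cpp above

namespace {

// Hypothetical helper; the patch inlines this translation at each call site.
inline void* toPhyDevicePtr(const void* device_ptr) {
  return get_phy_ptr(device_ptr);
}

}  // namespace

// e.g. the synchronous D2H copy in the diff is equivalent to:
//   SUPA_CALL(suMemcpy(dst /* host pointer, unchanged */,
//                      toPhyDevicePtr(src) /* device pointer, translated */,
//                      nbytes, suMemcpyDeviceToHost));
```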