Skip to content

Commit

Permalink
[FIX] fix virtual memory error of using SUPA (DeepLink-org#468)
Browse files Browse the repository at this point in the history
* [FIX] fix virtual memory of SUPA

* [FIX] fix incorrect copy

* [FIX] remove useless copy and add missing 'supa' in CMakeLists.txt
  • Loading branch information
Aaron20000101 authored and brianlcy123 committed Dec 21, 2023
1 parent 58dfc22 commit 49f6556
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 15 deletions.
6 changes: 3 additions & 3 deletions dipu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ elseif (${DEVICE} IN_LIST DEVICE_TOPSRIDER)
elseif (${DEVICE} IN_LIST DEVICE_SUPA)
set(USE_SUPA ON)
set(UsedVendor supa)
set(DIOPI_IMPL_OPT "")
set(DIOPI_IMPL_OPT "supa")
#SUPA DEVICE DOES NOT NEED TO BUILD DIOPI, so set the target to "" to control the workflow.
elseif (${DEVICE} IN_LIST DEVICE_DROPLET)
set(USE_DROPLET ON)
Expand Down Expand Up @@ -86,14 +86,14 @@ if(NOT DEFINED DIPU_ABI_V)
OUTPUT_VARIABLE DIPU_ABI_V)
endif()

# Unless the caller already defined DIPU_COMPILED_WITH_CXX11_ABI, ask the
# Python torch installation whether it was compiled with the C++11 ABI.
# The one-liner prints 1 or 0 and that output is captured into the variable.
# Exactly one opening if() is kept so that it pairs with the single endif().
if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI)
  execute_process(
    COMMAND
      sh -x -c
      "python -c 'import torch;print(1 if torch.compiled_with_cxx11_abi() else 0)'"
    OUTPUT_VARIABLE DIPU_COMPILED_WITH_CXX11_ABI)
endif()

if(DIPU_COMPILED_WITH_CXX11_ABI GREATER 0)
set(DIPU_COMPILED_WITH_CXX11_ABI 1)
else()
Expand Down
17 changes: 12 additions & 5 deletions dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,18 @@ class SUPACopyInplace : public DIPUCopyInpOnDIOPI {
SUPACopyInplace() = default;
~SUPACopyInplace() = default;

// assume it can handle between device.
void copyNodirectBetweenDevices(at::Tensor& dst, const at::Tensor& src,
bool non_blocking,
CopyParamsInfo& info) override {
dipu_wrap_diopi_copy_inp(dst, src, non_blocking);
// Copies `src` into `dst` by calling diopiCopyInp on the current DIPU stream.
// When `non_blocking` is false, the stream is synchronized before returning.
void run(at::Tensor& dst, const at::Tensor& src, bool non_blocking) override {
  auto stream = dipu::getCurrentDIPUStream();
  // DIOPI context constructed from the raw handle of the current stream.
  ::diopiContext diopiCtx(stream.rawstream());
  auto ctx = &diopiCtx;
  auto diopi_src = dipu::diopi_helper::toDiopiTensorHandle(src);
  auto diopi_dst = dipu::diopi_helper::toDiopiTensorHandle(dst);
  // The checked expression is spelled exactly as before so that the text
  // reported on failure does not change.
  TORCH_CHECK(diopiError_t::diopiSuccess ==
              diopiCopyInp(ctx, diopi_src, diopi_dst));
  if (non_blocking) {
    return;  // caller asked not to wait for the copy
  }
  // syncAfterCopy
  dipu::devapis::syncStream(stream.rawstream());
}
};

Expand Down
29 changes: 22 additions & 7 deletions dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ DIPU_API void freeHost(void* p) { free(p); }
// Functions with C linkage that are declared here and defined outside this
// file (presumably by the SUPA vendor runtime -- TODO confirm).
extern "C" {
// NOTE(review): by its name this returns a device allocation of `bytes`
// bytes; the failure behaviour is not visible here -- confirm.
void* br_device_malloc(uint64_t bytes);
// NOTE(review): by its name this releases a pointer obtained from
// br_device_malloc -- confirm.
void br_device_free(void* ptr);
// get physical address from ptr(virtual)
void* get_phy_ptr(const void* ptr);
}

DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion) {
Expand All @@ -206,47 +208,60 @@ DIPU_API bool isPinnedPtr(const void* p) { return false; }
// (asynchronous) set val
// Issues suMemsetAsync on `stream` for `size` bytes with value `val`.
// `ptr` is a virtual address: it is translated with get_phy_ptr() and only
// the translated address is passed to the SUPA call. The memset is issued
// exactly once; no call is made with the untranslated pointer.
DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val,
                          size_t size) {
  auto phy_gpu_addr = get_phy_ptr(ptr);
  SUPA_CALL(suMemsetAsync(phy_gpu_addr, val, size, stream));
}

// (synchronous) copy from device to a device
// Copies `nbytes` bytes from `src` to `dst` with suMemcpy
// (suMemcpyDeviceToDevice). `dstDevId` and `srcDevId` are not used by this
// implementation.
DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst,
                         deviceId_t srcDevId, const void* src) {
  // SUPA uses Unified Virtual Address
  // Both pointers are virtual, so each is translated with get_phy_ptr() and
  // the copy is issued once, on the translated addresses only.
  auto phy_src_gpu_addr = get_phy_ptr(src);
  auto phy_dst_gpu_addr = get_phy_ptr(dst);
  SUPA_CALL(suMemcpy(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes,
                     suMemcpyDeviceToDevice));
}

// (synchronous) copy from host to a device
// Copies `nbytes` bytes from host pointer `src` to device pointer `dst` with
// suMemcpy (suMemcpyHostToDevice). Only the device-side pointer is
// translated with get_phy_ptr(); the host pointer is passed through as is.
// The copy is issued once, never with the untranslated device pointer.
DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst,
                         /*Host srcDev,*/ const void* src) {
  auto phy_dst_gpu_addr = get_phy_ptr(dst);
  SUPA_CALL(suMemcpy(phy_dst_gpu_addr, src, nbytes, suMemcpyHostToDevice));
}

// (synchronous) copy from a device to host
// Copies `nbytes` bytes from device pointer `src` to host pointer `dst` with
// suMemcpy (suMemcpyDeviceToHost). Only the device-side pointer is
// translated with get_phy_ptr(); the host pointer is passed through as is.
// The copy is issued once, never with the untranslated device pointer.
DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst,
                         /*deviceId_t srcDevId,*/ const void* src) {
  auto phy_src_gpu_addr = get_phy_ptr(src);
  SUPA_CALL(suMemcpy(dst, phy_src_gpu_addr, nbytes, suMemcpyDeviceToHost));
}

// (asynchronous) copy from device to a device
// Enqueues on `stream` a suMemcpyAsync (suMemcpyDeviceToDevice) of `nbytes`
// bytes from `src` to `dst`. `dstDevId` and `srcDevId` are not used by this
// implementation. Both pointers are translated with get_phy_ptr() and the
// copy is enqueued once, on the translated addresses only.
DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes,
                              deviceId_t dstDevId, void* dst,
                              deviceId_t srcDevId, const void* src) {
  auto phy_src_gpu_addr = get_phy_ptr(src);
  auto phy_dst_gpu_addr = get_phy_ptr(dst);
  SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes, stream,
                          suMemcpyDeviceToDevice));
}

// (asynchronous) copy from host to a device
// Enqueues on `stream` a suMemcpyAsync (suMemcpyHostToDevice) of `nbytes`
// bytes from host pointer `src` to device pointer `dst`. Only the
// device-side pointer is translated with get_phy_ptr(); the copy is
// enqueued once, never with the untranslated device pointer.
DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes,
                              /*deviceId_t dstDevId,*/ void* dst,
                              /*Host srcDev,*/ const void* src) {
  auto phy_dst_gpu_addr = get_phy_ptr(dst);
  SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, src, nbytes, stream,
                          suMemcpyHostToDevice));
}

// (asynchronous) copy from a device to host
// Enqueues on `stream` a suMemcpyAsync (suMemcpyDeviceToHost) of `nbytes`
// bytes from device pointer `src` to host pointer `dst`. Only the
// device-side pointer is translated with get_phy_ptr(); the copy is
// enqueued once, never with the untranslated device pointer.
DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes,
                              /*Host dstDev,*/ void* dst,
                              /*deviceId_t srcDevId,*/ const void* src) {
  auto phy_src_gpu_addr = get_phy_ptr(src);
  SUPA_CALL(suMemcpyAsync(dst, phy_src_gpu_addr, nbytes, stream,
                          suMemcpyDeviceToHost));
}
} // end namespace devapis
} // end namespace dipu

0 comments on commit 49f6556

Please sign in to comment.