[FIX] Fix virtual memory error when using SUPA #468

Merged 3 commits on Dec 11, 2023
6 changes: 3 additions & 3 deletions dipu/CMakeLists.txt
@@ -44,7 +44,7 @@ elseif (${DEVICE} IN_LIST DEVICE_TOPSRIDER)
elseif (${DEVICE} IN_LIST DEVICE_SUPA)
set(USE_SUPA ON)
set(UsedVendor supa)
set(DIOPI_IMPL_OPT "")
set(DIOPI_IMPL_OPT "supa")
#SUPA DEVICE DOES NOT NEED TO BUILD DIOPI, so set the target to "" to control the workflow.
elseif (${DEVICE} IN_LIST DEVICE_DROPLET)
set(USE_DROPLET ON)
@@ -81,14 +81,14 @@ if(NOT DEFINED DIPU_ABI_V)
OUTPUT_VARIABLE DIPU_ABI_V)
endif()

if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI)
if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI)
execute_process(
COMMAND
sh -x -c
"python -c 'import torch;print(1 if torch.compiled_with_cxx11_abi() else 0)'"
OUTPUT_VARIABLE DIPU_COMPILED_WITH_CXX11_ABI)
endif()

if(DIPU_COMPILED_WITH_CXX11_ABI GREATER 0)
set(DIPU_COMPILED_WITH_CXX11_ABI 1)
else()
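The `execute_process` block above asks the installed PyTorch whether it was built with the CXX11 ABI; the resulting `DIPU_COMPILED_WITH_CXX11_ABI` flag has to agree with libstdc++'s `_GLIBCXX_USE_CXX11_ABI` macro when DIPU's own C++ sources are compiled, otherwise `std::string`-based symbols typically fail to link against libtorch. A minimal standalone sketch of inspecting that macro on the C++ side (illustrative only, not part of this PR):

```cpp
// Prints which C++ ABI this translation unit was compiled with; the value
// should match what torch.compiled_with_cxx11_abi() reported above.
#include <iostream>
#include <string>  // pulls in the libstdc++ ABI macro

int main() {
#if defined(_GLIBCXX_USE_CXX11_ABI)
  std::cout << "_GLIBCXX_USE_CXX11_ABI=" << _GLIBCXX_USE_CXX11_ABI << '\n';
#else
  std::cout << "not building against libstdc++ (macro undefined)\n";
#endif
  return 0;
}
```

Building the sketch once with the default settings and once with `-D_GLIBCXX_USE_CXX11_ABI=0` shows the two ABI modes that the CMake flag selects between.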
17 changes: 12 additions & 5 deletions dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
@@ -18,11 +18,18 @@ class SUPACopyInplace : public DIPUCopyInpOnDIOPI {
SUPACopyInplace() = default;
~SUPACopyInplace() = default;

// assume it can handle between device.
void copyNodirectBetweenDevices(at::Tensor& dst, const at::Tensor& src,
bool non_blocking,
CopyParamsInfo& info) override {
dipu_wrap_diopi_copy_inp(dst, src, non_blocking);
void run(at::Tensor& dst, const at::Tensor& src, bool non_blocking) override {
auto curStream = dipu::getCurrentDIPUStream();
::diopiContext context(curStream.rawstream());
auto ctx = &context;
auto diopi_src = dipu::diopi_helper::toDiopiTensorHandle(src);
auto diopi_dst = dipu::diopi_helper::toDiopiTensorHandle(dst);
TORCH_CHECK(diopiError_t::diopiSuccess ==
diopiCopyInp(ctx, diopi_src, diopi_dst));
// syncAfterCopy
if (!non_blocking) {
dipu::devapis::syncStream(curStream.rawstream());
}
}
};

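The new `run()` override always enqueues the DIOPI copy on the current DIPU stream and only synchronizes that stream when the caller requested a blocking copy. A self-contained sketch of that policy in plain C++ (no SUPA/DIOPI dependencies; `FakeStream` and `copy_inplace` are illustrative stand-ins, not DIPU APIs):

```cpp
#include <cstdio>
#include <future>
#include <vector>

// Hypothetical stand-in for a device stream plus dipu::devapis::syncStream.
struct FakeStream {
  std::vector<std::future<void>> pending;
  void sync() {
    for (auto& f : pending) f.wait();
    pending.clear();
  }
};

// Mirrors the shape of SUPACopyInplace::run(): enqueue the copy, then sync
// only if the caller asked for a blocking copy.
void copy_inplace(FakeStream& stream, std::vector<int>& dst,
                  const std::vector<int>& src, bool non_blocking) {
  stream.pending.push_back(
      std::async(std::launch::async, [&dst, &src] { dst = src; }));
  if (!non_blocking) {
    stream.sync();  // analogue of the syncAfterCopy branch
  }
}

int main() {
  FakeStream stream;
  std::vector<int> src{1, 2, 3}, dst(3);
  copy_inplace(stream, dst, src, /*non_blocking=*/false);
  std::printf("blocking copy done, dst[1] = %d\n", dst[1]);
  copy_inplace(stream, dst, src, /*non_blocking=*/true);
  stream.sync();  // the caller owns synchronization for non-blocking copies
  return 0;
}
```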
29 changes: 22 additions & 7 deletions dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
@@ -184,6 +184,8 @@ DIPU_API void freeHost(void* p) { free(p); }
extern "C" {
void* br_device_malloc(uint64_t bytes);
void br_device_free(void* ptr);
// get physical address from ptr(virtual)
void* get_phy_ptr(const void* ptr);
}
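The newly declared `get_phy_ptr` is the core of the fix: the pointers handed to these devapis entry points are SUPA virtual addresses, while the `suMemcpy*`/`suMemsetAsync` calls below are now fed the corresponding physical addresses, so every device-side pointer is translated at this boundary. A self-contained sketch of the idea (the mapping table and `get_phy_ptr_sketch` are hypothetical stand-ins; the real lookup lives inside the SUPA runtime):

```cpp
#include <cstdint>
#include <cstdio>
#include <unordered_map>

// Hypothetical virtual -> physical mapping table, standing in for whatever
// bookkeeping br_device_malloc/get_phy_ptr do internally.
std::unordered_map<const void*, void*> g_virt_to_phys;

void* get_phy_ptr_sketch(const void* virt) {
  auto it = g_virt_to_phys.find(virt);
  return it == g_virt_to_phys.end() ? nullptr : it->second;
}

int main() {
  int backing = 42;  // pretend this is the physical device storage
  auto* virt = reinterpret_cast<void*>(std::uintptr_t{0x1000});  // fake virtual handle
  g_virt_to_phys[virt] = &backing;

  // A devapis-style wrapper translates before touching the "hardware":
  int* phys = static_cast<int*>(get_phy_ptr_sketch(virt));
  std::printf("value behind the virtual handle: %d\n", *phys);
  return 0;
}
```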

DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion) {
@@ -206,47 +208,60 @@ DIPU_API bool isPinnedPtr(const void* p) { return false; }
// (asynchronous) set val
DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val,
size_t size) {
SUPA_CALL(suMemsetAsync(ptr, val, size, stream));
auto phy_gpu_addr = get_phy_ptr(ptr);
SUPA_CALL(suMemsetAsync(phy_gpu_addr, val, size, stream));
}

// (synchronous) copy from device to a device
DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst,
deviceId_t srcDevId, const void* src) {
// SUPA uses Unified Virtual Address
SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToDevice));
auto phy_src_gpu_addr = get_phy_ptr(src);
auto phy_dst_gpu_addr = get_phy_ptr(dst);
SUPA_CALL(suMemcpy(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes,
suMemcpyDeviceToDevice));
}

// (synchronous) copy from host to a device
DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst,
/*Host srcDev,*/ const void* src) {
SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyHostToDevice));
auto phy_dst_gpu_addr = get_phy_ptr(dst);
SUPA_CALL(suMemcpy(phy_dst_gpu_addr, src, nbytes, suMemcpyHostToDevice));
}

// (synchronous) copy from a device to host
DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst,
/*deviceId_t srcDevId,*/ const void* src) {
SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToHost));
auto phy_src_gpu_addr = get_phy_ptr(src);
SUPA_CALL(suMemcpy(dst, phy_src_gpu_addr, nbytes, suMemcpyDeviceToHost));
}

// (asynchronous) copy from device to a device
DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes,
deviceId_t dstDevId, void* dst,
deviceId_t srcDevId, const void* src) {
SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToDevice));
auto phy_src_gpu_addr = get_phy_ptr(src);
auto phy_dst_gpu_addr = get_phy_ptr(dst);
SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes, stream,
suMemcpyDeviceToDevice));
}

// (asynchronous) copy from host to a device
DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes,
/*deviceId_t dstDevId,*/ void* dst,
/*Host srcDev,*/ const void* src) {
SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyHostToDevice));
auto phy_dst_gpu_addr = get_phy_ptr(dst);
SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, src, nbytes, stream,
suMemcpyHostToDevice));
}

// (asynchronous) copy from a device to host
DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes,
/*Host dstDev,*/ void* dst,
/*deviceId_t srcDevId,*/ const void* src) {
SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToHost));
auto phy_src_gpu_addr = get_phy_ptr(src);
SUPA_CALL(suMemcpyAsync(dst, phy_src_gpu_addr, nbytes, stream,
suMemcpyDeviceToHost));
}
} // end namespace devapis
} // end namespace dipu
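Across the `memCopy*` family the rule is the same in every direction: only device-side pointers go through `get_phy_ptr`, while host pointers are passed to the runtime unchanged (D2D translates both `dst` and `src`, H2D only `dst`, D2H only `src`). A compact sketch of that direction rule, with `std::memcpy` and an identity `translate` standing in for the SUPA calls:

```cpp
#include <cstdio>
#include <cstring>

enum class CopyKind { H2D, D2H, D2D };

// translate() is an identity stand-in for get_phy_ptr; the point is only
// *which* pointer gets translated for each copy direction.
static void* translate(const void* p) { return const_cast<void*>(p); }

void copy_sketch(void* dst, const void* src, std::size_t nbytes, CopyKind kind) {
  void* d = (kind != CopyKind::D2H) ? translate(dst) : dst;        // dst is on the device for H2D/D2D
  const void* s = (kind != CopyKind::H2D) ? translate(src) : src;  // src is on the device for D2H/D2D
  std::memcpy(d, s, nbytes);                                       // stand-in for suMemcpy/suMemcpyAsync
}

int main() {
  char src[] = "supa";
  char dst[sizeof src] = {};
  copy_sketch(dst, src, sizeof src, CopyKind::H2D);
  std::puts(dst);
  return 0;
}
```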