Fdy/enhance copy #430

Merged
merged 37 commits, Nov 27, 2023

Changes from 29 commits

Commits (37)
a1c0b3d
mv vopy file path
fandaoyi Nov 9, 2023
bd7b21d
add new copy
fandaoyi Nov 14, 2023
85055da
fix static param err
fandaoyi Nov 14, 2023
e3e45db
fix copy err
fandaoyi Nov 15, 2023
b37a5ae
fix direct copy bug
fandaoyi Nov 20, 2023
a387a75
rm unused bcast template name
fandaoyi Nov 20, 2023
c048741
Merge branch 'main' into fdy/enhance_copy
fandaoyi Nov 20, 2023
19357b0
change clang format
fandaoyi Nov 20, 2023
a3553b3
change name hpp
fandaoyi Nov 20, 2023
8830683
rm unused header file
fandaoyi Nov 20, 2023
b5f4140
remove unused header 2
fandaoyi Nov 20, 2023
bf3ad69
change override behavior
fandaoyi Nov 20, 2023
8ac7ae5
change comment
fandaoyi Nov 20, 2023
45d92ea
change cudacopy
fandaoyi Nov 20, 2023
6624aeb
fix d2d copy err
fandaoyi Nov 20, 2023
105e11f
Merge branch 'fdy/enhance_copy' of https://github.com/DeepLink-org/DI…
fandaoyi Nov 20, 2023
f13e113
change register to use autogen
fandaoyi Nov 20, 2023
68c82b8
Merge branch 'main' into fdy/enhance_copy
fandaoyi Nov 20, 2023
748eefb
revert incorrect format
fandaoyi Nov 20, 2023
b87c91e
config fallback
fandaoyi Nov 21, 2023
e7d4dbc
fix link err
fandaoyi Nov 21, 2023
cb75e87
fix comment wanglei
fandaoyi Nov 21, 2023
64fd010
add newline
fandaoyi Nov 21, 2023
e3d1071
fix cpu copy err
fandaoyi Nov 21, 2023
e8eb3c2
add camb vendor copy
fandaoyi Nov 21, 2023
3f58ded
fix copy err
fandaoyi Nov 22, 2023
e0a47e5
fix copy err 2
fandaoyi Nov 22, 2023
953f724
Merge branch 'main' into fdy/enhance_copy
fandaoyi Nov 22, 2023
2ffb43f
fix compile err
fandaoyi Nov 22, 2023
9a11353
fix lingjie comment1
fandaoyi Nov 22, 2023
784e9cb
fix caikun comment
fandaoyi Nov 22, 2023
6fb980c
fix camb ci
fandaoyi Nov 22, 2023
2f241f2
fix camb ci
fandaoyi Nov 23, 2023
3740006
fix device switch err
fandaoyi Nov 23, 2023
081eaa4
fix ling jie caikun comment 2
fandaoyi Nov 27, 2023
ec74958
fix comment incorrect local ref
fandaoyi Nov 27, 2023
5919424
change init copy
fandaoyi Nov 27, 2023
72 changes: 17 additions & 55 deletions dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -2148,75 +2148,37 @@
return out;
interface: diopiNorm(ctx, out, self, p, dimDiopiSize);

- schema: "to.dtype(Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)"
# wrap_diopi_cast_dtype has no corresponding aten op and is not registered; it is just a diopi func wrapper.
# This trick makes it possible to call multiple diopi ops from one aten op.
- schema: "wrap_diopi_cast_dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a)"
register_op: False
custom_code_at_the_beginning: |
auto out = at::empty_like(self, self.options().dtype(dtype));
interface: diopiCastDtype(ctx, out, self);
custom_code_before_return: |
if (memory_format.has_value()) {
auto out1 = at::empty_like(out, out.options(), memory_format.value());
at::copy(out1, out, non_blocking);
out = out1;
}
if (!non_blocking) {
dipu::getCurrentDIPUStream().synchronize();
}

- schema: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
# a diopi func wrapper.
- schema: wrap_diopi_copy_inp(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
register_op: False
no_device_check_args: [self, src]
device: [not_for_any_now] #todo
ins: [srcTemp]
interface: diopiCopyInp(ctx, src, self)

# This copy_ aten op may use both diopiCastDtype and diopiCopyInp; it is a proxy/composite op.
- schema: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
dummy_call_diopi: True
custom_fallback: True
device: [cuda, camb, ascend, droplet, supa]
custom_code_at_the_beginning: |
dipu::DIPUGuard guard(self.is_cpu() ? src.device() : self.device());
auto stream = dipu::getCurrentDIPUStream();
auto srcTemp = self.dtype() == src.dtype() ? src : src.to(self.dtype());
srcTemp = (srcTemp.numel() == self.numel()) ? srcTemp : srcTemp.expand(self.sizes());
if (non_blocking) {
const bool is_default_stream = dipu::getDefaultDIPUStream() == stream;
if (self.is_cpu()) {
if (self.options().pinned_memory()) {
self.record_stream(stream);
}
} else if (!is_default_stream){
self.record_stream(stream);
}
if (srcTemp.is_cpu()) {
if (srcTemp.options().pinned_memory()) {
srcTemp.record_stream(stream);
}
} else if (!is_default_stream) {
srcTemp.record_stream(stream);
}
}
if (self.device().type() != srcTemp.device().type()) {
srcTemp = srcTemp.is_contiguous(self.suggest_memory_format()) ? srcTemp : srcTemp.contiguous(self.suggest_memory_format());
if (srcTemp.is_cpu() && (!self.is_cpu())) {
// c2d
dipu::devproxy::memCopyH2DAsync(stream.rawstream(), self.nbytes(), self.data_ptr(), srcTemp.data_ptr());
} else if ((!srcTemp.is_cpu()) && self.is_cpu()) {
// d2c
dipu::devproxy::memCopyD2HAsync(stream.rawstream(), self.nbytes(), self.data_ptr(), srcTemp.data_ptr());
}
if (!non_blocking) {
dipu::getCurrentDIPUStream().synchronize();
}

return self;
}
dipu::getDipuCopyClass()->run(self, src, non_blocking);
return self;
// need to add a [composite] attr? the code after this point is never used.
interface: diopiCopyInp(ctx, srcTemp, self)
custom_code_before_return: |
if (!non_blocking) {
dipu::getCurrentDIPUStream().synchronize();
}

# for vendors that have neither a fully implemented diopi copy nor a proper fallback DIPUCopy subclass
- schema: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
no_device_check_args: [self, src]
custom_fallback: True
dummy_call_diopi: True
custom_code_at_the_beginning: |
return custom_fallback_dipu_copy_(self, src, non_blocking);
device: [topsrider]
interface: diopiCopyInp(ctx, src, self)

- schema: _amp_foreach_non_finite_check_and_unscale_(at::TensorList self, Tensor(b!) found_inf, Tensor inv_scale) -> void
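Taken together, the copy_ schemas above turn the aten-level copy into a thin composite: dtype and shape are normalized, streams are handled for non-blocking transfers, a raw async memcpy is used when source and destination live on different device types, and everything else is delegated to the overridable DIPUCopy class. The snippet below is a condensed C++ sketch of that bookkeeping, assembled only from calls that appear verbatim in this diff (DIPUGuard, getCurrentDIPUStream, devproxy::memCopy*Async, getDipuCopyClass). It is an illustration, not the autogenerated wrapper, and it omits the pinned-memory record_stream handling shown in the YAML.

```cpp
// Condensed sketch of the composite copy_ flow described by the YAML above.
// Not the autogenerated wrapper; the DIPU runtime headers declaring DIPUGuard,
// DIPU streams and devproxy are assumed to be available in-tree.
#include <ATen/ATen.h>

#include "csrc_dipu/aten/ops/DIPUCopy.hpp"

namespace {

at::Tensor& composite_copy_sketch(at::Tensor& self, const at::Tensor& src,
                                  bool non_blocking) {
  // Switch to the device side of the pair (src if self lives on the CPU).
  dipu::DIPUGuard guard(self.is_cpu() ? src.device() : self.device());
  auto stream = dipu::getCurrentDIPUStream();

  // Normalize dtype and shape so only a same-typed, same-sized copy remains.
  auto srcTemp = self.dtype() == src.dtype() ? src : src.to(self.dtype());
  srcTemp = srcTemp.numel() == self.numel() ? srcTemp
                                            : srcTemp.expand(self.sizes());

  if (self.device().type() != srcTemp.device().type()) {
    // Different device types: raw async memcpy on the current stream.
    srcTemp = srcTemp.contiguous(self.suggest_memory_format());
    if (srcTemp.is_cpu() && !self.is_cpu()) {
      // c2d: host-to-device
      dipu::devproxy::memCopyH2DAsync(stream.rawstream(), self.nbytes(),
                                      self.data_ptr(), srcTemp.data_ptr());
    } else if (!srcTemp.is_cpu() && self.is_cpu()) {
      // d2c: device-to-host
      dipu::devproxy::memCopyD2HAsync(stream.rawstream(), self.nbytes(),
                                      self.data_ptr(), srcTemp.data_ptr());
    }
  } else {
    // Same device type: delegate to the vendor-overridable copy class.
    // (The wrapper generated by this PR passes the original src and leaves
    // the normalization above to the copy class as well.)
    dipu::getDipuCopyClass()->run(self, srcTemp, non_blocking);
  }

  if (!non_blocking) {
    // Blocking semantics: wait for the copy stream before returning.
    dipu::getCurrentDIPUStream().synchronize();
  }
  return self;
}

}  // namespace
```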
@@ -15,6 +15,7 @@
#include "csrc_dipu/profiler/profiler.h"
#include <csrc_dipu/utils/Log.h>
#include "CustomFallbackFunctions.hpp"
#include "csrc_dipu/aten/ops/DIPUCopy.hpp"

$header_include_code

5 changes: 4 additions & 1 deletion dipu/torch_dipu/csrc_dipu/CMakeLists.txt
@@ -20,6 +20,8 @@ set(DIPU_AUTOGEN_DIOPI_WRAPPER_SCRIPT
"${DIPU_AUTOGEN_DIOPI_WRAPPER_SOURCE_DIR}/autogen_diopi_wrapper.py")
set(DIPU_AUTOGEN_DIOPI_WRAPPER_CONFIG
"${DIPU_AUTOGEN_DIOPI_WRAPPER_SOURCE_DIR}/diopi_functions.yaml")
set(DIPU_AUTOGEN_DIOPI_WRAPPER_TEMPLATE
"${DIPU_AUTOGEN_DIOPI_WRAPPER_SOURCE_DIR}/diopi_wrapper_template.py")
set(DIPU_AUTOGENED_KERNELS_CPP
"${CMAKE_CURRENT_SOURCE_DIR}/aten/ops/AutoGenedKernels.cpp")
add_custom_command(
@@ -31,7 +33,8 @@ add_custom_command(
--print_op_arg True --fun_config_dict
'{\"current_device\": \"${UsedVendor}\"}'
DEPENDS ${DIPU_AUTOGEN_DIOPI_WRAPPER_SCRIPT}
${DIPU_AUTOGEN_DIOPI_WRAPPER_CONFIG})
${DIPU_AUTOGEN_DIOPI_WRAPPER_CONFIG}
${DIPU_AUTOGEN_DIOPI_WRAPPER_TEMPLATE})
add_custom_target(autogen_diopi_kernels_cpp
DEPENDS ${DIPU_AUTOGENED_KERNELS_CPP})
add_dependencies(${DIPU_AUTOGENED_KERNELS} autogen_diopi_kernels_cpp)
3 changes: 0 additions & 3 deletions dipu/torch_dipu/csrc_dipu/aten/DIPUATenFunctions.h
@@ -33,9 +33,6 @@ struct DIPUATenFunctions {
c10::optional<at::Device> device_opt,
c10::optional<bool> pin_memory_opt);

static at::Tensor& copy_(at::Tensor& self, const at::Tensor& src,
bool non_blocking);

static const at::Tensor& resize_(
const at::Tensor& self, at::IntArrayRef size,
c10::optional<at::MemoryFormat> memory_format);
1 change: 0 additions & 1 deletion dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp
@@ -13,7 +13,6 @@
#include <csrc_dipu/aten/DIPUATenFunctions.h>
#include <csrc_dipu/base/basedef.h>
#include <csrc_dipu/profiler/profiler.h>
#include <csrc_dipu/runtime/core/DIPUCopyInplace.h>

using dnative = dipu::native::DIPUATenFunctions;

203 changes: 0 additions & 203 deletions dipu/torch_dipu/csrc_dipu/aten/ops/CopyKernel.cpp

This file was deleted.

34 changes: 2 additions & 32 deletions dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp
@@ -1,7 +1,6 @@
#pragma once

#include "csrc_dipu/aten/RegisterDIPU.hpp"
#include <csrc_dipu/runtime/core/DIPUCopyInplace.h>

#include "OpUtils.hpp"

@@ -316,37 +315,8 @@ custom_fallback_dipu_native_batch_norm_backward(
return std::tie(grad_input, grad_weight, grad_bias);
}

static at::Tensor& custom_fallback_dipu_copy_(at::Tensor& self,
const at::Tensor& src,
bool non_blocking) {
DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=copy_" << std::endl);
dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__);
static bool use_slow_copy = (std::getenv("DIPU_USE_SLOW_COPY") != nullptr);
dipu::DIPUGuard guard(self.is_cpu() ? src.device() : self.device());
if (non_blocking) {
auto stream = dipu::getCurrentDIPUStream();
const bool is_default_stream = dipu::getDefaultDIPUStream() == stream;
if (self.is_cpu()) {
if (self.options().pinned_memory()) {
self.record_stream(stream);
}
} else if (!is_default_stream) {
self.record_stream(stream);
}
if (src.is_cpu()) {
if (src.options().pinned_memory()) {
src.record_stream(stream);
}
} else if (!is_default_stream) {
src.record_stream(stream);
}
}
if (use_slow_copy) {
return dipu::native::DIPUATenFunctions::copy_(self, src, non_blocking);
} else {
return dipu::getDipuCopyInplace()->run(self, src, non_blocking);
}
}
at::Tensor& custom_fallback_dipu_copy_(at::Tensor& self, const at::Tensor& src,
bool non_blocking);

void custom_fallback_dipu__amp_foreach_non_finite_check_and_unscale_(
at::TensorList scaled_grads, at::Tensor& found_inf,
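With the inline body removed, CustomFallbackFunctions.hpp now only declares custom_fallback_dipu_copy_; its definition lives in a source file that is not shown in this view. Below is a minimal, hypothetical sketch of such an out-of-line definition, assuming it simply forwards to the copy class used by the new copy_ schema (dipu::getDipuCopyClass()); the actual definition added by this PR may keep additional logic such as the DIPU_USE_SLOW_COPY escape hatch from the deleted inline version.

```cpp
// Hypothetical out-of-line definition matching the declaration kept in
// CustomFallbackFunctions.hpp. Illustrative only; the real definition added
// by this PR is not visible in this diff view.
#include <cstdlib>

#include <ATen/ATen.h>

#include "csrc_dipu/aten/ops/DIPUCopy.hpp"

at::Tensor& custom_fallback_dipu_copy_(at::Tensor& self, const at::Tensor& src,
                                       bool non_blocking) {
  // Pattern kept from the deleted inline version: read the escape-hatch
  // environment flag once and cache it in a function-local static. The slow
  // CPU path it used to select is elided in this sketch.
  static const bool use_slow_copy =
      (std::getenv("DIPU_USE_SLOW_COPY") != nullptr);
  (void)use_slow_copy;

  // Delegate to the vendor-overridable copy object introduced by this PR.
  dipu::getDipuCopyClass()->run(self, src, non_blocking);
  return self;
}
```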