Fdy/enhance copy #430

Merged
merged 37 commits, Nov 27, 2023

Changes from 29 commits

Commits (37)
a1c0b3d
mv vopy file path
fandaoyi Nov 9, 2023
bd7b21d
add new copy
fandaoyi Nov 14, 2023
85055da
fix static param err
fandaoyi Nov 14, 2023
e3e45db
fix copy err
fandaoyi Nov 15, 2023
b37a5ae
fix direct copy bug
fandaoyi Nov 20, 2023
a387a75
rm unused bcast template name
fandaoyi Nov 20, 2023
c048741
Merge branch 'main' into fdy/enhance_copy
fandaoyi Nov 20, 2023
19357b0
change clang format
fandaoyi Nov 20, 2023
a3553b3
change name hpp
fandaoyi Nov 20, 2023
8830683
rm unused header file
fandaoyi Nov 20, 2023
b5f4140
remove unused header 2
fandaoyi Nov 20, 2023
bf3ad69
change override behavior
fandaoyi Nov 20, 2023
8ac7ae5
change comment
fandaoyi Nov 20, 2023
45d92ea
change cudacopy
fandaoyi Nov 20, 2023
6624aeb
fix d2d copy err
fandaoyi Nov 20, 2023
105e11f
Merge branch 'fdy/enhance_copy' of https://github.com/DeepLink-org/DI…
fandaoyi Nov 20, 2023
f13e113
change register to use autogen
fandaoyi Nov 20, 2023
68c82b8
Merge branch 'main' into fdy/enhance_copy
fandaoyi Nov 20, 2023
748eefb
revert incorrect format
fandaoyi Nov 20, 2023
b87c91e
config fallback
fandaoyi Nov 21, 2023
e7d4dbc
fix link err
fandaoyi Nov 21, 2023
cb75e87
fix comment wanglei
fandaoyi Nov 21, 2023
64fd010
add newline
fandaoyi Nov 21, 2023
e3d1071
fix cpu copy err
fandaoyi Nov 21, 2023
e8eb3c2
add camb vendor copy
fandaoyi Nov 21, 2023
3f58ded
fix copy err
fandaoyi Nov 22, 2023
e0a47e5
fix copy err 2
fandaoyi Nov 22, 2023
953f724
Merge branch 'main' into fdy/enhance_copy
fandaoyi Nov 22, 2023
2ffb43f
fix compile err
fandaoyi Nov 22, 2023
9a11353
fix lingjie comment1
fandaoyi Nov 22, 2023
784e9cb
fix caikun comment
fandaoyi Nov 22, 2023
6fb980c
fix camb ci
fandaoyi Nov 22, 2023
2f241f2
fix camb ci
fandaoyi Nov 23, 2023
3740006
fix device switch err
fandaoyi Nov 23, 2023
081eaa4
fix ling jie caikun comment 2
fandaoyi Nov 27, 2023
ec74958
fix comment incorrect local ref
fandaoyi Nov 27, 2023
5919424
change init copy
fandaoyi Nov 27, 2023
72 changes: 17 additions & 55 deletions dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -2148,75 +2148,37 @@
return out;
interface: diopiNorm(ctx, out, self, p, dimDiopiSize);

- schema: "to.dtype(Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)"
# wrap_diopi_cast_dtype has no corresponding aten op and is not registered; it is just a diopi func wrapper.
# This trick makes it possible to call multiple diopi ops from one aten op.
- schema: "wrap_diopi_cast_dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a)"
register_op: False
custom_code_at_the_beginning: |
auto out = at::empty_like(self, self.options().dtype(dtype));
interface: diopiCastDtype(ctx, out, self);
custom_code_before_return: |
if (memory_format.has_value()) {
auto out1 = at::empty_like(out, out.options(), memory_format.value());
at::copy(out1, out, non_blocking);
out = out1;
}
if (!non_blocking) {
dipu::getCurrentDIPUStream().synchronize();
}

- schema: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
# a diopi func wrapper.
- schema: wrap_diopi_copy_inp(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
register_op: False
no_device_check_args: [self, src]
device: [not_for_any_now] #todo
ins: [srcTemp]
interface: diopiCopyInp(ctx, src, self)

# This copy_ aten op may use both diopiCastDtype and diopiCopyInp; it is a proxy/composite op.
- schema: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
dummy_call_diopi: True
custom_fallback: True
device: [cuda, camb, ascend, droplet, supa]
custom_code_at_the_beginning: |
dipu::DIPUGuard guard(self.is_cpu() ? src.device() : self.device());
auto stream = dipu::getCurrentDIPUStream();
auto srcTemp = self.dtype() == src.dtype() ? src : src.to(self.dtype());
srcTemp = (srcTemp.numel() == self.numel()) ? srcTemp : srcTemp.expand(self.sizes());
if (non_blocking) {
const bool is_default_stream = dipu::getDefaultDIPUStream() == stream;
if (self.is_cpu()) {
if (self.options().pinned_memory()) {
self.record_stream(stream);
}
} else if (!is_default_stream){
self.record_stream(stream);
}
if (srcTemp.is_cpu()) {
if (srcTemp.options().pinned_memory()) {
srcTemp.record_stream(stream);
}
} else if (!is_default_stream) {
srcTemp.record_stream(stream);
}
}
if (self.device().type() != srcTemp.device().type()) {
srcTemp = srcTemp.is_contiguous(self.suggest_memory_format()) ? srcTemp : srcTemp.contiguous(self.suggest_memory_format());
if (srcTemp.is_cpu() && (!self.is_cpu())) {
// c2d
dipu::devproxy::memCopyH2DAsync(stream.rawstream(), self.nbytes(), self.data_ptr(), srcTemp.data_ptr());
} else if ((!srcTemp.is_cpu()) && self.is_cpu()) {
// d2c
dipu::devproxy::memCopyD2HAsync(stream.rawstream(), self.nbytes(), self.data_ptr(), srcTemp.data_ptr());
}
if (!non_blocking) {
dipu::getCurrentDIPUStream().synchronize();
}

return self;
}
dipu::getDipuCopyClass()->run(self, src, non_blocking);
return self;
// need to add a [composite] attr? the code after this point is never used.
interface: diopiCopyInp(ctx, srcTemp, self)
custom_code_before_return: |
if (!non_blocking) {
dipu::getCurrentDIPUStream().synchronize();
}

# for vendors that have neither a fully implemented diopi copy nor a proper fallback DIPUCopy subclass
- schema: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
no_device_check_args: [self, src]
custom_fallback: True
dummy_call_diopi: True
custom_code_at_the_beginning: |
return custom_fallback_dipu_copy_(self, src, non_blocking);
device: [topsrider]
interface: diopiCopyInp(ctx, src, self)

- schema: _amp_foreach_non_finite_check_and_unscale_(at::TensorList self, Tensor(b!) found_inf, Tensor inv_scale) -> void
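Taken together, the copy_ schemas above turn the aten-level copy into a thin composite: dtype and shape are normalized, streams are handled for non-blocking transfers, a raw async memcpy is used when source and destination live on different device types, and everything else is delegated to the overridable DIPUCopy class. The snippet below is a condensed C++ sketch of that bookkeeping, assembled only from calls that appear verbatim in this diff (DIPUGuard, getCurrentDIPUStream, devproxy::memCopy*Async, getDipuCopyClass). It is an illustration, not the autogenerated wrapper, and it omits the pinned-memory record_stream handling shown in the YAML.

```cpp
// Condensed sketch of the composite copy_ flow described by the YAML above.
// Not the autogenerated wrapper; the DIPU runtime headers declaring DIPUGuard,
// DIPU streams and devproxy are assumed to be available in-tree.
#include <ATen/ATen.h>

#include "csrc_dipu/aten/ops/DIPUCopy.hpp"

namespace {

at::Tensor& composite_copy_sketch(at::Tensor& self, const at::Tensor& src,
                                  bool non_blocking) {
  // Switch to the device side of the pair (src if self lives on the CPU).
  dipu::DIPUGuard guard(self.is_cpu() ? src.device() : self.device());
  auto stream = dipu::getCurrentDIPUStream();

  // Normalize dtype and shape so only a same-typed, same-sized copy remains.
  auto srcTemp = self.dtype() == src.dtype() ? src : src.to(self.dtype());
  srcTemp = srcTemp.numel() == self.numel() ? srcTemp
                                            : srcTemp.expand(self.sizes());

  if (self.device().type() != srcTemp.device().type()) {
    // Different device types: raw async memcpy on the current stream.
    srcTemp = srcTemp.contiguous(self.suggest_memory_format());
    if (srcTemp.is_cpu() && !self.is_cpu()) {
      // c2d: host-to-device
      dipu::devproxy::memCopyH2DAsync(stream.rawstream(), self.nbytes(),
                                      self.data_ptr(), srcTemp.data_ptr());
    } else if (!srcTemp.is_cpu() && self.is_cpu()) {
      // d2c: device-to-host
      dipu::devproxy::memCopyD2HAsync(stream.rawstream(), self.nbytes(),
                                      self.data_ptr(), srcTemp.data_ptr());
    }
  } else {
    // Same device type: delegate to the vendor-overridable copy class.
    // (The wrapper generated by this PR passes the original src and leaves
    // the normalization above to the copy class as well.)
    dipu::getDipuCopyClass()->run(self, srcTemp, non_blocking);
  }

  if (!non_blocking) {
    // Blocking semantics: wait for the copy stream before returning.
    dipu::getCurrentDIPUStream().synchronize();
  }
  return self;
}

}  // namespace
```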
@@ -15,6 +15,7 @@
#include "csrc_dipu/profiler/profiler.h"
#include <csrc_dipu/utils/Log.h>
#include "CustomFallbackFunctions.hpp"
#include "csrc_dipu/aten/ops/DIPUCopy.hpp"

$header_include_code

5 changes: 4 additions & 1 deletion dipu/torch_dipu/csrc_dipu/CMakeLists.txt
@@ -20,6 +20,8 @@ set(DIPU_AUTOGEN_DIOPI_WRAPPER_SCRIPT
"${DIPU_AUTOGEN_DIOPI_WRAPPER_SOURCE_DIR}/autogen_diopi_wrapper.py")
set(DIPU_AUTOGEN_DIOPI_WRAPPER_CONFIG
"${DIPU_AUTOGEN_DIOPI_WRAPPER_SOURCE_DIR}/diopi_functions.yaml")
set(DIPU_AUTOGEN_DIOPI_WRAPPER_TEMPLATE
"${DIPU_AUTOGEN_DIOPI_WRAPPER_SOURCE_DIR}/diopi_wrapper_template.py")
set(DIPU_AUTOGENED_KERNELS_CPP
"${CMAKE_CURRENT_SOURCE_DIR}/aten/ops/AutoGenedKernels.cpp")
add_custom_command(
@@ -31,7 +33,8 @@ add_custom_command(
--print_op_arg True --fun_config_dict
'{\"current_device\": \"${UsedVendor}\"}'
DEPENDS ${DIPU_AUTOGEN_DIOPI_WRAPPER_SCRIPT}
${DIPU_AUTOGEN_DIOPI_WRAPPER_CONFIG})
${DIPU_AUTOGEN_DIOPI_WRAPPER_CONFIG}
${DIPU_AUTOGEN_DIOPI_WRAPPER_TEMPLATE})
add_custom_target(autogen_diopi_kernels_cpp
DEPENDS ${DIPU_AUTOGENED_KERNELS_CPP})
add_dependencies(${DIPU_AUTOGENED_KERNELS} autogen_diopi_kernels_cpp)
3 changes: 0 additions & 3 deletions dipu/torch_dipu/csrc_dipu/aten/DIPUATenFunctions.h
@@ -33,9 +33,6 @@ struct DIPUATenFunctions {
c10::optional<at::Device> device_opt,
c10::optional<bool> pin_memory_opt);

static at::Tensor& copy_(at::Tensor& self, const at::Tensor& src,
bool non_blocking);

static const at::Tensor& resize_(
const at::Tensor& self, at::IntArrayRef size,
c10::optional<at::MemoryFormat> memory_format);
1 change: 0 additions & 1 deletion dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp
@@ -13,7 +13,6 @@
#include <csrc_dipu/aten/DIPUATenFunctions.h>
#include <csrc_dipu/base/basedef.h>
#include <csrc_dipu/profiler/profiler.h>
#include <csrc_dipu/runtime/core/DIPUCopyInplace.h>

using dnative = dipu::native::DIPUATenFunctions;

203 changes: 0 additions & 203 deletions dipu/torch_dipu/csrc_dipu/aten/ops/CopyKernel.cpp

This file was deleted.

34 changes: 2 additions & 32 deletions dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp
@@ -1,7 +1,6 @@
#pragma once

#include "csrc_dipu/aten/RegisterDIPU.hpp"
#include <csrc_dipu/runtime/core/DIPUCopyInplace.h>

#include "OpUtils.hpp"

@@ -316,37 +315,8 @@ custom_fallback_dipu_native_batch_norm_backward(
return std::tie(grad_input, grad_weight, grad_bias);
}

static at::Tensor& custom_fallback_dipu_copy_(at::Tensor& self,
const at::Tensor& src,
bool non_blocking) {
DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=copy_" << std::endl);
dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__);
static bool use_slow_copy = (std::getenv("DIPU_USE_SLOW_COPY") != nullptr);
dipu::DIPUGuard guard(self.is_cpu() ? src.device() : self.device());
if (non_blocking) {
auto stream = dipu::getCurrentDIPUStream();
const bool is_default_stream = dipu::getDefaultDIPUStream() == stream;
if (self.is_cpu()) {
if (self.options().pinned_memory()) {
self.record_stream(stream);
}
} else if (!is_default_stream) {
self.record_stream(stream);
}
if (src.is_cpu()) {
if (src.options().pinned_memory()) {
src.record_stream(stream);
}
} else if (!is_default_stream) {
src.record_stream(stream);
}
}
if (use_slow_copy) {
return dipu::native::DIPUATenFunctions::copy_(self, src, non_blocking);
} else {
return dipu::getDipuCopyInplace()->run(self, src, non_blocking);
}
}
at::Tensor& custom_fallback_dipu_copy_(at::Tensor& self, const at::Tensor& src,
bool non_blocking);

void custom_fallback_dipu__amp_foreach_non_finite_check_and_unscale_(
at::TensorList scaled_grads, at::Tensor& found_inf,
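With the inline body removed, CustomFallbackFunctions.hpp now only declares custom_fallback_dipu_copy_; its definition lives in a source file that is not shown in this view. Below is a minimal, hypothetical sketch of such an out-of-line definition, assuming it simply forwards to the copy class used by the new copy_ schema (dipu::getDipuCopyClass()); the actual definition added by this PR may keep additional logic such as the DIPU_USE_SLOW_COPY escape hatch from the deleted inline version.

```cpp
// Hypothetical out-of-line definition matching the declaration kept in
// CustomFallbackFunctions.hpp. Illustrative only; the real definition added
// by this PR is not visible in this diff view.
#include <cstdlib>

#include <ATen/ATen.h>

#include "csrc_dipu/aten/ops/DIPUCopy.hpp"

at::Tensor& custom_fallback_dipu_copy_(at::Tensor& self, const at::Tensor& src,
                                       bool non_blocking) {
  // Pattern kept from the deleted inline version: read the escape-hatch
  // environment flag once and cache it in a function-local static. The slow
  // CPU path it used to select is elided in this sketch.
  static const bool use_slow_copy =
      (std::getenv("DIPU_USE_SLOW_COPY") != nullptr);
  (void)use_slow_copy;

  // Delegate to the vendor-overridable copy object introduced by this PR.
  dipu::getDipuCopyClass()->run(self, src, non_blocking);
  return self;
}
```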