Fdy/support torch21 #504

Closed · wants to merge 2 commits
2 changes: 2 additions & 0 deletions dipu/SupportedDiopiFunctions.txt
@@ -48,6 +48,8 @@ diopiCastDtype
diopiCat
diopiCdist
diopiCdistBackward
diopiCeil
diopiCeilInp
diopiClamp
diopiClampInp
diopiClampInpScalar
8 changes: 4 additions & 4 deletions dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -643,18 +643,18 @@
::diopiSize_t diopi_size = toDiopiSize(dim);
interface: diopiMean(ctx, out, self_dtype_diopi, diopi_size);

- schema: "std.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor"
- schema: "std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor"
custom_code_at_the_beginning: |
std::vector<int64_t> output_shape = infer_reduce_op_shape(self.sizes(), dim.value_or(std::vector<int64_t>()), keepdim);
auto out = at::empty(output_shape, self.options());
bool unbiased = correction.value_or(1) == 1;
bool unbiased = correction.value_or(1).toLong() == 1;
::diopiSize_t diopi_size = toDiopiSize(dim);
interface: diopiStd(ctx, out, self, diopi_size, unbiased);

- schema: "std.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)"
- schema: "std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)"
custom_code_at_the_beginning: |
::diopiSize_t diopi_size = toDiopiSize(dim);
bool unbiased = correction.value_or(1) == 1;
bool unbiased = correction.value_or(1).toLong() == 1;
interface: diopiStd(ctx, out, self, diopi_size, unbiased);

- schema: "linear_backward(Tensor input, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
5 changes: 3 additions & 2 deletions dipu/tests/python/individual_scripts/test_op_benchmark.py
@@ -58,8 +58,9 @@ def batched_dot_bmm(a, b):
# description is the column
label = "Batched dot"
sub_label = f"[{b}, {n}]"
x = torch.ones((b, n))
for num_threads in [1, 4, 16, 32]:
x = torch.ones((b, n)).cuda()
# CUDA tensor; the actual case does not dispatch with so many threads (dropped 16, 32)
for num_threads in [1, 4]:
results.append(
benchmark.Timer(
stmt="batched_dot_mul_sum(x, x)",
17 changes: 12 additions & 5 deletions dipu/tests/run_nv_tests.sh
@@ -7,8 +7,15 @@ function run_dipu_tests {
unset DIPU_DUMP_OP_ARGS
export PYTHONPATH=${DIPU_ROOT}/../:${PYTHONPATH}
${CDIR}/python/run_tests.sh
echo "fill_.Scalar" >> .dipu_force_fallback_op_list.config
run_test "${PYTORCH_DIR}/test/test_tensor_creation_ops.py" "$@" -v -f TestTensorCreationDIPU # --locals -f
echo "" > .dipu_force_fallback_op_list.config
# run_test "${PYTORCH_DIR}/test/test_reductions.py" "$@" -v -f TestReductionsDIPU

run_test "${PYTORCH_TEST_DIR}/test/nn/test_convolution.py" -v TestConvolutionNNDeviceTypeDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_linalg.py" "$@" -v TestLinalgDIPU

# mocking CUDA causes a test-count mismatch; temporarily ignored
# run_test "${PYTORCH_TEST_DIR}/test/test_testing.py" "$@" -v TestTestParametrizationDeviceTypeDIPU TestTestingDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_type_hints.py" "$@" -v
run_test "${PYTORCH_TEST_DIR}/test/test_type_info.py" "$@" -v
@@ -17,14 +24,14 @@ function run_dipu_tests {
# run_test "${PYTORCH_TEST_DIR}/test/test_binary_ufuncs.py" "$@" -v TestBinaryUfuncsDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_torch.py" "$@" -v TestTorchDeviceTypeDIPU #--subprocess
#run_test "${PYTORCH_TEST_DIR}/test/test_indexing.py" "$@" -v TestIndexingDIPU
#run_test "${PYTORCH_TEST_DIR}/test/test_indexing.py" "$@" -v NumpyTestsDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_view_ops.py" "$@" -v TestViewOpsDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_indexing.py" "$@" -v NumpyTestsDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_view_ops.py" "$@" -v TestViewOpsDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_type_promotion.py" "$@" -v TestTypePromotionDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_nn.py" "$@" -v TestNN
# run_test "${PYTORCH_TEST_DIR}/test/test_ops_fwd_gradients.py" "$@" -v TestFwdGradientsDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_ops_gradients.py" "$@" -v TestBwdGradientsDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_ops_fwd_gradients.py" "$@" -v TestFwdGradientsDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_ops_gradients.py" "$@" -v TestBwdGradientsDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_ops.py" "$@" -v
# run_test "${PYTORCH_TEST_DIR}/test/test_shape_ops.py" "$@" -v TestShapeOpsDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_shape_ops.py" "$@" -v TestShapeOpsDIPU
}

if [ "$LOGFILE" != "" ]; then
2 changes: 1 addition & 1 deletion dipu/torch_dipu/csrc_dipu/CMakeLists.txt
@@ -1,7 +1,7 @@
set(DIPU_LIB torch_dipu)
set(DIPU_PYTHON_LIB torch_dipu_python)
set(DIPU_AUTOGENED_KERNELS torch_dipu_autogened_kernels)

add_definitions(-std=c++17)
# python path
include_directories(SYSTEM ${PYTHON_INCLUDE_DIR})

26 changes: 12 additions & 14 deletions dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.hpp
@@ -2,6 +2,8 @@
#pragma once

#include <deque>
#include <fstream>
#include <iostream>
#include <mutex>

#include <torch/library.h>
@@ -110,20 +112,16 @@ namespace {

#define DIPU_LIBRARY_IMPL(ns, k, m) _DIPU_LIBRARY_IMPL(ns, k, m, C10_UID)

#define _DIPU_LIBRARY_IMPL(ns, k, m, uid) \
static void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
uid)(torch::Library&); \
static const ::at::DIPUOpRegister C10_CONCATENATE( \
DIPU_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)( \
c10::guts::if_constexpr<c10::impl::dispatch_key_allowlist_check( \
c10::DispatchKey::k)>( \
[]() { \
return &C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
uid); \
}, \
[]() { return [](torch::Library&) -> void {}; }), \
#ns, c10::make_optional(c10::DispatchKey::k), __FILE__, __LINE__); \
void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
#define _DIPU_LIBRARY_IMPL(ns, k, m, uid) \
static void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
uid)(torch::Library&); \
static const ::at::DIPUOpRegister C10_CONCATENATE( \
DIPU_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)( \
(c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::k) \
? &C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, uid) \
: [](torch::Library&) -> void {}), \
#ns, c10::make_optional(c10::DispatchKey::k), __FILE__, __LINE__); \
void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
uid)(torch::Library & m)

} // namespace
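
The rewrite of `_DIPU_LIBRARY_IMPL` above exists because `c10::guts::if_constexpr` is no longer available in torch 2.1; the macro now chooses between the real init function and a no-op lambda with an ordinary conditional expression (the captureless lambda decays to the same function-pointer type). A self-contained sketch of that selection pattern, using a toy `InitFn` instead of `torch::Library`:

```cpp
#include <iostream>

using InitFn = void (*)(int&);

static void real_init(int& x) { x = 42; }

// Pick either the real initializer or a do-nothing lambda at registration time;
// both branches have (or convert to) the same function-pointer type.
static InitFn pick_init(bool allowed) {
  return allowed ? &real_init : [](int&) -> void {};
}

int main() {
  int value = 0;
  pick_init(true)(value);
  std::cout << value << '\n';  // 42
}
```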
dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp
@@ -1,4 +1,4 @@
// Copyright (c) 2023, DeepLink.

Check notice on line 1 (GitHub Actions / clang-format): File dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp does not conform to Custom style guidelines. (lines 99)
//
// This file contains definitions of custom fallback functions needed by AMP
// GradScaler. The corresponding declarations can be found in
@@ -87,16 +87,17 @@
"found_inf must be a float tensor.");
if (static_cast<bool>(found_inf.item<float>())) {
current_scale *= backoff_factor;
growth_tracker[0] = 0;
growth_tracker.fill_(c10::Scalar(0));
} else {
// Entering this branch means we just carried out a successful step,
// so growth_tracker is incremented before comparing to growth_interval.
auto successful = growth_tracker.item<int>() + 1;
if (successful == growth_interval) {
current_scale *= growth_factor;
growth_tracker[0] = 0;
growth_tracker.fill_(c10::Scalar(0));
} else {
growth_tracker[0] = successful;
// growth_tracker is a 0-dim (scalar) tensor in torch 2.1; in 2.0 it was a size-1 tensor.
growth_tracker.fill_(c10::Scalar(successful));
}
}
return current_scale;
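
The move from `growth_tracker[0] = ...` to `growth_tracker.fill_(...)` follows the comment in the diff: torch 2.1's GradScaler keeps `growth_tracker` as a 0-dim tensor, and `operator[]` (a `select`) cannot be applied to a 0-dim tensor, whereas `fill_` works for both layouts. A small illustration, assuming ATen is available:

```cpp
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor tracker_20 = at::zeros({1}, at::kInt);        // torch 2.0 layout: size-1 tensor
  at::Tensor tracker_21 = at::scalar_tensor(0, at::kInt);  // torch 2.1 layout: 0-dim tensor

  // tracker_21[0] = 7;  // would throw: select() cannot be applied to a 0-dim tensor
  tracker_20.fill_(c10::Scalar(7));  // fill_ handles both shapes
  tracker_21.fill_(c10::Scalar(7));

  std::cout << tracker_20.item<int>() << " " << tracker_21.item<int>() << "\n";  // 7 7
}
```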
3 changes: 2 additions & 1 deletion dipu/torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp
@@ -117,7 +117,8 @@ struct WrapFunction_<CastPolicy::fp32_append_dtype, device_type, Redispatch, F,
static Ret call(Args... args) {
c10::impl::ExcludeDispatchKeyGuard no_autocast(
get_autocast_dispatch_key_from_device_type(device_type));
at::ScalarType out_type = type_from_firstarg(at::kFloat, args...);
at::ScalarType out_type =
type_from_firstarg(device_type, at::kFloat, args...);
return (*F)(args..., out_type);
}
};
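
The added `device_type` argument mirrors the torch 2.1 autocast helper, which resolves eligibility per device. Below is a simplified, hypothetical stand-in for that decision (the helper name and logic are illustrative, not the torch API): take the dtype of the first eligible floating-point tensor on the target device, otherwise fall back to the appended dtype.

```cpp
#include <ATen/ATen.h>
#include <iostream>

// Illustrative only: roughly the decision the wrapper delegates to
// type_from_firstarg(device_type, at::kFloat, args...).
static at::ScalarType dtype_from_firstarg(at::DeviceType device_type,
                                          at::ScalarType fallback,
                                          const at::Tensor& first) {
  const bool eligible =
      first.is_floating_point() && first.device().type() == device_type;
  return eligible ? first.scalar_type() : fallback;
}

int main() {
  at::Tensor t = at::ones({2}, at::kHalf);  // CPU half tensor
  auto out_type = dtype_from_firstarg(at::kCPU, at::kFloat, t);
  std::cout << (out_type == at::kHalf) << "\n";  // 1: dtype taken from the first arg
}
```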
1 change: 1 addition & 0 deletions dipu/torch_dipu/csrc_dipu/profiler/collection.cpp
@@ -309,6 +309,7 @@ void DIPUThreadLocalSubqueue::TorchOpStorage::materialize(
DIPUThreadLocalSubqueue::TorchOpStorage::OpList::correlationID(event),
time_converter(event->end_time_),
input_getter(),
input_getter(),
jit_stack(),
jit_module(),
extra_args(),
14 changes: 7 additions & 7 deletions dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp
@@ -42,7 +42,7 @@ using torch::autograd::profiler::KinetoEvent;
using torch::autograd::profiler::post_process_t;
using torch::autograd::profiler::ProfilerResult;
using torch::profiler::impl::ActiveProfilerType;
using torch::profiler::impl::dtypesToStr;
// using torch::profiler::impl::dtypesToStr;
using torch::profiler::impl::EventType;
using torch::profiler::impl::ExtraFields;
using torch::profiler::impl::op_input_t;
@@ -162,9 +162,9 @@ struct AddGenericMetadata : public MetadataBase {
addMetadata("Input Dims", shapesToStr(shapes_and_dtypes.first));
}

if (!shapes_and_dtypes.second.empty()) {
addMetadata("Input type", dtypesToStr(shapes_and_dtypes.second));
}
// if (!shapes_and_dtypes.second.empty()) {
// addMetadata("Input type", dtypesToStr(shapes_and_dtypes.second));
// }

if (config_ && !config_->experimental_config.performance_events.empty()) {
auto& event_names = config_->experimental_config.performance_events;
@@ -413,9 +413,9 @@ void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
.needsInputs(registration_state_ptr->config().report_input_shapes)
.scopes(scopes);

auto handle = c10::guts::if_constexpr<use_global_callback>(
[&] { return at::addGlobalCallback(recordFunctionCallback); },
[&] { return at::addThreadLocalCallback(recordFunctionCallback); });
auto handle = use_global_callback
? at::addGlobalCallback(recordFunctionCallback)
: at::addThreadLocalCallback(recordFunctionCallback);
registration_state_ptr->setCallbackHandle(handle);
}

5 changes: 3 additions & 2 deletions dipu/torch_dipu/csrc_dipu/profiler/profiler_python.cpp
@@ -148,8 +148,9 @@ class CallTypeHelper final {
template <size_t C, typename T, typename FunctorT, typename... Args>
static void map(T& t, FunctorT& f, Args&&... args) {
f(std::get<C>(t), args...);
c10::guts::if_constexpr<C + 1 < End>(
[&](auto _) { map<C + 1>(_(t), f, std::forward<Args>(args)...); });
if constexpr (C + 1 < End) {
map<C + 1>(t, f, std::forward<Args>(args)...);
}
}

public:
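
Same root cause as the registration macro: `c10::guts::if_constexpr` is gone in torch 2.1, so the compile-time recursion in `CallTypeHelper::map` is now written with plain C++17 `if constexpr`. A generic, self-contained version of the pattern (names here are illustrative), iterating a tuple with a functor:

```cpp
#include <cstddef>
#include <iostream>
#include <tuple>

// Apply f to every element of the tuple; the recursion stops at compile time
// once the index runs past the tuple size.
template <std::size_t C = 0, typename Tuple, typename Functor>
void for_each_element(Tuple& t, Functor& f) {
  f(std::get<C>(t));
  if constexpr (C + 1 < std::tuple_size<Tuple>::value) {
    for_each_element<C + 1>(t, f);
  }
}

int main() {
  std::tuple<int, double> values{1, 2.5};
  auto print = [](const auto& v) { std::cout << v << '\n'; };
  for_each_element(values, print);  // prints 1 then 2.5
}
```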
dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp
@@ -1,8 +1,9 @@
// Copyright (c) 2023, DeepLink.

Check notice on line 1 (GitHub Actions / clang-format): File dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp does not conform to Custom style guidelines. (lines 3)

#include "DIPURawAllocator.h"

#include <mutex>
#include <map>
#include <unordered_set>
#include <utility>

3 changes: 2 additions & 1 deletion dipu/torch_dipu/csrc_dipu/runtime/distributed/c10dOps.cpp
@@ -62,7 +62,8 @@ std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> broadcast_dipu_(
std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> allreduce_dipu_(
at::TensorList tensors,
const c10::intrusive_ptr<ProcessGroup>& process_group,
const c10::intrusive_ptr<ReduceOp>& reduce_op, int64_t timeout) {
const c10::intrusive_ptr<ReduceOp>& reduce_op,
const c10::optional<at::Tensor>& sparse_indices, int64_t timeout) {
auto tensor_vec = tensors.vec();
auto work =
process_group->getBackend(dipu::DIPU_DEVICE_TYPE)
dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp
@@ -1,11 +1,11 @@
// Copyright (c) 2023, DeepLink.

Check notice on line 1 (GitHub Actions / clang-format): File dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp does not conform to Custom style guidelines. (lines 8)
#include <ATen/Utils.h>

#include <csrc_dipu/runtime/core/DIPUGeneratorImpl.h>

namespace dipu {

static const size_t states_size = 200 * sizeof(4120);
static const size_t states_size = 0; // 200 * sizeof(4120);
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = states_size + seed_size + offset_size;
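
With `states_size` forced to 0, the serialized generator state shrinks to the 64-bit seed plus the 64-bit Philox offset, matching the torch 2.1 CUDA generator, which no longer carries the legacy states blob. A tiny sketch of the resulting layout arithmetic:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const std::size_t states_size = 0;                  // was 200 * sizeof(4120)
  const std::size_t seed_size = sizeof(std::uint64_t);
  const std::size_t offset_size = sizeof(std::int64_t);
  const std::size_t total_size = states_size + seed_size + offset_size;
  std::printf("serialized state bytes: %zu\n", total_size);  // 16: seed + philox offset
}
```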
@@ -44,27 +44,25 @@ class DIPUCUDAAllocatorProxy : public CUDAAllocator {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual SnapshotInfo snapshot() override { DIPU_PATCH_CUDA_ALLOCATOR(); }
virtual void notifyCaptureBegin(int device, CaptureId_t graph_id,
MempoolId_t mempool_id) override {
void notifyCaptureBegin(int device, CaptureId_t graph_id,
MempoolId_t mempool_id) {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual void notifyCaptureAboutToEnd(int device,
CaptureId_t graph_id) override {
void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual void notifyCaptureEnded(int device, CaptureId_t graph_id) override {
void notifyCaptureEnded(int device, CaptureId_t graph_id) {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual void notifyCaptureDestroy(int device,
MempoolId_t mempool_id) override {
void notifyCaptureDestroy(int device, MempoolId_t mempool_id) {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) override {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual void recordHistory(bool enabled, CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
bool alloc_trace_record_context) override {
void recordHistory(bool enabled, CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
bool alloc_trace_record_context) {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual void attachOutOfMemoryObserver(
@@ -95,7 +93,7 @@ class DIPUCUDAAllocatorProxy : public CUDAAllocator {

virtual void emptyCache() override { dipu::emptyCachedMem(); }

virtual bool needsPoolSpecificPeerAccess() override {
bool needsPoolSpecificPeerAccess() {
// DIPU_PATCH_CUDA_ALLOCATOR();
return false;
}
@@ -107,6 +105,26 @@ class DIPUCUDAAllocatorProxy : public CUDAAllocator {
c10::Device(c10::DeviceType::CUDA, data_ptr.device().index()));
return data_ptr;
}

void beginAllocateStreamToPool(int device, cudaStream_t stream,
MempoolId_t mempool_id) {}
void endAllocateStreamToPool(int device, cudaStream_t stream) {}

void recordHistory(bool enabled, CreateContextFn context_recorder,
size_t alloc_trace_max_entries, RecordContext when) {}
void releasePool(int device, MempoolId_t mempool_id) {}

void enablePeerAccess(int dev, int dev_to_access) {}

cudaError_t memcpyAsync(void* dst, int dstDevice, const void* src,
                        int srcDevice, size_t count, cudaStream_t stream,
                        bool p2p_enabled) {
  return cudaSuccess;  // stub: a non-void function must return a value
}
std::shared_ptr<AllocatorState> getCheckpointState(int device,
                                                   MempoolId_t id) {
  return nullptr;  // stub
}
CheckpointDelta setCheckpointPoolState(int device,
std::shared_ptr<AllocatorState> pps) {
return CheckpointDelta();
}
};

} // namespace CUDACachingAllocator
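
The allocator proxy edits above follow the shape of torch 2.1's `CUDAAllocator` interface: methods that are no longer virtual upstream drop `virtual`/`override` and stay as plain members, while newly introduced pure-virtual methods get stub bodies so `DIPUCUDAAllocatorProxy` remains a concrete class. A toy version of that adaptation pattern (the interface below is invented for illustration, not the real `CUDAAllocator`):

```cpp
#include <iostream>

struct InterfaceV21 {                    // stand-in for the upstream 2.1 base class
  virtual ~InterfaceV21() = default;
  virtual void empty_cache() = 0;        // still part of the interface
  virtual void record_history_v2() = 0;  // newly added pure virtual
  // notify_capture_begin() existed in the old interface but is gone here.
};

struct Proxy : InterfaceV21 {
  void empty_cache() override { std::cout << "empty cache\n"; }
  void record_history_v2() override {}  // new pure virtual: stubbed out
  void notify_capture_begin() {}        // no longer virtual upstream: kept as a
                                        // plain member, without `override`
};

int main() {
  Proxy p;
  p.empty_cache();
}
```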