Fdy/support torch21 #504

Closed · wants to merge 2 commits
2 changes: 2 additions & 0 deletions dipu/SupportedDiopiFunctions.txt
@@ -48,6 +48,8 @@ diopiCastDtype
diopiCat
diopiCdist
diopiCdistBackward
diopiCeil
diopiCeilInp
diopiClamp
diopiClampInp
diopiClampInpScalar
8 changes: 4 additions & 4 deletions dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -643,18 +643,18 @@
::diopiSize_t diopi_size = toDiopiSize(dim);
interface: diopiMean(ctx, out, self_dtype_diopi, diopi_size);

- schema: "std.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor"
- schema: "std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor"
custom_code_at_the_beginning: |
std::vector<int64_t> output_shape = infer_reduce_op_shape(self.sizes(), dim.value_or(std::vector<int64_t>()), keepdim);
auto out = at::empty(output_shape, self.options());
bool unbiased = correction.value_or(1) == 1;
bool unbiased = correction.value_or(1).toLong() == 1;
::diopiSize_t diopi_size = toDiopiSize(dim);
interface: diopiStd(ctx, out, self, diopi_size, unbiased);

- schema: "std.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)"
- schema: "std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)"
custom_code_at_the_beginning: |
::diopiSize_t diopi_size = toDiopiSize(dim);
bool unbiased = correction.value_or(1) == 1;
bool unbiased = correction.value_or(1).toLong() == 1;
interface: diopiStd(ctx, out, self, diopi_size, unbiased);

- schema: "linear_backward(Tensor input, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
5 changes: 3 additions & 2 deletions dipu/tests/python/individual_scripts/test_op_benchmark.py
@@ -58,8 +58,9 @@ def batched_dot_bmm(a, b):
# description is the column
label = "Batched dot"
sub_label = f"[{b}, {n}]"
x = torch.ones((b, n))
for num_threads in [1, 4, 16, 32]:
x = torch.ones((b, n)).cuda()
# CUDA tensor; the actual case does not dispatch with so many threads (dropped 16, 32)
for num_threads in [1, 4]:
results.append(
benchmark.Timer(
stmt="batched_dot_mul_sum(x, x)",
17 changes: 12 additions & 5 deletions dipu/tests/run_nv_tests.sh
@@ -7,8 +7,15 @@ function run_dipu_tests {
unset DIPU_DUMP_OP_ARGS
export PYTHONPATH=${DIPU_ROOT}/../:${PYTHONPATH}
${CDIR}/python/run_tests.sh
echo "fill_.Scalar" >> .dipu_force_fallback_op_list.config
run_test "${PYTORCH_DIR}/test/test_tensor_creation_ops.py" "$@" -v -f TestTensorCreationDIPU # --locals -f
echo "" > .dipu_force_fallback_op_list.config
# run_test "${PYTORCH_DIR}/test/test_reductions.py" "$@" -v -f TestReductionsDIPU

run_test "${PYTORCH_TEST_DIR}/test/nn/test_convolution.py" -v TestConvolutionNNDeviceTypeDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_linalg.py" "$@" -v TestLinalgDIPU

# mocking CUDA causes a test-count mismatch; temporarily ignored
# run_test "${PYTORCH_TEST_DIR}/test/test_testing.py" "$@" -v TestTestParametrizationDeviceTypeDIPU TestTestingDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_type_hints.py" "$@" -v
run_test "${PYTORCH_TEST_DIR}/test/test_type_info.py" "$@" -v
@@ -17,14 +24,14 @@ function run_dipu_tests {
# run_test "${PYTORCH_TEST_DIR}/test/test_binary_ufuncs.py" "$@" -v TestBinaryUfuncsDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_torch.py" "$@" -v TestTorchDeviceTypeDIPU #--subprocess
#run_test "${PYTORCH_TEST_DIR}/test/test_indexing.py" "$@" -v TestIndexingDIPU
#run_test "${PYTORCH_TEST_DIR}/test/test_indexing.py" "$@" -v NumpyTestsDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_view_ops.py" "$@" -v TestViewOpsDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_indexing.py" "$@" -v NumpyTestsDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_view_ops.py" "$@" -v TestViewOpsDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_type_promotion.py" "$@" -v TestTypePromotionDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_nn.py" "$@" -v TestNN
# run_test "${PYTORCH_TEST_DIR}/test/test_ops_fwd_gradients.py" "$@" -v TestFwdGradientsDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_ops_gradients.py" "$@" -v TestBwdGradientsDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_ops_fwd_gradients.py" "$@" -v TestFwdGradientsDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_ops_gradients.py" "$@" -v TestBwdGradientsDIPU
# run_test "${PYTORCH_TEST_DIR}/test/test_ops.py" "$@" -v
# run_test "${PYTORCH_TEST_DIR}/test/test_shape_ops.py" "$@" -v TestShapeOpsDIPU
run_test "${PYTORCH_TEST_DIR}/test/test_shape_ops.py" "$@" -v TestShapeOpsDIPU
}

if [ "$LOGFILE" != "" ]; then
2 changes: 1 addition & 1 deletion dipu/torch_dipu/csrc_dipu/CMakeLists.txt
@@ -1,7 +1,7 @@
set(DIPU_LIB torch_dipu)
set(DIPU_PYTHON_LIB torch_dipu_python)
set(DIPU_AUTOGENED_KERNELS torch_dipu_autogened_kernels)

add_definitions(-std=c++17)
# python path
include_directories(SYSTEM ${PYTHON_INCLUDE_DIR})

26 changes: 12 additions & 14 deletions dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.hpp
@@ -2,6 +2,8 @@
#pragma once

#include <deque>
#include <fstream>
#include <iostream>
#include <mutex>

#include <torch/library.h>
@@ -110,20 +112,16 @@ namespace {

#define DIPU_LIBRARY_IMPL(ns, k, m) _DIPU_LIBRARY_IMPL(ns, k, m, C10_UID)

#define _DIPU_LIBRARY_IMPL(ns, k, m, uid) \
static void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
uid)(torch::Library&); \
static const ::at::DIPUOpRegister C10_CONCATENATE( \
DIPU_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)( \
c10::guts::if_constexpr<c10::impl::dispatch_key_allowlist_check( \
c10::DispatchKey::k)>( \
[]() { \
return &C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
uid); \
}, \
[]() { return [](torch::Library&) -> void {}; }), \
#ns, c10::make_optional(c10::DispatchKey::k), __FILE__, __LINE__); \
void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
#define _DIPU_LIBRARY_IMPL(ns, k, m, uid) \
static void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
uid)(torch::Library&); \
static const ::at::DIPUOpRegister C10_CONCATENATE( \
DIPU_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)( \
(c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::k) \
? &C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, uid) \
: [](torch::Library&) -> void {}), \
#ns, c10::make_optional(c10::DispatchKey::k), __FILE__, __LINE__); \
void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
uid)(torch::Library & m)

} // namespace
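
The rewrite of `_DIPU_LIBRARY_IMPL` above exists because `c10::guts::if_constexpr` is no longer available in torch 2.1; the macro now chooses between the real init function and a no-op lambda with an ordinary conditional expression (the captureless lambda decays to the same function-pointer type). A self-contained sketch of that selection pattern, using a toy `InitFn` instead of `torch::Library`:

```cpp
#include <iostream>

using InitFn = void (*)(int&);

static void real_init(int& x) { x = 42; }

// Pick either the real initializer or a do-nothing lambda at registration time;
// both branches have (or convert to) the same function-pointer type.
static InitFn pick_init(bool allowed) {
  return allowed ? &real_init : [](int&) -> void {};
}

int main() {
  int value = 0;
  pick_init(true)(value);
  std::cout << value << '\n';  // 42
}
```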
dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp
@@ -1,4 +1,4 @@
// Copyright (c) 2023, DeepLink.

Check notice on line 1 (GitHub Actions / clang-format): File dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp does not conform to Custom style guidelines. (lines 99)
//
// This file contains definitions of custom fallback functions needed by AMP
// GradScaler. The corresponding declarations can be found in
@@ -87,16 +87,17 @@
"found_inf must be a float tensor.");
if (static_cast<bool>(found_inf.item<float>())) {
current_scale *= backoff_factor;
growth_tracker[0] = 0;
growth_tracker.fill_(c10::Scalar(0));
} else {
// Entering this branch means we just carried out a successful step,
// so growth_tracker is incremented before comparing to growth_interval.
auto successful = growth_tracker.item<int>() + 1;
if (successful == growth_interval) {
current_scale *= growth_factor;
growth_tracker[0] = 0;
growth_tracker.fill_(c10::Scalar(0));
} else {
growth_tracker[0] = successful;
// growth_tracker is a 0-dim (scalar) tensor in torch 2.1; in 2.0 it was a size-1 tensor.
growth_tracker.fill_(c10::Scalar(successful));
}
}
return current_scale;
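
The move from `growth_tracker[0] = ...` to `growth_tracker.fill_(...)` follows the comment in the diff: torch 2.1's GradScaler keeps `growth_tracker` as a 0-dim tensor, and `operator[]` (a `select`) cannot be applied to a 0-dim tensor, whereas `fill_` works for both layouts. A small illustration, assuming ATen is available:

```cpp
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor tracker_20 = at::zeros({1}, at::kInt);        // torch 2.0 layout: size-1 tensor
  at::Tensor tracker_21 = at::scalar_tensor(0, at::kInt);  // torch 2.1 layout: 0-dim tensor

  // tracker_21[0] = 7;  // would throw: select() cannot be applied to a 0-dim tensor
  tracker_20.fill_(c10::Scalar(7));  // fill_ handles both shapes
  tracker_21.fill_(c10::Scalar(7));

  std::cout << tracker_20.item<int>() << " " << tracker_21.item<int>() << "\n";  // 7 7
}
```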
3 changes: 2 additions & 1 deletion dipu/torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp
@@ -117,7 +117,8 @@ struct WrapFunction_<CastPolicy::fp32_append_dtype, device_type, Redispatch, F,
static Ret call(Args... args) {
c10::impl::ExcludeDispatchKeyGuard no_autocast(
get_autocast_dispatch_key_from_device_type(device_type));
at::ScalarType out_type = type_from_firstarg(at::kFloat, args...);
at::ScalarType out_type =
type_from_firstarg(device_type, at::kFloat, args...);
return (*F)(args..., out_type);
}
};
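
The added `device_type` argument mirrors the torch 2.1 autocast helper, which resolves eligibility per device. Below is a simplified, hypothetical stand-in for that decision (the helper name and logic are illustrative, not the torch API): take the dtype of the first eligible floating-point tensor on the target device, otherwise fall back to the appended dtype.

```cpp
#include <ATen/ATen.h>
#include <iostream>

// Illustrative only: roughly the decision the wrapper delegates to
// type_from_firstarg(device_type, at::kFloat, args...).
static at::ScalarType dtype_from_firstarg(at::DeviceType device_type,
                                          at::ScalarType fallback,
                                          const at::Tensor& first) {
  const bool eligible =
      first.is_floating_point() && first.device().type() == device_type;
  return eligible ? first.scalar_type() : fallback;
}

int main() {
  at::Tensor t = at::ones({2}, at::kHalf);  // CPU half tensor
  auto out_type = dtype_from_firstarg(at::kCPU, at::kFloat, t);
  std::cout << (out_type == at::kHalf) << "\n";  // 1: dtype taken from the first arg
}
```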
1 change: 1 addition & 0 deletions dipu/torch_dipu/csrc_dipu/profiler/collection.cpp
@@ -309,6 +309,7 @@ void DIPUThreadLocalSubqueue::TorchOpStorage::materialize(
DIPUThreadLocalSubqueue::TorchOpStorage::OpList::correlationID(event),
time_converter(event->end_time_),
input_getter(),
input_getter(),
jit_stack(),
jit_module(),
extra_args(),
14 changes: 7 additions & 7 deletions dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp
@@ -42,7 +42,7 @@ using torch::autograd::profiler::KinetoEvent;
using torch::autograd::profiler::post_process_t;
using torch::autograd::profiler::ProfilerResult;
using torch::profiler::impl::ActiveProfilerType;
using torch::profiler::impl::dtypesToStr;
// using torch::profiler::impl::dtypesToStr;
using torch::profiler::impl::EventType;
using torch::profiler::impl::ExtraFields;
using torch::profiler::impl::op_input_t;
@@ -162,9 +162,9 @@ struct AddGenericMetadata : public MetadataBase {
addMetadata("Input Dims", shapesToStr(shapes_and_dtypes.first));
}

if (!shapes_and_dtypes.second.empty()) {
addMetadata("Input type", dtypesToStr(shapes_and_dtypes.second));
}
// if (!shapes_and_dtypes.second.empty()) {
// addMetadata("Input type", dtypesToStr(shapes_and_dtypes.second));
// }

if (config_ && !config_->experimental_config.performance_events.empty()) {
auto& event_names = config_->experimental_config.performance_events;
@@ -413,9 +413,9 @@ void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
.needsInputs(registration_state_ptr->config().report_input_shapes)
.scopes(scopes);

auto handle = c10::guts::if_constexpr<use_global_callback>(
[&] { return at::addGlobalCallback(recordFunctionCallback); },
[&] { return at::addThreadLocalCallback(recordFunctionCallback); });
auto handle = use_global_callback
? at::addGlobalCallback(recordFunctionCallback)
: at::addThreadLocalCallback(recordFunctionCallback);
registration_state_ptr->setCallbackHandle(handle);
}

5 changes: 3 additions & 2 deletions dipu/torch_dipu/csrc_dipu/profiler/profiler_python.cpp
@@ -148,8 +148,9 @@ class CallTypeHelper final {
template <size_t C, typename T, typename FunctorT, typename... Args>
static void map(T& t, FunctorT& f, Args&&... args) {
f(std::get<C>(t), args...);
c10::guts::if_constexpr<C + 1 < End>(
[&](auto _) { map<C + 1>(_(t), f, std::forward<Args>(args)...); });
if constexpr (C + 1 < End) {
map<C + 1>(t, f, std::forward<Args>(args)...);
}
}

public:
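
Same root cause as the registration macro: `c10::guts::if_constexpr` is gone in torch 2.1, so the compile-time recursion in `CallTypeHelper::map` is now written with plain C++17 `if constexpr`. A generic, self-contained version of the pattern (names here are illustrative), iterating a tuple with a functor:

```cpp
#include <cstddef>
#include <iostream>
#include <tuple>

// Apply f to every element of the tuple; the recursion stops at compile time
// once the index runs past the tuple size.
template <std::size_t C = 0, typename Tuple, typename Functor>
void for_each_element(Tuple& t, Functor& f) {
  f(std::get<C>(t));
  if constexpr (C + 1 < std::tuple_size<Tuple>::value) {
    for_each_element<C + 1>(t, f);
  }
}

int main() {
  std::tuple<int, double> values{1, 2.5};
  auto print = [](const auto& v) { std::cout << v << '\n'; };
  for_each_element(values, print);  // prints 1 then 2.5
}
```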
dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp
@@ -1,8 +1,9 @@
// Copyright (c) 2023, DeepLink.

Check notice on line 1 (GitHub Actions / clang-format): File dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp does not conform to Custom style guidelines. (lines 3)

#include "DIPURawAllocator.h"

#include <mutex>
#include <map>
#include <unordered_set>
#include <utility>

3 changes: 2 additions & 1 deletion dipu/torch_dipu/csrc_dipu/runtime/distributed/c10dOps.cpp
@@ -62,7 +62,8 @@ std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> broadcast_dipu_(
std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> allreduce_dipu_(
at::TensorList tensors,
const c10::intrusive_ptr<ProcessGroup>& process_group,
const c10::intrusive_ptr<ReduceOp>& reduce_op, int64_t timeout) {
const c10::intrusive_ptr<ReduceOp>& reduce_op,
const c10::optional<at::Tensor>& sparse_indices, int64_t timeout) {
auto tensor_vec = tensors.vec();
auto work =
process_group->getBackend(dipu::DIPU_DEVICE_TYPE)
dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp
@@ -1,11 +1,11 @@
// Copyright (c) 2023, DeepLink.

Check notice on line 1 (GitHub Actions / clang-format): File dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp does not conform to Custom style guidelines. (lines 8)
#include <ATen/Utils.h>

#include <csrc_dipu/runtime/core/DIPUGeneratorImpl.h>

namespace dipu {

static const size_t states_size = 200 * sizeof(4120);
static const size_t states_size = 0; // 200 * sizeof(4120);
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = states_size + seed_size + offset_size;
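
With `states_size` forced to 0, the serialized generator state shrinks to the 64-bit seed plus the 64-bit Philox offset, matching the torch 2.1 CUDA generator, which no longer carries the legacy states blob. A tiny sketch of the resulting layout arithmetic:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const std::size_t states_size = 0;                  // was 200 * sizeof(4120)
  const std::size_t seed_size = sizeof(std::uint64_t);
  const std::size_t offset_size = sizeof(std::int64_t);
  const std::size_t total_size = states_size + seed_size + offset_size;
  std::printf("serialized state bytes: %zu\n", total_size);  // 16: seed + philox offset
}
```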
@@ -44,27 +44,25 @@ class DIPUCUDAAllocatorProxy : public CUDAAllocator {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual SnapshotInfo snapshot() override { DIPU_PATCH_CUDA_ALLOCATOR(); }
virtual void notifyCaptureBegin(int device, CaptureId_t graph_id,
MempoolId_t mempool_id) override {
void notifyCaptureBegin(int device, CaptureId_t graph_id,
MempoolId_t mempool_id) {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual void notifyCaptureAboutToEnd(int device,
CaptureId_t graph_id) override {
void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual void notifyCaptureEnded(int device, CaptureId_t graph_id) override {
void notifyCaptureEnded(int device, CaptureId_t graph_id) {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual void notifyCaptureDestroy(int device,
MempoolId_t mempool_id) override {
void notifyCaptureDestroy(int device, MempoolId_t mempool_id) {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) override {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual void recordHistory(bool enabled, CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
bool alloc_trace_record_context) override {
void recordHistory(bool enabled, CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
bool alloc_trace_record_context) {
DIPU_PATCH_CUDA_ALLOCATOR();
}
virtual void attachOutOfMemoryObserver(
@@ -95,7 +93,7 @@ class DIPUCUDAAllocatorProxy : public CUDAAllocator {

virtual void emptyCache() override { dipu::emptyCachedMem(); }

virtual bool needsPoolSpecificPeerAccess() override {
bool needsPoolSpecificPeerAccess() {
// DIPU_PATCH_CUDA_ALLOCATOR();
return false;
}
@@ -107,6 +105,26 @@ class DIPUCUDAAllocatorProxy : public CUDAAllocator {
c10::Device(c10::DeviceType::CUDA, data_ptr.device().index()));
return data_ptr;
}

void beginAllocateStreamToPool(int device, cudaStream_t stream,
MempoolId_t mempool_id) {}
void endAllocateStreamToPool(int device, cudaStream_t stream) {}

void recordHistory(bool enabled, CreateContextFn context_recorder,
size_t alloc_trace_max_entries, RecordContext when) {}
void releasePool(int device, MempoolId_t mempool_id) {}

void enablePeerAccess(int dev, int dev_to_access) {}

cudaError_t memcpyAsync(void* dst, int dstDevice, const void* src,
                        int srcDevice, size_t count, cudaStream_t stream,
                        bool p2p_enabled) {
  return cudaSuccess;  // stub: a non-void function must return a value
}
std::shared_ptr<AllocatorState> getCheckpointState(int device,
                                                   MempoolId_t id) {
  return nullptr;  // stub
}
CheckpointDelta setCheckpointPoolState(int device,
std::shared_ptr<AllocatorState> pps) {
return CheckpointDelta();
}
};

} // namespace CUDACachingAllocator
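
The allocator proxy edits above follow the shape of torch 2.1's `CUDAAllocator` interface: methods that are no longer virtual upstream drop `virtual`/`override` and stay as plain members, while newly introduced pure-virtual methods get stub bodies so `DIPUCUDAAllocatorProxy` remains a concrete class. A toy version of that adaptation pattern (the interface below is invented for illustration, not the real `CUDAAllocator`):

```cpp
#include <iostream>

struct InterfaceV21 {                    // stand-in for the upstream 2.1 base class
  virtual ~InterfaceV21() = default;
  virtual void empty_cache() = 0;        // still part of the interface
  virtual void record_history_v2() = 0;  // newly added pure virtual
  // notify_capture_begin() existed in the old interface but is gone here.
};

struct Proxy : InterfaceV21 {
  void empty_cache() override { std::cout << "empty cache\n"; }
  void record_history_v2() override {}  // new pure virtual: stubbed out
  void notify_capture_begin() {}        // no longer virtual upstream: kept as a
                                        // plain member, without `override`
};

int main() {
  Proxy p;
  p.empty_cache();
}
```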