From e575fedaefdbb14733dcfc69076bfefe70b32728 Mon Sep 17 00:00:00 2001
From: wiryls <7984500+wiryls@users.noreply.github.com>
Date: Tue, 14 Nov 2023 16:49:18 +0800
Subject: [PATCH 1/3] refactor: format almost all code

---
 .clang-format                                 |   3 +
 .github/workflows/format.yml                  |  22 +
 dicp/.clang-format                            |   3 +
 dipu/.clang-format                            |  35 +
 dipu/tests/cpp/test_relu.cpp                  |  25 +-
 dipu/tests/cpp/test_tensor_add.cpp            |  22 +-
 dipu/tests/cpp/testrt.cpp                     |   9 +-
 dipu/third_party/.clang-format                |   3 +
 .../torch_dipu/csrc_dipu/aten/CPUFallback.cpp | 231 ++---
 .../csrc_dipu/aten/DIPUATenFunctions.h        |  85 +-
 .../csrc_dipu/aten/RegisterDIPU.cpp           | 465 +++++-----
 .../csrc_dipu/aten/RegisterDIPU.hpp           | 191 +++--
 .../csrc_dipu/aten/ops/CopyKernel.cpp         | 315 +++----
 .../aten/ops/CustomFallbackFunctions.hpp      | 153 ++--
 ...ustomFallbackFunctionsForAmpGradScaler.cpp |  24 +-
 .../torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp |  10 +-
 .../csrc_dipu/aten/ops/EmptyOpsKernel.cpp     | 125 +--
 .../torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp | 337 ++++----
 .../csrc_dipu/aten/ops/PinMemoryKernel.cpp    |  19 +-
 .../csrc_dipu/aten/ops/StorageShapeKernel.cpp | 187 ++--
 .../torch_dipu/csrc_dipu/base/DIPUGlobals.cpp |  50 +-
 dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.h  |   2 +-
 dipu/torch_dipu/csrc_dipu/base/basedef.h      |  21 +-
 .../torch_dipu/csrc_dipu/binding/DIPUpybind.h | 118 +--
 .../csrc_dipu/binding/ExportProfiler.cpp      |  14 +-
 .../torch_dipu/csrc_dipu/binding/ExportRT.cpp | 388 +++++----
 .../csrc_dipu/binding/ExportTensor.cpp        |  94 +-
 dipu/torch_dipu/csrc_dipu/binding/exportapi.h |   7 +-
 .../csrc_dipu/binding/patchCsrcDevice.cpp     |  68 +-
 dipu/torch_dipu/csrc_dipu/common.h            |   2 -
 .../csrc_dipu/diopirt/diopi_helper.cpp        | 235 ++---
 .../csrc_dipu/diopirt/diopirt_impl.cpp        | 205 +++--
 .../csrc_dipu/diopirt/diopirt_impl.h          |  34 +-
 .../profiler/CorrelationIDManager.cpp         |  26 +-
 .../csrc_dipu/profiler/CorrelationIDManager.h |  45 +-
 .../csrc_dipu/profiler/DIPUDeviceActivity.cpp |  34 +-
 .../csrc_dipu/profiler/DIPUDeviceActivity.h   |  67 +-
 .../csrc_dipu/profiler/collection.cpp         | 431 +++++-----
 .../csrc_dipu/profiler/collection.h           | 125 ++-
 dipu/torch_dipu/csrc_dipu/profiler/patch.cpp  | 254 +++---
 .../csrc_dipu/profiler/profiler.cpp           | 608 +++++------
 dipu/torch_dipu/csrc_dipu/profiler/profiler.h | 226 ++---
 .../csrc_dipu/profiler/profiler_kineto.cpp    | 290 +++----
 .../csrc_dipu/profiler/profiler_kineto.h      |  16 +-
 .../csrc_dipu/profiler/profiler_python.cpp    | 538 ++++++------
 .../csrc_dipu/profiler/profiler_python.h      |   7 +-
 .../runtime/core/DIPUCopyInplace.cpp          |  70 +-
 .../csrc_dipu/runtime/core/DIPUCopyInplace.h  |  25 +-
 .../csrc_dipu/runtime/core/DIPUDeviceInfo.cpp |  19 +-
 .../csrc_dipu/runtime/core/DIPUDeviceInfo.h   |   6 +-
 .../csrc_dipu/runtime/core/DIPUEvent.h        |  53 +-
 .../csrc_dipu/runtime/core/DIPUEventPool.cpp  | 171 ++--
 .../csrc_dipu/runtime/core/DIPUEventPool.h    |   5 +-
 .../runtime/core/DIPUGeneratorImpl.cpp        |  48 +-
 .../runtime/core/DIPUGeneratorImpl.h          |  18 +-
 .../csrc_dipu/runtime/core/DIPUGuard.h        |  16 +-
 .../csrc_dipu/runtime/core/DIPUStream.cpp     |  79 +-
 .../csrc_dipu/runtime/core/DIPUStream.h       |  58 +-
 .../csrc_dipu/runtime/core/MemChecker.cpp     |  49 +-
 .../csrc_dipu/runtime/core/MemChecker.h       |  21 +-
 .../core/allocator/DIPUAsyncResourcePool.h    |  84 +-
 .../core/allocator/DIPUBFCachingAllocator.cpp | 811 +++++++++---------
 .../core/allocator/DIPUBSCachingAllocator.cpp | 158 ++--
 .../core/allocator/DIPUCachingAllocator.cpp   | 158 ++--
 .../core/allocator/DIPUCachingAllocator.h     | 232 +++--
 .../core/allocator/DIPURawAllocator.cpp       |  96 ++-
 .../runtime/core/allocator/DIPURawAllocator.h |  44 +-
 .../allocator/DIPURawCachingAllocator.cpp     |  68 +-
 .../runtime/core/allocator/DIPUSpinMutex.h    |  41 +-
 .../runtime/core/guardimpl/DIPUGuardImpl.cpp  |   6 +-
 .../runtime/core/guardimpl/DIPUGuardImpl.h    |  55 +-
 .../csrc_dipu/runtime/device/basedef.h        |  59 +-
 .../csrc_dipu/runtime/device/deviceapis.h     |  42 +-
 .../csrc_dipu/runtime/device/diclapis.h       |  73 +-
 .../runtime/devproxy/deviceproxy.cpp          |  87 +-
 .../csrc_dipu/runtime/devproxy/deviceproxy.h  |  41 +-
 .../csrc_dipu/runtime/devproxy/diclproxy.cpp  | 133 +--
 .../csrc_dipu/runtime/devproxy/diclproxy.h    |  62 +-
 .../runtime/distributed/DICLUtils.hpp         |  37 +-
 .../runtime/distributed/ProcessGroupDICL.cpp  | 605 +++++------
 .../runtime/distributed/ProcessGroupDICL.h    | 167 ++--
 .../csrc_dipu/runtime/distributed/c10dOps.cpp | 151 ++--
 dipu/torch_dipu/csrc_dipu/runtime/rthelper.h  |  10 +-
 dipu/torch_dipu/csrc_dipu/stub.cpp            |  22 +-
 dipu/torch_dipu/csrc_dipu/utils/Log.h         |  12 +-
 dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp  |  17 +-
 dipu/torch_dipu/csrc_dipu/utils/helpfunc.hpp  |  22 +-
 .../vendor/ascend/AscendGeneratorImpl.cpp     |  28 +-
 .../csrc_dipu/vendor/ascend/basecommimpl.hpp  |  31 +-
 .../vendor/ascend/communicatorimpl.cpp        | 103 ++-
 .../csrc_dipu/vendor/ascend/deviceimpl.cpp    | 101 ++-
 .../csrc_dipu/vendor/ascend/vendorapi.h       |  48 +-
 .../vendor/camb/CambGeneratorImpl.cpp         |  40 +-
 .../csrc_dipu/vendor/camb/basecommimpl.hpp    | 106 ++-
 .../csrc_dipu/vendor/camb/basedeviceimpl.hpp  | 131 ++-
 .../vendor/camb/cnrt_5.x/deviceimpl.cpp       |  22 +-
 .../vendor/camb/cnrt_6.x/communiatorimpl.cpp  | 209 ++---
 .../vendor/camb/cnrt_6.x/deviceimpl.cpp       |  61 +-
 .../csrc_dipu/vendor/camb/vendorapi.h         |  45 +-
 .../csrc_dipu/vendor/cuda/CUDACopyInplace.cpp |  39 +-
 .../vendor/cuda/CudaGeneratorImpl.cpp         |  26 +-
 .../csrc_dipu/vendor/cuda/communiatorimpl.cpp | 220 ++---
 .../csrc_dipu/vendor/cuda/deviceimpl.cpp      | 231 +++--
 .../cuda/patch/DIPUPatchCudaAllocator.cpp     | 132 ++-
 .../vendor/cuda/patch/wrapperRegister.cpp     |   2 +-
 .../csrc_dipu/vendor/cuda/vendorapi.h         |  18 +-
 .../vendor/droplet/DropletGeneratorImpl.cpp   |  19 +-
 .../vendor/droplet/communicatorimpl.cpp       | 302 ++++---
 .../csrc_dipu/vendor/droplet/deviceimpl.cpp   | 215 +++--
 .../csrc_dipu/vendor/droplet/vendorapi.h      |  35 +-
 .../csrc_dipu/vendor/supa/commimpl.cpp        | 106 ++-
 .../csrc_dipu/vendor/supa/copyinplace.cpp     |  42 +-
 .../csrc_dipu/vendor/supa/deviceimpl.cpp      |  81 +-
 .../csrc_dipu/vendor/supa/generatorimpl.cpp   |  25 +-
 .../csrc_dipu/vendor/supa/vendorapi.h         |   5 +-
 .../vendor/topsrider/TopsGeneratorImpl.cpp    |  62 +-
 .../vendor/topsrider/communiatorimpl.cpp      |  24 +-
 .../csrc_dipu/vendor/topsrider/deviceimpl.cpp | 219 +++-
 .../csrc_dipu/vendor/topsrider/vendorapi.h    |  18 +-
 119 files changed, 6605 insertions(+), 6241 deletions(-)
 create mode 100644 .clang-format
 create mode 100644 .github/workflows/format.yml
 create mode 100644 dicp/.clang-format
 create mode 100644 dipu/.clang-format
 create mode 100644 dipu/third_party/.clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..c3c4f3e1e
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,3 @@
+---
+Language: Cpp
+BasedOnStyle: Google
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
new file mode 100644
index 000000000..b68ab505c
--- /dev/null
+++ b/.github/workflows/format.yml
@@ -0,0 +1,22 @@
+name: code format
+
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+      - main
+
+jobs:
+  cpp-linter:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: cpp-linter/cpp-linter-action@v2
+        id: linter
+        with:
+          style: file
+          tidy-checks: '-*' # disable clang tidy at this stage
+      - name: Fail test
+        if: steps.linter.outputs.checks-failed > 0
+        run: echo "Some files failed the linting checks!" && exit 1
diff --git a/dicp/.clang-format b/dicp/.clang-format
new file mode 100644
index 000000000..a5121ff07
--- /dev/null
+++ b/dicp/.clang-format
@@ -0,0 +1,3 @@
+---
+DisableFormat: true
+SortIncludes: Never
diff --git a/dipu/.clang-format b/dipu/.clang-format
new file mode 100644
index 000000000..bf44edbbc
--- /dev/null
+++ b/dipu/.clang-format
@@ -0,0 +1,35 @@
+---
+BasedOnStyle: InheritParentConfig
+IncludeCategories:
+  - Regex: '^("|<)csrc_dipu/.*'
+    Priority: 9
+    SortPriority: 0
+    CaseSensitive: false
+  - Regex: '^("|<)diopi/.*'
+    Priority: 8
+    SortPriority: 0
+    CaseSensitive: false
+  - Regex: '^("|<)(c10|aten|torch).*'
+    Priority: 4
+    SortPriority: 0
+    CaseSensitive: false
+  - Regex: '^("|<)(pybind11|Python\.h|frameobject\.h).*'
+    Priority: 5
+    SortPriority: 0
+    CaseSensitive: false
+  - Regex: '^<((ext/.*)|pthread)\.h.*'
+    Priority: 2
+    SortPriority: 1
+    CaseSensitive: false
+  - Regex: '^("|<)(cuda|su|cn|(..?ccl)|(.*_runtime)).*\.h.*'
+    Priority: 3
+    SortPriority: 0
+    CaseSensitive: false
+  - Regex: '^<.*'
+    Priority: 2
+    SortPriority: 0
+    CaseSensitive: false
+  - Regex: '.*'
+    Priority: 10
+    SortPriority: 0
+    CaseSensitive: false
diff --git a/dipu/tests/cpp/test_relu.cpp b/dipu/tests/cpp/test_relu.cpp
index 924e42253..474d39b20 100644
--- a/dipu/tests/cpp/test_relu.cpp
+++ b/dipu/tests/cpp/test_relu.cpp
@@ -1,21 +1,22 @@
 // Copyright (c) 2023, DeepLink.
-#include
 #include
-void testTensorRelu(at::Tensor& self) {
-    std::cout << self << std::endl;
-    std::cout << torch::relu(self) << std::endl;
-    std::cout << self << std::endl;
+#include
+
+void testTensorRelu(at::Tensor &self) {
+  std::cout << self << std::endl;
+  std::cout << torch::relu(self) << std::endl;
+  std::cout << self << std::endl;
 
-    std::cout << torch::relu_(self) << std::endl;
-    std::cout << self << std::endl;
+  std::cout << torch::relu_(self) << std::endl;
+  std::cout << self << std::endl;
 }
 
 int main() {
-    torch::Tensor tensor = torch::randn(10).cuda();
-    testTensorRelu(tensor);
+  torch::Tensor tensor = torch::randn(10).cuda();
+  testTensorRelu(tensor);
 
-    torch::Tensor tensor_cpu = torch::randn(10);
-    testTensorRelu(tensor_cpu);
-    return 0;
+  torch::Tensor tensor_cpu = torch::randn(10);
+  testTensorRelu(tensor_cpu);
+  return 0;
 }
diff --git a/dipu/tests/cpp/test_tensor_add.cpp b/dipu/tests/cpp/test_tensor_add.cpp
index c4c01e56d..e8b7533a7 100644
--- a/dipu/tests/cpp/test_tensor_add.cpp
+++ b/dipu/tests/cpp/test_tensor_add.cpp
@@ -1,20 +1,20 @@
 // Copyright (c) 2023, DeepLink.
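A note on the new dipu/.clang-format above before the remaining hunks: clang-format emits include categories in ascending Priority, so this configuration places generic system headers first and the project's own csrc_dipu headers last. A hypothetical include block after formatting (header names here are illustrative, chosen only to match each regex, and are not taken from the patch):

    #include <pthread.h>       // Priority 2, SortPriority 1: pthread/ext headers
    #include <vector>          // Priority 2: other <...> system headers
    #include <cuda_runtime.h>  // Priority 3: vendor runtime headers (cuda/cn/*ccl/...)
    #include <torch/torch.h>   // Priority 4: c10/aten/torch headers
    #include <pybind11/pybind11.h>           // Priority 5: Python binding headers
    #include <diopi/diopirt.h>               // Priority 8: DIOPI headers
    #include <csrc_dipu/runtime/rthelper.h>  // Priority 9: this project's headers
    #include "local_helper.h"                // Priority 10: catch-all
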
#include -#include #include +#include -void testTensorAdd(const at::Tensor& lhs, const at::Tensor& rhs) { - at::Tensor result = lhs + rhs; - std::cout << lhs << std::endl; - std::cout << rhs << std::endl; - std::cout << result << std::endl; +void testTensorAdd(const at::Tensor &lhs, const at::Tensor &rhs) { + at::Tensor result = lhs + rhs; + std::cout << lhs << std::endl; + std::cout << rhs << std::endl; + std::cout << result << std::endl; } int main() { - at::Tensor t0 = at::randn({2, 2}).cuda(); - at::Tensor t1 = at::randn({2}).cuda(); - testTensorAdd(t0, t1); - testTensorAdd(t0.cpu(), t1.cpu()); - return 0; + at::Tensor t0 = at::randn({2, 2}).cuda(); + at::Tensor t1 = at::randn({2}).cuda(); + testTensorAdd(t0, t1); + testTensorAdd(t0.cpu(), t1.cpu()); + return 0; } \ No newline at end of file diff --git a/dipu/tests/cpp/testrt.cpp b/dipu/tests/cpp/testrt.cpp index 4071c9cef..9c145674e 100644 --- a/dipu/tests/cpp/testrt.cpp +++ b/dipu/tests/cpp/testrt.cpp @@ -1,8 +1,10 @@ // Copyright (c) 2023, DeepLink. -#include #include -#include + +#include + #include +#include using namespace dipu; void testcopy() { @@ -53,12 +55,11 @@ void testStream1() { auto stream3 = getCurrentDIPUStream(); rawStream = stream3.rawstream(); std::cout << "current stream =" << rawStream << std::endl; - } // need change to use gtest. int main() { - for(int i=0; i<3; i++) { + for (int i = 0; i < 3; i++) { // testcopy(); testDeviceSwitch(); // testStream1(); diff --git a/dipu/third_party/.clang-format b/dipu/third_party/.clang-format new file mode 100644 index 000000000..a5121ff07 --- /dev/null +++ b/dipu/third_party/.clang-format @@ -0,0 +1,3 @@ +--- +DisableFormat: true +SortIncludes: Never diff --git a/dipu/torch_dipu/csrc_dipu/aten/CPUFallback.cpp b/dipu/torch_dipu/csrc_dipu/aten/CPUFallback.cpp index 3f5478428..11c6a0a04 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/CPUFallback.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/CPUFallback.cpp @@ -1,11 +1,10 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include - #include +#include #include #include -#include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -14,48 +13,52 @@ #include #endif - -namespace dipu { namespace native { +namespace dipu { +namespace native { // convenience helper for converting tensors to cpu -std::vector to_cpu(const at::TensorList& tensors) { - // We can't just call at::to_cpu() on the entire list of Tensors - // Because it will break on undefined tensors. Separate out undefined tensors first. - std::vector cpu_tensors(tensors.size()); - std::vector valid_tensors; - std::vector to_translate(tensors.size()); - for (const auto i : c10::irange(tensors.size())) { - const at::Tensor& tensor = tensors[i]; - // Explicitly handling undefined tensors here instead of letting `at::_to_cpu` handle it. - // Otherwise, we'd need to require all backends with their own implementation of _to_cpu - // to properly handle undefined tensors. - if (tensor.defined()) { - to_translate[i] = true; - valid_tensors.push_back(tensor); - } else { - cpu_tensors[i] = tensor; - } +std::vector to_cpu(const at::TensorList &tensors) { + // We can't just call at::to_cpu() on the entire list of Tensors + // Because it will break on undefined tensors. Separate out undefined tensors + // first. 
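To make the undefined-tensor contract described in that comment concrete, here is a minimal standalone sketch (an illustration under the same assumptions, not the patch's code) of the partition-and-restore pattern to_cpu() implements:

    #include <torch/torch.h>

    #include <utility>
    #include <vector>

    // Keep undefined slots untouched and batch-convert only the defined
    // tensors through at::_to_cpu(), which would throw on undefined inputs.
    std::vector<at::Tensor> to_cpu_sketch(const std::vector<at::Tensor>& tensors) {
      std::vector<at::Tensor> out(tensors.size());  // slots default to undefined
      std::vector<at::Tensor> defined;
      std::vector<bool> mask(tensors.size(), false);
      for (size_t i = 0; i < tensors.size(); ++i) {
        if (tensors[i].defined()) {
          mask[i] = true;
          defined.push_back(tensors[i]);
        }
      }
      auto moved = at::_to_cpu(defined);
      for (size_t i = 0, j = 0; i < tensors.size(); ++i) {
        if (mask[i]) {
          out[i] = std::move(moved[j++]);
        }
      }
      return out;
    }
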
+ std::vector cpu_tensors(tensors.size()); + std::vector valid_tensors; + std::vector to_translate(tensors.size()); + for (const auto i : c10::irange(tensors.size())) { + const at::Tensor &tensor = tensors[i]; + // Explicitly handling undefined tensors here instead of letting + // `at::_to_cpu` handle it. Otherwise, we'd need to require all backends + // with their own implementation of _to_cpu to properly handle undefined + // tensors. + if (tensor.defined()) { + to_translate[i] = true; + valid_tensors.push_back(tensor); + } else { + cpu_tensors[i] = tensor; } - auto cpu_valid_tensors = at::_to_cpu(valid_tensors); - for (size_t i = 0, defined_pos = 0; i < tensors.size(); ++i) { - if (to_translate[i]) { - cpu_tensors[i] = std::move(cpu_valid_tensors[defined_pos++]); - } + } + auto cpu_valid_tensors = at::_to_cpu(valid_tensors); + for (size_t i = 0, defined_pos = 0; i < tensors.size(); ++i) { + if (to_translate[i]) { + cpu_tensors[i] = std::move(cpu_valid_tensors[defined_pos++]); } + } return cpu_tensors; } -c10::optional compute_target_device(std::vector& t_args, std::vector> tlist_args) { +c10::optional compute_target_device( + std::vector &t_args, + std::vector> tlist_args) { // Decide what device to move the output tensor(s) to. - // The current convention is that we use the first tensor arg to pick the device - // Barring that, we take the first tensor from a TensorList arg. + // The current convention is that we use the first tensor arg to pick the + // device Barring that, we take the first tensor from a TensorList arg. if (!t_args.empty()) { return t_args[0].device(); } else { - // We need to loop through all of the (potentially multiple) TensorList arguments - // In case, e.g. the first one is empty but the second is not. - for (auto& tens_list : tlist_args) { + // We need to loop through all of the (potentially multiple) TensorList + // arguments In case, e.g. the first one is empty but the second is not. + for (auto &tens_list : tlist_args) { for (const auto i : c10::irange(tens_list.size())) { return tens_list.get(i).device(); } @@ -64,9 +67,8 @@ c10::optional compute_target_device(std::vector& t_args return c10::nullopt; } - -void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - auto& schema_args = op.schema().arguments(); +void cpu_fallback(const c10::OperatorHandle &op, torch::jit::Stack *stack) { + auto &schema_args = op.schema().arguments(); const auto num_arguments = schema_args.size(); auto arguments = torch::jit::last(stack, num_arguments); const auto arguments_begin = stack->size() - num_arguments; @@ -78,37 +80,42 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { std::vector> cpu_tensorlist_args; std::vector tensorlist_args_indices; - static bool log_fallback_detail = std::getenv("DIPU_LOG_FALLBACK_INFO") != nullptr; + static bool log_fallback_detail = + std::getenv("DIPU_LOG_FALLBACK_INFO") != nullptr; // Step 1: Convert all non-CPU tensor inputs into CPU tensors // and put them on the stack at the correct indices. 
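An aside on compute_target_device() above: the convention it encodes is compact enough to restate as a self-contained sketch (names are illustrative, not the patch's exact code):

    #include <ATen/ATen.h>
    #include <ATen/core/List.h>

    #include <vector>

    // The first Tensor argument decides the output device; failing that, the
    // first element of any non-empty TensorList; otherwise there is no target
    // device at all (e.g. torch.cat() called with an empty list).
    c10::optional<c10::Device> target_device_sketch(
        const std::vector<at::Tensor>& t_args,
        const std::vector<c10::List<at::Tensor>>& tlist_args) {
      if (!t_args.empty()) {
        return t_args[0].device();
      }
      for (const auto& list : tlist_args) {
        if (list.size() > 0) {
          return list.get(0).device();
        }
      }
      return c10::nullopt;
    }
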
for (const auto idx : c10::irange(arguments.size())) { - const auto& ivalue = arguments[idx]; + const auto &ivalue = arguments[idx]; if (log_fallback_detail) { - std::cout << "cpu_fallback:\t"<< op.schema().name() << "\t arguments["<< idx <<"] :" - << schema_args[idx].name() - << "\ttype:" << schema_args[idx].type() - << "\tisTensor:" << ivalue.isTensor() - << "\tisTensorList:" << ivalue.isTensorList() - << "\talias_info:" << schema_args[idx].alias_info() - << std::endl; + std::cout << "cpu_fallback:\t" << op.schema().name() << "\t arguments[" + << idx << "] :" << schema_args[idx].name() + << "\ttype:" << schema_args[idx].type() + << "\tisTensor:" << ivalue.isTensor() + << "\tisTensorList:" << ivalue.isTensorList() + << "\talias_info:" << schema_args[idx].alias_info() + << std::endl; } if (ivalue.isTensor()) { tensor_args.push_back(ivalue.toTensor()); tensor_args_indices.push_back(idx); } else if (ivalue.isTensorList()) { - // Note: we copy each TensorList argument to CPU individually out of convenience, - // but XLA would benefit from materializing all tensor and TensorList args onto the CPU at the same time. - // We can improve this if we need better perf for XLA's CPU fallbacks. + // Note: we copy each TensorList argument to CPU individually out of + // convenience, but XLA would benefit from materializing all tensor and + // TensorList args onto the CPU at the same time. We can improve this if + // we need better perf for XLA's CPU fallbacks. tensorlist_args.push_back(ivalue.toTensorList()); - auto cpu_ivalue = c10::IValue(c10::List(to_cpu(ivalue.toTensorList().vec()))); + auto cpu_ivalue = c10::IValue( + c10::List(to_cpu(ivalue.toTensorList().vec()))); (*stack)[arguments_begin + idx] = std::move(cpu_ivalue); - cpu_tensorlist_args.push_back((*stack)[arguments_begin + idx].toTensorList()); + cpu_tensorlist_args.push_back( + (*stack)[arguments_begin + idx].toTensorList()); tensorlist_args_indices.push_back(idx); } } - // XLA requires all of the tensor arguments to be gathered up and converted to CPU together. + // XLA requires all of the tensor arguments to be gathered up and converted to + // CPU together. auto cpu_tensors = to_cpu(tensor_args); for (const auto i : c10::irange(tensor_args_indices.size())) { @@ -119,42 +126,48 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { // Step 2: Call the underlying CPU implementation of the operator op.redispatchBoxed(c10::DispatchKeySet(c10::DispatchKey::CPU), stack); - static bool force_copy_tensor = std::getenv("DIPU_DISABLE_FORCE_FALLBACK_COPY_TENSOR") == nullptr; + static bool force_copy_tensor = + std::getenv("DIPU_DISABLE_FORCE_FALLBACK_COPY_TENSOR") == nullptr; // Step 3: We need to take special care to handle mutable aliases properly: // If any input tensors are mutable aliases, we need to - // directly copy the updated data on the CPU tensors back to the original inputs. + // directly copy the updated data on the CPU tensors back to the original + // inputs. 
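Step 3 is what keeps in-place operators correct through the fallback: the CPU kernel only ever mutates the CPU copies, so the results must be copied back into the original device tensors. A minimal illustration, using CUDA as a stand-in for a DIPU device:

    #include <torch/torch.h>

    int main() {
      at::Tensor dev = torch::zeros({4}).cuda();  // stand-in for a DIPU tensor
      at::Tensor cpu = dev.cpu();                 // Step 1: copy the input to CPU
      cpu.add_(1);                                // Step 2: run the op's CPU kernel
      // Without Step 3, `dev` would still be all zeros: add_ only touched `cpu`.
      dev.copy_(cpu);                             // Step 3: write the result back
      return 0;
    }
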
for (const auto i : c10::irange(tensor_args_indices.size())) { auto tensor_idx = tensor_args_indices[i]; - const at::AliasInfo* alias_info = schema_args[tensor_idx].alias_info(); + const at::AliasInfo *alias_info = schema_args[tensor_idx].alias_info(); if ((alias_info != nullptr && alias_info->isWrite())) { if (log_fallback_detail) { - std::cout << "write back: " << tensor_idx << ":" << cpu_tensors[i].options() - << ",size:" << cpu_tensors[i].sizes() - << ", " << tensor_args[i].options() - << ",size:"<< cpu_tensors[i].sizes() << std::endl; + std::cout << "write back: " << tensor_idx << ":" + << cpu_tensors[i].options() + << ",size:" << cpu_tensors[i].sizes() << ", " + << tensor_args[i].options() + << ",size:" << cpu_tensors[i].sizes() << std::endl; } tensor_args[i].reshape_as(cpu_tensors[i]).copy_(cpu_tensors[i], false); } } for (const auto i : c10::irange(tensorlist_args_indices.size())) { auto tensorlist_idx = tensorlist_args_indices[i]; - const at::AliasInfo* alias_info = schema_args[tensorlist_idx].alias_info(); + const at::AliasInfo *alias_info = schema_args[tensorlist_idx].alias_info(); if ((alias_info != nullptr && alias_info->isWrite())) { c10::List cpu_tensorlist = cpu_tensorlist_args[i]; std::vector tensorlist = tensorlist_args[i].vec(); - for(auto j = 0; j < tensorlist.size(); j++) { - + for (auto j = 0; j < tensorlist.size(); j++) { if (cpu_tensorlist.get(j).defined()) { - tensorlist[j].reshape_as(cpu_tensorlist.get(j)).copy_(cpu_tensorlist.get(j), false); + tensorlist[j] + .reshape_as(cpu_tensorlist.get(j)) + .copy_(cpu_tensorlist.get(j), false); } if (log_fallback_detail) { - std::cout << "write back " << tensorlist_idx << "th args " << j << "th tensor:" - << cpu_tensorlist.get(j).sizes() << cpu_tensorlist.get(j).options() - << tensorlist[j].sizes() << tensorlist[j].options() << std::endl; + std::cout << "write back " << tensorlist_idx << "th args " << j + << "th tensor:" << cpu_tensorlist.get(j).sizes() + << cpu_tensorlist.get(j).options() << tensorlist[j].sizes() + << tensorlist[j].options() << std::endl; } } - (*stack)[arguments_begin + tensorlist_idx] = c10::IValue(c10::List(tensorlist)); + (*stack)[arguments_begin + tensorlist_idx] = + c10::IValue(c10::List(tensorlist)); } } // Step 4: Convert any CPU output tensors back to the original input device. @@ -165,70 +178,88 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { // Note [CPU Fallback Does Not Handle View Operators] // Also note that we are incapable of handling immutable alises properly. // Why? - // Schemas with an immutable alias'd tensor outputs correspond to view operators. - // For example, the `view_as` schema from native_functions.yaml: + // Schemas with an immutable alias'd tensor outputs correspond to view + // operators. For example, the `view_as` schema from native_functions.yaml: // `view_as(Tensor(a) self, Tensor other) -> Tensor(a)` // We can't handle these ops properly, because view ops are supposed to return // a NEW tensor that shares the SAME storage as the original tensor. // However, the new tensor that we created cannot share the same storage, // since it lives on CPU and the original tensor lives on a different device. // Because of that, we warn if someone attempts to call the - // CPU fallback on a view operator (this is to maintain BC for view ops for XLA - // that fall back to CPU). 
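The note just above states the key constraint: a view must alias its base's storage, and that aliasing cannot survive a copy to another device. A short demonstration of the property in question (illustrative, any backend):

    #include <torch/torch.h>

    int main() {
      at::Tensor base = torch::arange(6);
      at::Tensor v = base.view({2, 3});
      // A real view aliases its base's storage; mutating one mutates the other.
      TORCH_CHECK(v.data_ptr() == base.data_ptr());
      // A tensor copied to another device necessarily owns fresh storage, so a
      // CPU-computed "view" can never alias the device base -- hence the
      // TORCH_WARN issued by the fallback for view operators.
      return 0;
    }
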
- const auto& schema_returns = op.schema().returns(); - const auto& num_returns = schema_returns.size(); + // CPU fallback on a view operator (this is to maintain BC for view ops for + // XLA that fall back to CPU). + const auto &schema_returns = op.schema().returns(); + const auto &num_returns = schema_returns.size(); auto returns = torch::jit::last(stack, num_returns); const auto returns_begin = stack->size() - num_returns; for (const auto idx : c10::irange(returns.size())) { if (returns[idx].isTensor()) { - const auto& return_tens = returns[idx].toTensor(); + const auto &return_tens = returns[idx].toTensor(); if (return_tens.defined()) { - const at::AliasInfo* alias_info = schema_returns[idx].alias_info(); + const at::AliasInfo *alias_info = schema_returns[idx].alias_info(); if (alias_info != nullptr && alias_info->isWrite()) { - // Case (1): mutable alias case. Move the input ivalue directly onto the stack - // in place of the existing cpu output tensor. + // Case (1): mutable alias case. Move the input ivalue directly onto + // the stack in place of the existing cpu output tensor. bool found_alias = false; - // We could store some extra metadata on the function schema to avoid the loop here - // if we need to improve perf. + // We could store some extra metadata on the function schema to avoid + // the loop here if we need to improve perf. for (const auto i : c10::irange(tensor_args_indices.size())) { auto input_tensor_idx = tensor_args_indices[i]; - const auto& input_tensor = cpu_tensors[i]; - const at::AliasInfo* input_alias_info = schema_args[input_tensor_idx].alias_info(); - // Checked above; adding assert to guard against breakage of the below condition due to changing the above if test. + const auto &input_tensor = cpu_tensors[i]; + const at::AliasInfo *input_alias_info = + schema_args[input_tensor_idx].alias_info(); + // Checked above; adding assert to guard against breakage of the + // below condition due to changing the above if test. TORCH_INTERNAL_ASSERT_DEBUG_ONLY(alias_info != nullptr); - if (input_tensor.defined() && (alias_info == input_alias_info || (input_alias_info != nullptr && *alias_info == *input_alias_info))) { - // We've found the original input tensor that aliases with the current output. - // Wrap it in an IValue and put it directly on the stack. + if (input_tensor.defined() && + (alias_info == input_alias_info || + (input_alias_info != nullptr && + *alias_info == *input_alias_info))) { + // We've found the original input tensor that aliases with the + // current output. Wrap it in an IValue and put it directly on the + // stack. (*stack)[returns_begin + idx] = c10::IValue(tensor_args[i]); found_alias = true; break; } } - TORCH_CHECK(found_alias, "The operator ", op.schema().operator_name(), " appears to have invalid alias information. ", - "Found a return tensor argument with a mismatched mutable alias: ", schema_returns[idx]); + TORCH_CHECK(found_alias, "The operator ", op.schema().operator_name(), + " appears to have invalid alias information. ", + "Found a return tensor argument with a mismatched " + "mutable alias: ", + schema_returns[idx]); } else { - c10::optional tgt_device = compute_target_device(tensor_args, tensorlist_args); + c10::optional tgt_device = + compute_target_device(tensor_args, tensorlist_args); if (alias_info != nullptr && !alias_info->isWrite()) { - // immutable alias (view) case: Warn here, since we're copying and not creating a view. - //If this operator is needed, the backend should provide a kernel for it. 
+ // immutable alias (view) case: Warn here, since we're copying and + // not creating a view. + // If this operator is needed, the backend should provide a kernel + // for it. // See Note [CPU Fallback Does Not Handle View Operators] std::stringstream dev_str; if (tgt_device) { - dev_str << *tgt_device; + dev_str << *tgt_device; } else { - dev_str << ""; + dev_str << ""; } - TORCH_WARN(false, "The operator ", op.schema().operator_name(), " appears to be a view operator, ", - "but it has no implementation for the backend \"", dev_str.str(), "\". View operators don't support ", - "falling back to run on the CPU, since the tensor's storage cannot be shared across devices."); + TORCH_WARN(false, "The operator ", op.schema().operator_name(), + " appears to be a view operator, ", + "but it has no implementation for the backend \"", + dev_str.str(), "\". View operators don't support ", + "falling back to run on the CPU, since the tensor's " + "storage cannot be shared across devices."); } - // Case (2): copy case. Copy the cpu output tensor to the original device. + // Case (2): copy case. Copy the cpu output tensor to the original + // device. - // We technically might not have a target device, e.g. if you call torch.cat() with an empty list - // In that case, we shouldn't have any tensors to schlep across devices anyway. + // We technically might not have a target device, e.g. if you call + // torch.cat() with an empty list In that case, we shouldn't have any + // tensors to schlep across devices anyway. if (tgt_device) { - (*stack)[returns_begin + idx] = c10::IValue(returns[idx].toTensor().to(*tgt_device)); + (*stack)[returns_begin + idx] = + c10::IValue(returns[idx].toTensor().to(*tgt_device)); } } } @@ -236,5 +267,5 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { } } -} // namespace native -} // namespace at +} // namespace native +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/aten/DIPUATenFunctions.h b/dipu/torch_dipu/csrc_dipu/aten/DIPUATenFunctions.h index c9919cfe2..496f001ba 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/DIPUATenFunctions.h +++ b/dipu/torch_dipu/csrc_dipu/aten/DIPUATenFunctions.h @@ -1,46 +1,63 @@ // Copyright (c) 2023, DeepLink. 
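One usage note before this header continues: RegisterDIPU.cpp further below assembles a force-fallback list from the DIPU_FORCE_FALLBACK_OPS_LIST environment variable and the .dipu_force_fallback_op_list.config file, both holding comma-separated regexes. A standalone sketch of the matching rule it applies (mirroring get_force_fallback(), with assumed example values):

    #include <iostream>
    #include <regex>
    #include <sstream>
    #include <string>

    // An op is forced onto the CPU fallback path when any comma-separated
    // pattern in the list fully matches its name.
    bool force_fallback_sketch(const std::string& list, const std::string& op) {
      std::stringstream stream(list);
      std::string pattern;
      while (std::getline(stream, pattern, ',')) {
        if (!pattern.empty() && std::regex_match(op, std::regex(pattern))) {
          return true;
        }
      }
      return false;
    }

    int main() {
      // e.g. export DIPU_FORCE_FALLBACK_OPS_LIST="add.*,.*norm"
      std::cout << force_fallback_sketch("add.*,.*norm", "add.Tensor") << '\n';  // 1
      std::cout << force_fallback_sketch("add.*,.*norm", "mul.Tensor") << '\n';  // 0
    }
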
#pragma once +#include #include #include -#include namespace dipu::native { struct DIPUATenFunctions { - - // dipu native func - static at::Tensor empty(at::IntArrayRef size, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt, c10::optional memory_format_opt); - static at::Tensor empty_cpu(at::IntArrayRef size, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt, c10::optional memory_format_opt); - - static at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt); - static at::Tensor empty_strided_cpu(at::IntArrayRef size, at::IntArrayRef stride, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt); - - static at::Tensor& copy_(at::Tensor & self, const at::Tensor & src, bool non_blocking); - - static const at::Tensor& resize_(const at::Tensor& self, at::IntArrayRef size, c10::optional memory_format); - - static at::Scalar _local_scalar_dense_dipu(const at::Tensor& self); - - static at::Tensor& set_storage_dipu_(at::Tensor& result, c10::Storage storage, int64_t storage_offset, - at::IntArrayRef size, at::IntArrayRef stride); - static at::Tensor& set_dipu_(at::Tensor& self); - - static void resize_bytes_dipu(c10::StorageImpl* storage, size_t newsize_bytes); - - static bool is_pinned(const at::Tensor& self, c10::optional device); - static at::Tensor _pin_memory(const at::Tensor& self, c10::optional device); - - // todo:: use same format as autogen - // diopi function defined in AutoGenedKernels.cpp, + // dipu native func + static at::Tensor empty(at::IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt); + static at::Tensor empty_cpu( + at::IntArrayRef size, c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, c10::optional pin_memory_opt, + c10::optional memory_format_opt); + + static at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt); + static at::Tensor empty_strided_cpu(at::IntArrayRef size, + at::IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt); + + static at::Tensor ©_(at::Tensor &self, const at::Tensor &src, + bool non_blocking); + + static const at::Tensor &resize_( + const at::Tensor &self, at::IntArrayRef size, + c10::optional memory_format); + + static at::Scalar _local_scalar_dense_dipu(const at::Tensor &self); + + static at::Tensor &set_storage_dipu_(at::Tensor &result, c10::Storage storage, + int64_t storage_offset, + at::IntArrayRef size, + at::IntArrayRef stride); + static at::Tensor &set_dipu_(at::Tensor &self); + + static void resize_bytes_dipu(c10::StorageImpl *storage, + size_t newsize_bytes); + + static bool is_pinned(const at::Tensor &self, + c10::optional device); + static at::Tensor _pin_memory(const at::Tensor &self, + c10::optional device); + + // todo:: use same format as autogen + // diopi function defined in AutoGenedKernels.cpp, }; } // namespace dipu::native diff --git a/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp b/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp index 53be2134c..dff9de325 100644 --- 
a/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp @@ -1,49 +1,53 @@ // Copyright (c) 2023, DeepLink. #include "RegisterDIPU.hpp" -#include + #include -#include -#include -#include +#include + #include +#include #include +#include +#include +#include #include #include #include -#include using dnative = dipu::native::DIPUATenFunctions; -static std::string force_fallback_operators_list = []()-> std::string { - std::ifstream stream(".dipu_force_fallback_op_list.config", std::ios_base::in | std::ios::binary); - std::string content; - const char* env = std::getenv("DIPU_FORCE_FALLBACK_OPS_LIST"); - if (env != nullptr) { - content += env; - } - if (stream.is_open()) { - while (!stream.eof()) { - std::string line; - stream >> line; - content += "," + line; - } +static std::string force_fallback_operators_list = []() -> std::string { + std::ifstream stream(".dipu_force_fallback_op_list.config", + std::ios_base::in | std::ios::binary); + std::string content; + const char *env = std::getenv("DIPU_FORCE_FALLBACK_OPS_LIST"); + if (env != nullptr) { + content += env; + } + if (stream.is_open()) { + while (!stream.eof()) { + std::string line; + stream >> line; + content += "," + line; } - return content; + } + return content; }(); namespace dipu { -bool get_force_fallback(const char* opname) { +bool get_force_fallback(const char *opname) { if (force_fallback_operators_list.size() <= 0 || opname == nullptr) { return false; } else { std::stringstream strstream(force_fallback_operators_list); std::string force_fallback_pattern; - while(std::getline(strstream, force_fallback_pattern, ',')) { + while (std::getline(strstream, force_fallback_pattern, ',')) { if (force_fallback_pattern.size() <= 0) { continue; } - bool force_fallback = std::regex_match(opname, std::regex(force_fallback_pattern)); + bool force_fallback = + std::regex_match(opname, std::regex(force_fallback_pattern)); if (force_fallback) { return true; } @@ -53,13 +57,14 @@ bool get_force_fallback(const char* opname) { } namespace native { -void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack); +void cpu_fallback(const c10::OperatorHandle &op, torch::jit::Stack *stack); } // end of namespace native -void dump_fallback_op_args(const c10::OperatorHandle& op, const torch::jit::Stack* stack) { +void dump_fallback_op_args(const c10::OperatorHandle &op, + const torch::jit::Stack *stack) { static int level = []() { - const char* env_ptr = std::getenv("DIPU_DUMP_OP_ARGS"); - return env_ptr ? std::atoi(env_ptr) : 0; + const char *env_ptr = std::getenv("DIPU_DUMP_OP_ARGS"); + return env_ptr ? std::atoi(env_ptr) : 0; }(); if (level < 1) { @@ -68,33 +73,41 @@ void dump_fallback_op_args(const c10::OperatorHandle& op, const torch::jit::Stac const auto name = c10::toString(op.operator_name()); printf("--%-50s %-30s \n", ("[" + name + "]:").data(), "dipu_fallback"); - auto& schema_args = op.schema().arguments(); + auto &schema_args = op.schema().arguments(); const auto num_arguments = schema_args.size(); auto arguments = torch::jit::last(stack, num_arguments); auto dumpTensor = [&](const at::Tensor tensor) { if (tensor.defined()) { - std::cout << "numel: " << tensor.numel() << ", sizes: " << tensor.sizes() << ", stride: " << tensor.strides() << ", is_view: " << tensor.is_view() << ", dtype: " << tensor.dtype() - << ", device:" << tensor.device() << ", layout:" << tensor.layout() << ", requires_grad: " << (tensor.requires_grad() ? 
"true" : "false") << ", pinned_memory: " << (tensor.is_pinned() ? "true" : "false") - << ", memory_format: " << tensor.suggest_memory_format() << ", data_ptr: " << tensor.data_ptr(); - if (level > 2) { - std::cout << std::endl << tensor; - } - } else { - std::cout << "undefined"; + std::cout << "numel: " << tensor.numel() << ", sizes: " << tensor.sizes() + << ", stride: " << tensor.strides() + << ", is_view: " << tensor.is_view() + << ", dtype: " << tensor.dtype() + << ", device:" << tensor.device() + << ", layout:" << tensor.layout() << ", requires_grad: " + << (tensor.requires_grad() ? "true" : "false") + << ", pinned_memory: " + << (tensor.is_pinned() ? "true" : "false") + << ", memory_format: " << tensor.suggest_memory_format() + << ", data_ptr: " << tensor.data_ptr(); + if (level > 2) { + std::cout << std::endl << tensor; } + } else { + std::cout << "undefined"; + } }; const auto arguments_begin = stack->size() - num_arguments; for (const auto idx : c10::irange(arguments.size())) { std::cout << "\t" << name << ": \t" << schema_args[idx].name() << ": "; - const auto& ivalue = arguments[idx]; + const auto &ivalue = arguments[idx]; if (ivalue.isTensor()) { - const auto& tensor = ivalue.toTensor(); + const auto &tensor = ivalue.toTensor(); dumpTensor(tensor); std::cout << std::endl; } else if (ivalue.isTensorList()) { - const auto& tensorlist = ivalue.toTensorList(); + const auto &tensorlist = ivalue.toTensorList(); std::cout << std::endl; for (size_t i = 0; i < tensorlist.size(); i++) { std::cout << "\t"; @@ -102,7 +115,7 @@ void dump_fallback_op_args(const c10::OperatorHandle& op, const torch::jit::Stac std::cout << std::endl; } } else { - std:: cout << ivalue << std::endl; + std::cout << ivalue << std::endl; } } } @@ -111,23 +124,25 @@ void dump_fallback_op_args(const c10::OperatorHandle& op, const torch::jit::Stac namespace at { -void dipu_fallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, - torch::jit::Stack* stack) { +void dipu_fallback(const c10::OperatorHandle &op, DispatchKeySet dispatch_keys, + torch::jit::Stack *stack) { dipu::dump_fallback_op_args(op, stack); const auto name = c10::toString(op.operator_name()); - //TORCH_CHECK(name.find("foreach") == std::string::npos, - // "Currently the foreach operator does not support fallback: ", name); + // TORCH_CHECK(name.find("foreach") == std::string::npos, + // "Currently the foreach operator does not support fallback: ", name); const bool forech_op = name.find("foreach") != std::string::npos; DIPU_OP_LOG_WARNING_ONCE("fallback to cpu, name=" << name << std::endl); const static std::vector custom_fallback_operators_list{ - "aten::native_batch_norm", - "aten::native_batch_norm.out", - "aten::native_batch_norm_backward", + "aten::native_batch_norm", + "aten::native_batch_norm.out", + "aten::native_batch_norm_backward", }; - auto iter = std::find(custom_fallback_operators_list.cbegin(), custom_fallback_operators_list.cend(), std::string(name)); + auto iter = + std::find(custom_fallback_operators_list.cbegin(), + custom_fallback_operators_list.cend(), std::string(name)); if (iter != custom_fallback_operators_list.cend() || forech_op) { dipu::native::cpu_fallback(op, stack); } else { @@ -135,13 +150,15 @@ void dipu_fallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, } } -std::deque> DIPUOpRegister::dipuOpRegisterList; +std::deque> + DIPUOpRegister::dipuOpRegisterList; std::mutex DIPUOpRegister::mutex_; void DIPUOpRegister::register_op() { std::lock_guard guard(mutex_); - for (auto iter = 
dipuOpRegisterList.begin(); iter != dipuOpRegisterList.end(); ++iter) { - torch::Library* lib = std::get<0>(*iter); + for (auto iter = dipuOpRegisterList.begin(); iter != dipuOpRegisterList.end(); + ++iter) { + torch::Library *lib = std::get<0>(*iter); DIPUOpRegister::OpRegFunPtr fun_ptr = std::get<1>(*iter); fun_ptr(*lib); } @@ -149,179 +166,223 @@ void DIPUOpRegister::register_op() { } namespace { - // dipu native ops - at::Tensor wrapper_DIPU_empty_memory_format(at::IntArrayRef size, c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, c10::optional pin_memory_opt, - c10::optional memory_format_opt) { - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - const DeviceGuard device_guard(device_or_default(device_opt)); - return dnative::empty(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); - } - - at::Tensor wrapper_CPU_empty_memory_format(at::IntArrayRef size, c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { - return dnative::empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); - } - - at::Tensor wrapper_DIPU_empty_strided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - const DeviceGuard device_guard(device_or_default(device_opt)); - return dnative::empty_strided(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); - } +// dipu native ops +at::Tensor wrapper_DIPU_empty_memory_format( + at::IntArrayRef size, c10::optional dtype_opt, + c10::optional layout_opt, c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) { + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + const DeviceGuard device_guard(device_or_default(device_opt)); + return dnative::empty(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, + memory_format_opt); +} - at::Tensor wrapper_CPU_empty_strided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { - return dnative::empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); - } +at::Tensor wrapper_CPU_empty_memory_format( + at::IntArrayRef size, c10::optional dtype_opt, + c10::optional layout_opt, c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) { + return dnative::empty_cpu(size, dtype_opt, layout_opt, device_opt, + pin_memory_opt, memory_format_opt); +} +at::Tensor wrapper_DIPU_empty_strided(at::IntArrayRef size, + at::IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + const DeviceGuard device_guard(device_or_default(device_opt)); + return dnative::empty_strided(size, stride, dtype_opt, layout_opt, device_opt, + pin_memory_opt); +} - at::Tensor wrapper_DIPU___reshape_alias(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) { - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - return at::native::_reshape_alias(self, C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride)); - } +at::Tensor wrapper_CPU_empty_strided(at::IntArrayRef size, + at::IntArrayRef 
stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { + return dnative::empty_strided_cpu(size, stride, dtype_opt, layout_opt, + device_opt, pin_memory_opt); +} - // only used by cpu_fallback. - at::Tensor wrapper_DIPU___copy_from_and_resize(const at::Tensor & self, const at::Tensor& dst) { - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - dst.resize_as_(self).copy_(self); - return dst; - } +at::Tensor wrapper_DIPU___reshape_alias(const at::Tensor &self, + c10::SymIntArrayRef size, + c10::SymIntArrayRef stride) { + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + return at::native::_reshape_alias(self, C10_AS_INTARRAYREF_SLOW(size), + C10_AS_INTARRAYREF_SLOW(stride)); +} - const at::Tensor& wrapper_resize_(const at::Tensor& self, at::IntArrayRef size, c10::optional memory_format) { - // add guard for device switch. - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - return dnative::resize_(self, size, memory_format); - } +// only used by cpu_fallback. +at::Tensor wrapper_DIPU___copy_from_and_resize(const at::Tensor &self, + const at::Tensor &dst) { + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + dst.resize_as_(self).copy_(self); + return dst; +} - at::Tensor wrapper_DIPU__as_strided(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional storage_offset) { - // No device check - // DeviceGuard omitted - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - return at::native::as_strided_tensorimpl(self, C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride), storage_offset.has_value() ? c10::make_optional(storage_offset->expect_int()) : c10::nullopt); - } +const at::Tensor &wrapper_resize_( + const at::Tensor &self, at::IntArrayRef size, + c10::optional memory_format) { + // add guard for device switch. + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + return dnative::resize_(self, size, memory_format); +} - at::Tensor wrapper_DIPU__view(const at::Tensor & self, c10::SymIntArrayRef size) { - // No device check - // DeviceGuard omitted - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - return at::native::view(self, C10_AS_INTARRAYREF_SLOW(size)); - } +at::Tensor wrapper_DIPU__as_strided(const at::Tensor &self, + c10::SymIntArrayRef size, + c10::SymIntArrayRef stride, + c10::optional storage_offset) { + // No device check + // DeviceGuard omitted + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + return at::native::as_strided_tensorimpl( + self, C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride), + storage_offset.has_value() + ? 
c10::make_optional(storage_offset->expect_int()) + : c10::nullopt); +} - at::Tensor wrapper_DIPU__view_as_real(const at::Tensor & self) { - // DeviceGuard omitted - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - return at::native::view_as_real(self); - } +at::Tensor wrapper_DIPU__view(const at::Tensor &self, + c10::SymIntArrayRef size) { + // No device check + // DeviceGuard omitted + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + return at::native::view(self, C10_AS_INTARRAYREF_SLOW(size)); +} - at::Tensor wrapper_DIPU__view_as_complex(const at::Tensor & self) { - // DeviceGuard omitted - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - return at::native::view_as_complex(self); - } +at::Tensor wrapper_DIPU__view_as_real(const at::Tensor &self) { + // DeviceGuard omitted + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + return at::native::view_as_real(self); +} - at::Tensor & wrapper_DIPU__zero_(at::Tensor & self) { - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - const OptionalDeviceGuard device_guard(device_of(self)); - return at::native::zero_(self); - } +at::Tensor wrapper_DIPU__view_as_complex(const at::Tensor &self) { + // DeviceGuard omitted + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + return at::native::view_as_complex(self); +} - // it's a view op, However it's not registered by RegisterCompositeExplicitAutograd.cpp, - // but by cpu/cuda backend. - at::Tensor wrapper_DIPU__unfold(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step) { - // No device check - // DeviceGuard omitted - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - return at::native::unfold(self, dimension, size, step); - } +at::Tensor &wrapper_DIPU__zero_(at::Tensor &self) { + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + const OptionalDeviceGuard device_guard(device_of(self)); + return at::native::zero_(self); +} - at::Scalar wrapper_DIPU___local_scalar_dense(const at::Tensor & self) { - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - const OptionalDeviceGuard device_guard(device_of(self)); - return dnative::_local_scalar_dense_dipu(self); - } +// it's a view op, However it's not registered by +// RegisterCompositeExplicitAutograd.cpp, but by cpu/cuda backend. 
+at::Tensor wrapper_DIPU__unfold(const at::Tensor &self, int64_t dimension, + int64_t size, int64_t step) { + // No device check + // DeviceGuard omitted + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + return at::native::unfold(self, dimension, size, step); +} - at::Tensor& wrapper_DIPU_source_Storage_set_(at::Tensor& self, at::Storage source) { - // No device check - // DeviceGuard omitted - int64_t new_size = static_cast(source.nbytes() / self.dtype().itemsize()); - return dnative::set_storage_dipu_(self, std::move(source), 0, new_size, {}); - } +at::Scalar wrapper_DIPU___local_scalar_dense(const at::Tensor &self) { + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + const OptionalDeviceGuard device_guard(device_of(self)); + return dnative::_local_scalar_dense_dipu(self); +} - at::Tensor& wrapper_DIPU_source_Storage_offset_set_(at::Tensor& self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) { - // No device check - // DeviceGuard omitted - return dnative::set_storage_dipu_(self, source, storage_offset.expect_int(), C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride)); - } +at::Tensor &wrapper_DIPU_source_Storage_set_(at::Tensor &self, + at::Storage source) { + // No device check + // DeviceGuard omitted + int64_t new_size = + static_cast(source.nbytes() / self.dtype().itemsize()); + return dnative::set_storage_dipu_(self, std::move(source), 0, new_size, {}); +} - at::Tensor & wrapper_DIPU_source_Tensor_set_(at::Tensor& self, const at::Tensor & source) { - // No device check - // DeviceGuard omitted - if (self.unsafeGetTensorImpl() != source.unsafeGetTensorImpl()) { - return dnative::set_storage_dipu_(self, source.storage(), source.storage_offset(), source.sizes(), source.strides()); - } - return self; - } +at::Tensor &wrapper_DIPU_source_Storage_offset_set_( + at::Tensor &self, at::Storage source, c10::SymInt storage_offset, + c10::SymIntArrayRef size, c10::SymIntArrayRef stride) { + // No device check + // DeviceGuard omitted + return dnative::set_storage_dipu_(self, source, storage_offset.expect_int(), + C10_AS_INTARRAYREF_SLOW(size), + C10_AS_INTARRAYREF_SLOW(stride)); +} - at::Tensor& wrapper_DIPU__set_(at::Tensor & self) { - c10::optional common_device = nullopt; - (void)common_device; // Suppress unused variable warning - c10::impl::check_and_update_common_device(common_device, self, "wrapper_DIPU__set_", "self"); - const OptionalDeviceGuard device_guard(device_of(self)); - return dnative::set_dipu_(self); +at::Tensor &wrapper_DIPU_source_Tensor_set_(at::Tensor &self, + const at::Tensor &source) { + // No device check + // DeviceGuard omitted + if (self.unsafeGetTensorImpl() != source.unsafeGetTensorImpl()) { + return dnative::set_storage_dipu_(self, source.storage(), + source.storage_offset(), source.sizes(), + source.strides()); } + return self; +} - bool wrapper_DIPU__is_set_to(const at::Tensor& self, const at::Tensor& tensor) { - // No device check - // DeviceGuard omitted - return at::native::is_set_to(self, tensor); - } +at::Tensor &wrapper_DIPU__set_(at::Tensor &self) { + c10::optional common_device = nullopt; + (void)common_device; // Suppress unused variable warning + c10::impl::check_and_update_common_device(common_device, self, + "wrapper_DIPU__set_", "self"); + const OptionalDeviceGuard device_guard(device_of(self)); + return dnative::set_dipu_(self); +} - bool wrapper_BackendSelect_is_pinned(const at::Tensor& self, c10::optional device) { - // Only CPU tensors can be 
pinned - if (!self.is_cpu()) { - return false; - } +bool wrapper_DIPU__is_set_to(const at::Tensor &self, const at::Tensor &tensor) { + // No device check + // DeviceGuard omitted + return at::native::is_set_to(self, tensor); +} - c10::DispatchKeySet dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(dipu::DIPU_DEVICE_TYPE))); - return at::_ops::is_pinned::redispatch(dk, self, device); +bool wrapper_BackendSelect_is_pinned(const at::Tensor &self, + c10::optional device) { + // Only CPU tensors can be pinned + if (!self.is_cpu()) { + return false; } - at::Tensor wrapper_BackendSelect__pin_memory(const at::Tensor& self, c10::optional device) { - TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned"); - c10::DispatchKeySet dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(dipu::DIPU_DEVICE_TYPE))); - return at::_ops::_pin_memory::redispatch(dk, self, device); - } + c10::DispatchKeySet dk = c10::DispatchKeySet(c10::computeDispatchKey( + c10::nullopt, self.layout(), device.value_or(dipu::DIPU_DEVICE_TYPE))); + return at::_ops::is_pinned::redispatch(dk, self, device); +} - bool wrapper_DIPU_is_pinned(const at::Tensor& self, c10::optional device) { - const OptionalDeviceGuard device_guard(device_of(self)); - return dnative::is_pinned(self, device); - } +at::Tensor wrapper_BackendSelect__pin_memory(const at::Tensor &self, + c10::optional device) { + TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), + "' only dense CPU tensors can be pinned"); + c10::DispatchKeySet dk = c10::DispatchKeySet(c10::computeDispatchKey( + c10::nullopt, self.layout(), device.value_or(dipu::DIPU_DEVICE_TYPE))); + return at::_ops::_pin_memory::redispatch(dk, self, device); +} - at::Tensor wrapper_DIPU__pin_memory(const at::Tensor& self, c10::optional device) { - const OptionalDeviceGuard device_guard(device_of(self)); - return dnative::_pin_memory(self, device); - } +bool wrapper_DIPU_is_pinned(const at::Tensor &self, + c10::optional device) { + const OptionalDeviceGuard device_guard(device_of(self)); + return dnative::is_pinned(self, device); +} - void wrapper_DIPU__record_stream(at::Tensor & self, at::Stream s) { - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - const OptionalDeviceGuard device_guard(device_of(self)); - dipu::recordStream(self.storage().data_ptr(), dipu::DIPUStream(s)); - } +at::Tensor wrapper_DIPU__pin_memory(const at::Tensor &self, + c10::optional device) { + const OptionalDeviceGuard device_guard(device_of(self)); + return dnative::_pin_memory(self, device); +} -} // end of inner anonymous namespace +void wrapper_DIPU__record_stream(at::Tensor &self, at::Stream s) { + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + const OptionalDeviceGuard device_guard(device_of(self)); + dipu::recordStream(self.storage().data_ptr(), dipu::DIPUStream(s)); +} +} // namespace DIPU_LIBRARY_IMPL(_, DIPU_DEVICE_TYPE_MACRO, m) { - m.fallback(torch::CppFunction::makeFromBoxedFunction<&dipu_fallback>()); + m.fallback(torch::CppFunction::makeFromBoxedFunction<&dipu_fallback>()); } -// Change to use XPU which already register this fallback in ATen/core/VariableFallbackKernel.cpp -// TORCH_LIBRARY_IMPL(_, DIPU_AUTOGRAD_DEVICE_TYPE_MACRO, m) { +// Change to use XPU which already register this fallback in +// ATen/core/VariableFallbackKernel.cpp TORCH_LIBRARY_IMPL(_, +// DIPU_AUTOGRAD_DEVICE_TYPE_MACRO, m) { // 
m.fallback(torch::CppFunction::makeFallthrough()); // } @@ -330,7 +391,8 @@ DIPU_LIBRARY_IMPL(aten, DIPU_DEVICE_TYPE_MACRO, m) { m.impl("empty.memory_format", TORCH_FN(wrapper_DIPU_empty_memory_format)); m.impl("empty_strided", TORCH_FN(wrapper_DIPU_empty_strided)); m.impl("_reshape_alias", TORCH_FN(wrapper_DIPU___reshape_alias)); - m.impl("_copy_from_and_resize", TORCH_FN(wrapper_DIPU___copy_from_and_resize)); + m.impl("_copy_from_and_resize", + TORCH_FN(wrapper_DIPU___copy_from_and_resize)); m.impl("resize_", TORCH_FN(wrapper_resize_)); m.impl("as_strided", TORCH_FN(wrapper_DIPU__as_strided)); m.impl("view", TORCH_FN(wrapper_DIPU__view)); @@ -340,7 +402,8 @@ DIPU_LIBRARY_IMPL(aten, DIPU_DEVICE_TYPE_MACRO, m) { m.impl("unfold", TORCH_FN(wrapper_DIPU__unfold)); m.impl("_local_scalar_dense", TORCH_FN(wrapper_DIPU___local_scalar_dense)); m.impl("set_.source_Storage", TORCH_FN(wrapper_DIPU_source_Storage_set_)); - m.impl("set_.source_Storage_storage_offset", TORCH_FN(wrapper_DIPU_source_Storage_offset_set_)); + m.impl("set_.source_Storage_storage_offset", + TORCH_FN(wrapper_DIPU_source_Storage_offset_set_)); m.impl("set_.source_Tensor", TORCH_FN(wrapper_DIPU_source_Tensor_set_)); m.impl("set_", TORCH_FN(wrapper_DIPU__set_)); m.impl("is_set_to", TORCH_FN(wrapper_DIPU__is_set_to)); @@ -350,21 +413,23 @@ DIPU_LIBRARY_IMPL(aten, DIPU_DEVICE_TYPE_MACRO, m) { } class IgnoreWarningHandler : public c10::WarningHandler { -public: - void process(const c10::Warning& warning) { + public: + void process(const c10::Warning &warning) { // do nothing } }; -c10::WarningHandler* getIgnoreHandler() { +c10::WarningHandler *getIgnoreHandler() { static IgnoreWarningHandler handler_ = IgnoreWarningHandler(); return &handler_; } DIPU_LIBRARY_IMPL(aten, BackendSelect, m) { c10::WarningUtils::WarningHandlerGuard guard(getIgnoreHandler()); - m.impl(TORCH_SELECTIVE_NAME("aten::is_pinned"), TORCH_FN(wrapper_BackendSelect_is_pinned)); - m.impl(TORCH_SELECTIVE_NAME("aten::_pin_memory"), TORCH_FN(wrapper_BackendSelect__pin_memory)); + m.impl(TORCH_SELECTIVE_NAME("aten::is_pinned"), + TORCH_FN(wrapper_BackendSelect_is_pinned)); + m.impl(TORCH_SELECTIVE_NAME("aten::_pin_memory"), + TORCH_FN(wrapper_BackendSelect__pin_memory)); } // override CPU operator @@ -375,4 +440,4 @@ DIPU_LIBRARY_IMPL(aten, CPU, m) { m.impl("empty_strided", TORCH_FN(wrapper_CPU_empty_strided)); } -} //end namespace at +} // end namespace at diff --git a/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.hpp b/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.hpp index 6ca96e8b7..4b4ae2fdd 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.hpp +++ b/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.hpp @@ -3,118 +3,127 @@ #include #include + #include namespace dipu { -bool get_force_fallback(const char* opname); +bool get_force_fallback(const char *opname); }; // namespace dipu namespace at { - void dipu_fallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, - torch::jit::Stack* stack); +void dipu_fallback(const c10::OperatorHandle &op, DispatchKeySet dispatch_keys, + torch::jit::Stack *stack); // Print the warning message only once for one process. 
-#define DIPU_LOG_WARNING_ONCE(x) \
-  do { \
-    static bool should_print = true; \
-    if (should_print) { \
-      std::cout << x; \
-      should_print = false; \
-    } \
-  } while (0)
+#define DIPU_LOG_WARNING_ONCE(x) \
+  do { \
+    static bool should_print = true; \
+    if (should_print) { \
+      std::cout << x; \
+      should_print = false; \
+    } \
+  } while (0)

 // Check the environment variable and call the DIPU_LOG_WARNING_ONCE
-#define DIPU_OP_LOG_WARNING_ONCE(...) \
-  do { \
-    const char* env = std::getenv("DIPU_DUMP_OP_ARGS"); \
-    int env_value = (env != nullptr) ? std::atoi(env) : 0; \
-    if (env_value >= 0) { \
-      DIPU_LOG_WARNING_ONCE(__VA_ARGS__); \
-    } \
-  } while (0)
-
-
-// Temporarily not implement 'sub-dispatch from box' (from torch box func -> ourself unbox func)
-// which described in design doc.
-// because: 1. it need many add type trait code. 2. pytorch seems are sorting out infer and other pre/post code.
-// so we shouldn't created a new preprocess logic?
-//so just do a simple runtime cpu fallback to support diopi func loss
-#define DIOPI_ATEN_FUNC(opname, diopiFunc, wapperFunc) do { \
-    if ((reinterpret_cast<void*>(diopiFunc) != nullptr) && (!dipu::get_force_fallback(opname))) { \
-        m.impl(opname, TORCH_FN(wapperFunc)); \
-    } else { \
-        if ((reinterpret_cast<void*>(diopiFunc) == nullptr)) { \
-            DIPU_OP_LOG_WARNING_ONCE(#diopiFunc << " is not yet implemented, "); \
-        } else { \
-            DIPU_OP_LOG_WARNING_ONCE("force fallback has been set, "); \
-        } \
-        DIPU_OP_LOG_WARNING_ONCE(opname << " will be fallback to cpu" << std::endl); \
-    } \
-} while (false);
-
-#define DIOPI_ATEN_FUNC_CUSTOM_FALLBACK(opname, diopi_func, force_fallback, wapper_func, custom_fallback_func) do { \
-    if ((reinterpret_cast<void*>(diopi_func) != nullptr) && !(force_fallback || dipu::get_force_fallback(opname))) { \
-        m.impl(opname, TORCH_FN(wapper_func)); \
-    } else { \
-        if ((reinterpret_cast<void*>(diopi_func) == nullptr)) { \
-            DIPU_OP_LOG_WARNING_ONCE(#diopi_func << " is not yet implemented, ") ; \
-        } else { \
-            DIPU_OP_LOG_WARNING_ONCE("force fallback has been set, "); \
-        } \
-        DIPU_OP_LOG_WARNING_ONCE(opname << " will be fallback to cpu" << std::endl); \
-        m.impl(opname, TORCH_FN(custom_fallback_func)); \
-    } \
-} while (false);
-
+#define DIPU_OP_LOG_WARNING_ONCE(...) \
+  do { \
+    const char *env = std::getenv("DIPU_DUMP_OP_ARGS"); \
+    int env_value = (env != nullptr) ? std::atoi(env) : 0; \
+    if (env_value >= 0) { \
+      DIPU_LOG_WARNING_ONCE(__VA_ARGS__); \
+    } \
+  } while (0)
+
+// For now we do not implement 'sub-dispatch from box' (from a torch boxed
+// func into our own unboxed func) as described in the design doc, because:
+// 1. it would need a lot of extra type-trait code; 2. pytorch seems to be
+// reworking its inference and other pre/post processing code, so we should
+// not create a new preprocessing logic of our own.
+// Instead, just do a simple runtime cpu fallback to cover missing diopi
+// functions.
+#define DIOPI_ATEN_FUNC(opname, diopiFunc, wapperFunc) \
+  do { \
+    if ((reinterpret_cast<void *>(diopiFunc) != nullptr) && \
+        (!dipu::get_force_fallback(opname))) { \
+      m.impl(opname, TORCH_FN(wapperFunc)); \
+    } else { \
+      if ((reinterpret_cast<void *>(diopiFunc) == nullptr)) { \
+        DIPU_OP_LOG_WARNING_ONCE(#diopiFunc << " is not yet implemented, "); \
+      } else { \
+        DIPU_OP_LOG_WARNING_ONCE("force fallback has been set, "); \
+      } \
+      DIPU_OP_LOG_WARNING_ONCE(opname << " will fall back to cpu" \
+                                      << std::endl); \
+    } \
+  } while (false);
+
+#define DIOPI_ATEN_FUNC_CUSTOM_FALLBACK(opname, diopi_func, force_fallback, \
+                                        wapper_func, custom_fallback_func) \
+  do { \
+    if ((reinterpret_cast<void *>(diopi_func) != nullptr) && \
+        !(force_fallback || dipu::get_force_fallback(opname))) { \
+      m.impl(opname, TORCH_FN(wapper_func)); \
+    } else { \
+      if ((reinterpret_cast<void *>(diopi_func) == nullptr)) { \
+        DIPU_OP_LOG_WARNING_ONCE(#diopi_func << " is not yet implemented, "); \
+      } else { \
+        DIPU_OP_LOG_WARNING_ONCE("force fallback has been set, "); \
+      } \
+      DIPU_OP_LOG_WARNING_ONCE(opname << " will fall back to cpu" \
+                                      << std::endl); \
+      m.impl(opname, TORCH_FN(custom_fallback_func)); \
+    } \
+  } while (false);

 class DIPUOpRegister {
-public:
-  typedef void (*OpRegFunPtr)(torch::Library&);
-private:
-  OpRegFunPtr fun_ptr_;
-  torch::Library lib_;
-  static std::deque<std::tuple<torch::Library*, OpRegFunPtr>> dipuOpRegisterList;
-  static std::mutex mutex_;
-public:
-  DIPUOpRegister(OpRegFunPtr fun_ptr, const char* ns, c10::optional<c10::DispatchKey> key, const char* file, int line): lib_(torch::Library::IMPL, ns, key, file, line), fun_ptr_(fun_ptr) {
-    const char* env = std::getenv("DIPU_IMMEDIATE_REGISTER_OP");
-    if (env != nullptr && std::atoi(env) > 0) {
-      fun_ptr_(lib_);
-    } else {
-      std::lock_guard guard(mutex_);
-      dipuOpRegisterList.push_back(std::make_tuple(&lib_, fun_ptr_));
-    }
+ public:
+  typedef void (*OpRegFunPtr)(torch::Library &);
+
+ private:
+  OpRegFunPtr fun_ptr_;
+  torch::Library lib_;
+  static std::deque<std::tuple<torch::Library *, OpRegFunPtr>>
+      dipuOpRegisterList;
+  static std::mutex mutex_;
+
+ public:
+  DIPUOpRegister(OpRegFunPtr fun_ptr, const char *ns,
+                 c10::optional<c10::DispatchKey> key, const char *file,
+                 int line)
+      : lib_(torch::Library::IMPL, ns, key, file, line), fun_ptr_(fun_ptr) {
+    const char *env = std::getenv("DIPU_IMMEDIATE_REGISTER_OP");
+    if (env != nullptr && std::atoi(env) > 0) {
+      fun_ptr_(lib_);
+    } else {
+      std::lock_guard guard(mutex_);
+      dipuOpRegisterList.push_back(std::make_tuple(&lib_, fun_ptr_));
    }
+  }

-  static void register_op();
+  static void register_op();
 };

-} //end ns at
+} // namespace at

 namespace {

 #define DIPU_LIBRARY_IMPL(ns, k, m) _DIPU_LIBRARY_IMPL(ns, k, m, C10_UID)

-#define _DIPU_LIBRARY_IMPL(ns, k, m, uid) \
-  static void C10_CONCATENATE( \
-      DIPU_LIBRARY_IMPL_init_##ns##_##k##_, uid)(torch::Library&); \
-  static const ::at::DIPUOpRegister C10_CONCATENATE( \
-      DIPU_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)( \
-      c10::guts::if_constexpr( \
-          []() { \
-            return &C10_CONCATENATE( \
-                DIPU_LIBRARY_IMPL_init_##ns##_##k##_, uid); \
-          }, \
-          []() { return [](torch::Library&) -> void {}; }), \
-      #ns, \
-      c10::make_optional(c10::DispatchKey::k), \
-      __FILE__, \
-      __LINE__); \
-  void C10_CONCATENATE( \
-      DIPU_LIBRARY_IMPL_init_##ns##_##k##_, uid)(torch::Library & m)
-
-} // namespace
+#define _DIPU_LIBRARY_IMPL(ns, k, m, uid) \
+  static void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
+                              uid)(torch::Library &); \
+  static const ::at::DIPUOpRegister C10_CONCATENATE( \
+      DIPU_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)( \
+      c10::guts::if_constexpr( \
+          []() { \
+            return &C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
+                                    uid); \
+          }, \
+          []() { return [](torch::Library &) -> void {}; }), \
+      #ns, c10::make_optional(c10::DispatchKey::k), __FILE__, __LINE__); \
+  void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \
+                       uid)(torch::Library & m)
+
+} // namespace
diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/CopyKernel.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/CopyKernel.cpp
index 0ddf576ec..ab4568bb9 100644
--- a/dipu/torch_dipu/csrc_dipu/aten/ops/CopyKernel.cpp
+++ b/dipu/torch_dipu/csrc_dipu/aten/ops/CopyKernel.cpp
@@ -1,188 +1,203 @@
 // Copyright (c) 2023, DeepLink.
+#include
 #include
-#include
+#include
 #include
-#include
-#include
+#include
 #include
-#include
-#include
+#include
+#include
 #include
-#include
 #include
+#include
 #include
+using at::Layout;
 using c10::device_or_default;
+using c10::IntArrayRef;
 using c10::layout_or_default;
 using c10::StorageImpl;
 using c10::TensorImpl;
-using at::Layout;
 using dipu::devapis::deviceId_t;
-using c10::IntArrayRef;

 namespace dipu::native {
-  // need abstract cast strategy before copy, some device(eg camb) not support all types,
-  inline at::Tensor cast2CompatibleDeviceTensor(const at::Tensor& hostTensor) {
-    return hostTensor;
+// We need an abstract cast strategy before copying, because some devices
+// (e.g. camb) do not support all dtypes.
+inline at::Tensor cast2CompatibleDeviceTensor(const at::Tensor &hostTensor) {
+  return hostTensor;
+}
+inline int64_t getCopyBytes(const at::Tensor &dst, const at::Tensor &src) {
+  if (dst.nbytes() !=
+      src.nbytes()) {  // outer byte sizes must match; differing dtypes are
+                       // unsupported
+    TORCH_CHECK(false, "dipu copy with different size is not allowed");
  }
-  inline int64_t getCopyBytes(const at::Tensor& dst, const at::Tensor& src) {
-    if (dst.nbytes() != src.nbytes()) { // outer bytes must same. different type is unsuported
-      TORCH_CHECK(false, "dipu copy with different size is not allowed");
-    }
-    int64_t dstBytes = dst.unsafeGetTensorImpl()->unsafe_storage().nbytes();
-    int64_t srcBytes = src.unsafeGetTensorImpl()->unsafe_storage().nbytes();
-    // a view one + a real stor one is supported
-    return srcBytes < dstBytes ? srcBytes : dstBytes;
+  int64_t dstBytes = dst.unsafeGetTensorImpl()->unsafe_storage().nbytes();
+  int64_t srcBytes = src.unsafeGetTensorImpl()->unsafe_storage().nbytes();
+  // copying between a view tensor and a tensor owning real storage is
+  // supported
+  return srcBytes < dstBytes ? srcBytes : dstBytes;
+}
+
+static void copy_H2D(const at::Tensor &dst, const at::Tensor &src,
+                     bool non_blocking) {
+  int64_t nbytes = getCopyBytes(dst, src);
+  dipu::DIPUStream stream = dipu::getCurrentDIPUStream();
+
+  auto src_cast = cast2CompatibleDeviceTensor(src);
+  void *src_ptr = src_cast.data_ptr();
+  void *dst_ptr = dst.data_ptr();
+
+  MemChecker::instance().check(dst);
+  dipu::devproxy::memCopyH2DAsync(stream.rawstream(), nbytes, dst_ptr, src_ptr);
+  if (!non_blocking) {
+    dipu::devproxy::syncStream(stream.rawstream());
  }
+}

-  static void copy_H2D(const at::Tensor& dst, const at::Tensor& src, bool non_blocking) {
-    int64_t nbytes = getCopyBytes(dst, src);
-    dipu::DIPUStream stream = dipu::getCurrentDIPUStream();
+static void copy_D2H(const at::Tensor &dst, const at::Tensor &src,
+                     bool non_blocking) {
+  int64_t nbytes = getCopyBytes(dst, src);
+  dipu::DIPUStream stream = dipu::getCurrentDIPUStream();

-    auto src_cast = cast2CompatibleDeviceTensor(src);
-    void* src_ptr = src_cast.data_ptr();
-    void* dst_ptr = dst.data_ptr();
+  void *src_ptr = src.data_ptr();
+  void *dst_ptr = dst.data_ptr();

-    MemChecker::instance().check(dst);
-    dipu::devproxy::memCopyH2DAsync(stream.rawstream(), nbytes, dst_ptr, src_ptr);
-    if (!non_blocking) {
-      dipu::devproxy::syncStream(stream.rawstream());
-    }
+  MemChecker::instance().check(src);
+  dipu::devproxy::memCopyD2HAsync(stream.rawstream(), nbytes, dst_ptr, src_ptr);
+  if (!non_blocking) {
+    dipu::devproxy::syncStream(stream.rawstream());
  }
+}

-  static void copy_D2H(const at::Tensor& dst, const at::Tensor& src, bool non_blocking) {
-    int64_t nbytes = getCopyBytes(dst, src);
-    dipu::DIPUStream stream = dipu::getCurrentDIPUStream();
-
-    void* src_ptr = src.data_ptr();
-    void* dst_ptr = dst.data_ptr();
-
-    MemChecker::instance().check(src);
-    dipu::devproxy::memCopyD2HAsync(stream.rawstream(), nbytes, dst_ptr, src_ptr);
-    if (!non_blocking) {
-      dipu::devproxy::syncStream(stream.rawstream());
-    }
+inline bool isDiffStrides(const IntArrayRef stride1,
+                          const IntArrayRef stride2) {
+  if (stride1.size() != stride2.size()) {
+    return true;
  }
-
-  inline bool isDiffStrides(const IntArrayRef stride1, const IntArrayRef stride2) {
-    if (stride1.size() != stride2.size()) {
+  for (auto i = 0; i < stride1.size(); i++) {
+    if (stride1[i] != stride2[i]) {
      return true;
    }
-    for (auto i = 0; i < stride1.size() ; i++ ) {
-      if (stride1[i] != stride2[i]) {
-        return true;
-      }
-    }
+  }
+  return false;
+}
+
+// Cases to consider: 1. expand, 2. partial view, 3. type cast.
+inline bool canDirectCopy(const at::Tensor &dst, const at::Tensor &src) {
+  // We assume the layout is always strided; the Sparse layout is not
+  // supported.
+  TORCH_CHECK(dst.options().layout() == c10::Layout::Strided,
+              "only Strided layout is supported");
+
+  int64_t srcBytes = src.unsafeGetTensorImpl()->unsafe_storage().nbytes();
+  int64_t dstBytes = dst.unsafeGetTensorImpl()->unsafe_storage().nbytes();
+  if (srcBytes != dstBytes || dst.numel() != src.numel() ||
+      dst.options().dtype() != src.options().dtype()) {
    return false;
  }
-
-  // 1. expand, 2. patial view. 3. type cast.
-  inline bool canDirectCopy(const at::Tensor& dst, const at::Tensor& src) {
-    // assume layout always = not suppport Sparse layout
-    TORCH_CHECK(dst.options().layout() == c10::Layout::Strided, "only Strided layout is supported");
-
-    int64_t srcBytes = src.unsafeGetTensorImpl()->unsafe_storage().nbytes();
-    int64_t dstBytes = dst.unsafeGetTensorImpl()->unsafe_storage().nbytes();
-    if (srcBytes != dstBytes || dst.numel() != src.numel() || dst.options().dtype() != src.options().dtype()) {
-      return false;
-    }
-    if (isDiffStrides(dst.strides(), src.strides())) {
-      return false;
-    }
-    // view(with no-zero offset) direct copy may cause err(not sure how long real stor data should be copyed) not supported
-    if (dst.storage_offset() != 0 || src.storage_offset() != 0) {
-      return false;
-    }
-    // even tensors have zero offset and same stride/type cannot do simple safe direct copy
-    // because we cannot simply decide how much data will be copyed from raw stor (unless check stride).
-    // so we always return false now.
-    // need enhance in future, because always copy with the help of cpu is toooo0 slow.
-    // **** check if copy safely using tensor.nbytes() when is_contiguous() = true.
+  if (isDiffStrides(dst.strides(), src.strides())) {
+    return false;
+  }
+  // Direct copy of a view with a non-zero storage offset is not supported:
+  // we cannot tell how much of the underlying storage would have to be
+  // copied.
+  if (dst.storage_offset() != 0 || src.storage_offset() != 0) {
+    return false;
+  }
+  // Even when both tensors have zero offset and the same strides/dtype, a
+  // simple direct copy is not safe, because we cannot easily decide how much
+  // raw storage data to copy (without inspecting strides), so we always
+  // return false for now. This needs improvement in the future, because
+  // always copying via the cpu is far too slow.
+  // TODO: check whether copying tensor.nbytes() bytes is safe when
+  // is_contiguous() == true.
+  return false;
+}
+
-  static void copy_D2D(const at::Tensor& dst, const at::Tensor& src, bool non_blocking) {
-    int64_t nbytes = getCopyBytes(dst, src);
-    dipu::DIPUStream stream = dipu::getCurrentDIPUStream();
-
-    void* src_ptr = src.data_ptr();
-    void* dst_ptr = dst.data_ptr();
-
-    MemChecker::instance().check(src);
-    MemChecker::instance().check(dst);
-    dipu::devproxy::memCopyD2DAsync(stream.rawstream(), nbytes, dst.device().index(), dst_ptr,
-                            src.device().index(), src_ptr);
-    if (!non_blocking) {
-      dipu::devproxy::syncStream(stream.rawstream());
-    }
-  }
+static void copy_D2D(const at::Tensor &dst, const at::Tensor &src,
+                     bool non_blocking) {
+  int64_t nbytes = getCopyBytes(dst, src);
+  dipu::DIPUStream stream = dipu::getCurrentDIPUStream();
+
+  void *src_ptr = src.data_ptr();
+  void *dst_ptr = dst.data_ptr();
+
+  MemChecker::instance().check(src);
+  MemChecker::instance().check(dst);
+  dipu::devproxy::memCopyD2DAsync(stream.rawstream(), nbytes,
+                                  dst.device().index(), dst_ptr,
+                                  src.device().index(), src_ptr);
+  if (!non_blocking) {
+    dipu::devproxy::syncStream(stream.rawstream());
+  }
+}
+
-  inline void doRealCp(at::Tensor& self, const at::Tensor& src, bool non_blocking) {
-    if (dipu::isDeviceTensor(self) && !dipu::isDeviceTensor(src)) {
-      // src is cpu.
-      copy_H2D(self, src, non_blocking);
-    }
-    else if (!dipu::isDeviceTensor(self) && dipu::isDeviceTensor(src)) {
-      // self is cpu.
-      copy_D2H(self, src, non_blocking);
-    }
-    else { // device to device
-      copy_D2D(self, src, non_blocking);
-    }
-  }
+inline void doRealCp(at::Tensor &self, const at::Tensor &src,
+                     bool non_blocking) {
+  if (dipu::isDeviceTensor(self) && !dipu::isDeviceTensor(src)) {
+    // src is cpu.
+    copy_H2D(self, src, non_blocking);
+  } else if (!dipu::isDeviceTensor(self) && dipu::isDeviceTensor(src)) {
+    // self is cpu.
+    copy_D2H(self, src, non_blocking);
+  } else { // device to device
+    copy_D2D(self, src, non_blocking);
+  }
+}
+
-  // self is dest
-  // not handle storage offset, need?
-  at::Tensor& DIPUATenFunctions::copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) {
-    if (self.numel() == 0) {
-      return self;
-    }
-    // save tensor dim name
-    c10::optional<at::DimnameList> names = src.opt_names();
-    if (names.has_value()) {
-      internal_set_names_inplace(self, names);
-    }
-    if (!canDirectCopy(self, src)) {
-      at::Tensor src_cpu = src;
-      // src to cpu
-      if (dipu::isDeviceTensor(src)) {
-        src_cpu = at::empty_strided(src.sizes(), src.strides(),
-                src.options().device(c10::DeviceType::CPU));
-        // src storage size may bigger than src_cpu's if src is a partial view.
-        // but not smaller. because src_cpu use same stride as src.
+// self is the destination.
+// Storage offset is not handled here; do we need to?
+at::Tensor &DIPUATenFunctions::copy_(at::Tensor &self, const at::Tensor &src,
+                                     bool non_blocking) {
+  if (self.numel() == 0) {
+    return self;
+  }
+  // save the tensor dim names
+  c10::optional<at::DimnameList> names = src.opt_names();
+  if (names.has_value()) {
+    internal_set_names_inplace(self, names);
+  }
+  if (!canDirectCopy(self, src)) {
+    at::Tensor src_cpu = src;
+    // src to cpu
+    if (dipu::isDeviceTensor(src)) {
+      src_cpu = at::empty_strided(src.sizes(), src.strides(),
+                                  src.options().device(c10::DeviceType::CPU));
+      // src's storage may be bigger than src_cpu's if src is a partial view,
+      // but never smaller, because src_cpu uses the same strides as src.
+ // src -> src_cpu + doRealCp(src_cpu, src, non_blocking); + } - at::Scalar DIPUATenFunctions::_local_scalar_dense_dipu(const at::Tensor& self) { - at::Scalar r; - AT_DISPATCH_ALL_TYPES_AND2(at::kHalf, at::kBool, self.scalar_type(), "_local_scalar_dense_dipu", [&] { - scalar_t value; - dipu::DIPUStream stream = dipu::getCurrentDIPUStream(); - MemChecker::instance().check(self); - dipu::devproxy::memCopyD2HAsync(stream.rawstream(), sizeof(scalar_t), &value, self.data_ptr()); - dipu::devproxy::syncStream(stream.rawstream()); - r = at::Scalar(value); - }); - return r; + if (dipu::isDeviceTensor(self)) { + at::Tensor dst_cpu = + at::empty_strided(self.sizes(), self.strides(), + self.options().device(c10::DeviceType::CPU)); + doRealCp(dst_cpu, self, non_blocking); + // proxy to cpu to handle different type/view problem + dst_cpu.copy_(src_cpu); + + doRealCp(self, dst_cpu, non_blocking); + } else { // self is cpu + self.copy_(src_cpu); + } + } else { + doRealCp(self, src, non_blocking); } -} \ No newline at end of file + return self; +} + +at::Scalar DIPUATenFunctions::_local_scalar_dense_dipu(const at::Tensor &self) { + at::Scalar r; + AT_DISPATCH_ALL_TYPES_AND2( + at::kHalf, at::kBool, self.scalar_type(), "_local_scalar_dense_dipu", + [&] { + scalar_t value; + dipu::DIPUStream stream = dipu::getCurrentDIPUStream(); + MemChecker::instance().check(self); + dipu::devproxy::memCopyD2HAsync(stream.rawstream(), sizeof(scalar_t), + &value, self.data_ptr()); + dipu::devproxy::syncStream(stream.rawstream()); + r = at::Scalar(value); + }); + return r; +} +} // namespace dipu::native \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp index e3fd9d23f..5e3dadb8f 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp @@ -1,15 +1,15 @@ #pragma once -#include "OpUtils.hpp" -#include #include "csrc_dipu/aten/RegisterDIPU.hpp" +#include +#include "OpUtils.hpp" namespace dipu { namespace native { static c10::optional dipu_to_cpu( - const c10::optional& device_tensor) { + const c10::optional &device_tensor) { c10::optional cpu_tensor = c10::nullopt; if (device_tensor.has_value() && device_tensor.value().defined()) { cpu_tensor = device_tensor.value().cpu(); @@ -17,7 +17,7 @@ static c10::optional dipu_to_cpu( return cpu_tensor; } -static at::Tensor to_cpu_no_half(const at::Tensor& devtensor) { +static at::Tensor to_cpu_no_half(const at::Tensor &devtensor) { auto cpu_tensor = devtensor.cpu(); auto intype = devtensor.options().dtype_opt()->toScalarType(); if (intype == at::ScalarType::Half) { @@ -27,9 +27,10 @@ static at::Tensor to_cpu_no_half(const at::Tensor& devtensor) { } } -static at::Tensor& custom_fallback_dipu_silu_out(const at::Tensor& self, - at::Tensor& out) { - DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=silu_out" << std::endl); +static at::Tensor &custom_fallback_dipu_silu_out(const at::Tensor &self, + at::Tensor &out) { + DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=silu_out" + << std::endl); auto self_cpu = to_cpu_no_half(self); auto out_cpu = to_cpu_no_half(self); out_cpu = at::silu_out(self_cpu, out_cpu); @@ -38,7 +39,7 @@ static at::Tensor& custom_fallback_dipu_silu_out(const at::Tensor& self, } static c10::List> to_cpu( - const c10::List>& indices) { + const c10::List> &indices) { c10::List> indices_cpu; indices_cpu.reserve(indices.size()); // input as x[1:2, [1, 2]], 
Slice by first dimension already executed before @@ -51,11 +52,11 @@ static c10::List> to_cpu( } return indices_cpu; } -static at::Tensor& custom_fallback_dipu_index_tensor_out( - const at::Tensor& self, const c10::List>& indices, - at::Tensor& out) { +static at::Tensor &custom_fallback_dipu_index_tensor_out( + const at::Tensor &self, const c10::List> &indices, + at::Tensor &out) { DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=index.Tensor_out" - << std::endl); + << std::endl); auto indices_cpu = to_cpu(indices); at::Tensor out_cpu = out.cpu(); @@ -64,11 +65,11 @@ static at::Tensor& custom_fallback_dipu_index_tensor_out( return out; } -static at::Tensor& custom_fallback_dipu__index_put_impl_( - at::Tensor& self, const c10::List>& indices, - const at::Tensor& values, bool accumulate, bool unsafe) { +static at::Tensor &custom_fallback_dipu__index_put_impl_( + at::Tensor &self, const c10::List> &indices, + const at::Tensor &values, bool accumulate, bool unsafe) { DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=_index_put_impl_" - << std::endl); + << std::endl); auto indices_cpu = to_cpu(indices); at::Tensor self_cpu = self.cpu(); @@ -79,16 +80,16 @@ static at::Tensor& custom_fallback_dipu__index_put_impl_( return self; } -static ::std::tuple +static ::std::tuple custom_fallback_dipu_native_batch_norm_out( - const at::Tensor& input, const c10::optional& weight_opt, - const c10::optional& bias_opt, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, bool training, - double momentum, double eps, at::Tensor& out, at::Tensor& save_mean, - at::Tensor& save_invstd) { + const at::Tensor &input, const c10::optional &weight_opt, + const c10::optional &bias_opt, + const c10::optional &running_mean_opt, + const c10::optional &running_var_opt, bool training, + double momentum, double eps, at::Tensor &out, at::Tensor &save_mean, + at::Tensor &save_invstd) { DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=native_batch_norm_out" - << std::endl); + << std::endl); at::Tensor input_cpu = input.cpu(); at::Tensor out_cpu = out.cpu(); at::Tensor save_mean_cpu = save_mean.cpu(); @@ -118,12 +119,12 @@ custom_fallback_dipu_native_batch_norm_out( } static at::Tensor custom_fallback_dipu_convolution_overrideable( - const at::Tensor& input, const at::Tensor& weight, - const c10::optional& bias, at::IntArrayRef stride, + const at::Tensor &input, const at::Tensor &weight, + const c10::optional &bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups) { - DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=convolution_overrideable" - << std::endl); + DIPU_OP_LOG_WARNING_ONCE( + "custom fallback to cpu, name=convolution_overrideable" << std::endl); auto input_cpu = input.cpu(); auto weight_cpu = weight.cpu(); auto bias_cpu = dipu_to_cpu(bias); @@ -135,8 +136,8 @@ static at::Tensor custom_fallback_dipu_convolution_overrideable( static std::tuple custom_fallback_dipu_convolution_backward_overrideable( - const at::Tensor& grad_output, const at::Tensor& input, - const at::Tensor& weight, at::IntArrayRef stride, at::IntArrayRef padding, + const at::Tensor &grad_output, const at::Tensor &input, + const at::Tensor &weight, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array output_mask) { DIPU_OP_LOG_WARNING_ONCE( @@ -181,13 +182,13 @@ custom_fallback_dipu_convolution_backward_overrideable( 
static std::tuple custom_fallback_dipu_native_batch_norm( - const at::Tensor& input, const c10::optional& weight_opt, - const c10::optional& bias_opt, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, bool training, + const at::Tensor &input, const c10::optional &weight_opt, + const c10::optional &bias_opt, + const c10::optional &running_mean_opt, + const c10::optional &running_var_opt, bool training, double momentum, double eps) { DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=dipu_native_batch_norm" - << std::endl); + << std::endl); int64_t dim_c = input.size(1); at::TensorOptions options = input.options().dtype(at::kFloat); @@ -215,12 +216,12 @@ custom_fallback_dipu_native_batch_norm( } static std::tuple -custom_fallback_dipu_linear_backward(const at::Tensor& input, - const at::Tensor& grad_output, - const at::Tensor& weight, +custom_fallback_dipu_linear_backward(const at::Tensor &input, + const at::Tensor &grad_output, + const at::Tensor &weight, ::std::array output_mask) { DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=linear_backward" - << std::endl); + << std::endl); auto input_cpu = input.cpu(); auto grad_output_cpu = grad_output.cpu(); auto weight_cpu = weight.cpu(); @@ -266,15 +267,15 @@ custom_fallback_dipu_linear_backward(const at::Tensor& input, static std::tuple custom_fallback_dipu_native_batch_norm_backward( - const at::Tensor& grad_out, const at::Tensor& input, - const c10::optional& weight_opt, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, - const c10::optional& save_invstd_opt, bool train, double eps, + const at::Tensor &grad_out, const at::Tensor &input, + const c10::optional &weight_opt, + const c10::optional &running_mean_opt, + const c10::optional &running_var_opt, + const c10::optional &save_mean_opt, + const c10::optional &save_invstd_opt, bool train, double eps, ::std::array output_mask) { - DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=native_batch_norm_backward" - << std::endl); + DIPU_OP_LOG_WARNING_ONCE( + "custom fallback to cpu, name=native_batch_norm_backward" << std::endl); int64_t dim_c = input.size(1); at::TensorOptions options = input.options().dtype(at::ScalarType::Float); @@ -315,43 +316,45 @@ custom_fallback_dipu_native_batch_norm_backward( return std::tie(grad_input, grad_weight, grad_bias); } -static at::Tensor& custom_fallback_dipu_copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { - DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=copy_" << std::endl); - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - static bool use_slow_copy = (std::getenv("DIPU_USE_SLOW_COPY") != nullptr); - dipu::DIPUGuard guard(self.is_cpu() ? src.device() : self.device()); - if (non_blocking) { - auto stream = dipu::getCurrentDIPUStream(); - const bool is_default_stream = dipu::getDefaultDIPUStream() == stream; - if (self.is_cpu()) { - if (self.options().pinned_memory()) { - self.record_stream(stream); - } - } else if (!is_default_stream){ +static at::Tensor &custom_fallback_dipu_copy_(at::Tensor &self, + const at::Tensor &src, + bool non_blocking) { + DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=copy_" << std::endl); + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + static bool use_slow_copy = (std::getenv("DIPU_USE_SLOW_COPY") != nullptr); + dipu::DIPUGuard guard(self.is_cpu() ? 
src.device() : self.device()); + if (non_blocking) { + auto stream = dipu::getCurrentDIPUStream(); + const bool is_default_stream = dipu::getDefaultDIPUStream() == stream; + if (self.is_cpu()) { + if (self.options().pinned_memory()) { self.record_stream(stream); } - if (src.is_cpu()) { - if (src.options().pinned_memory()) { - src.record_stream(stream); - } - } else if (!is_default_stream) { + } else if (!is_default_stream) { + self.record_stream(stream); + } + if (src.is_cpu()) { + if (src.options().pinned_memory()) { src.record_stream(stream); } - } - if (use_slow_copy) { - return dipu::native::DIPUATenFunctions::copy_(self, src, non_blocking); - } else { - return dipu::getDipuCopyInplace()->run(self, src, non_blocking); + } else if (!is_default_stream) { + src.record_stream(stream); } } + if (use_slow_copy) { + return dipu::native::DIPUATenFunctions::copy_(self, src, non_blocking); + } else { + return dipu::getDipuCopyInplace()->run(self, src, non_blocking); + } +} void custom_fallback_dipu__amp_foreach_non_finite_check_and_unscale_( - at::TensorList scaled_grads, at::Tensor& found_inf, - const at::Tensor& inv_scale); + at::TensorList scaled_grads, at::Tensor &found_inf, + const at::Tensor &inv_scale); -at::Tensor& custom_fallback_dipu__amp_update_scale_(at::Tensor& current_scale, - at::Tensor& growth_tracker, - const at::Tensor& found_inf, +at::Tensor &custom_fallback_dipu__amp_update_scale_(at::Tensor ¤t_scale, + at::Tensor &growth_tracker, + const at::Tensor &found_inf, double growth_factor, double backoff_factor, int64_t growth_interval); diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp index a1983f6ab..da49de419 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp @@ -13,9 +13,9 @@ namespace native { namespace { -void _amp_non_finite_check_and_unscale_(at::Tensor& scaled_grad, - at::Tensor& found_inf, - const at::Tensor& inv_scale) { +void _amp_non_finite_check_and_unscale_(at::Tensor &scaled_grad, + at::Tensor &found_inf, + const at::Tensor &inv_scale) { scaled_grad *= inv_scale.item(); if (!scaled_grad.isfinite().all().item()) { found_inf[0] = 1.f; @@ -37,18 +37,18 @@ void _amp_non_finite_check_and_unscale_(at::Tensor& scaled_grad, // inv_scale The inverse of the scale factor by which scaled_grads are // currently multiplied. 
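//
// A short worked example of the contract above (a sketch added for clarity;
// the concrete scale value is illustrative, not taken from this patch): with
// a loss scale of 65536, inv_scale holds 1.0f / 65536. Each scaled gradient
// is multiplied by inv_scale to recover the true gradient, exactly as the
// per-tensor helper above does with `scaled_grad *= inv_scale.item()`; if
// any resulting element is inf or nan, found_inf is set to 1.f so the caller
// can skip the optimizer step and back off the scale.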
void custom_fallback_dipu__amp_foreach_non_finite_check_and_unscale_( - at::TensorList scaled_grads, at::Tensor& found_inf, - const at::Tensor& inv_scale) { + at::TensorList scaled_grads, at::Tensor &found_inf, + const at::Tensor &inv_scale) { DIPU_OP_LOG_WARNING_ONCE( "custom fallback to separated ops, " "name=_amp_foreach_non_finite_check_and_unscale_" << std::endl); TORCH_CHECK(inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."); TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); - for (const at::Tensor& t : scaled_grads) { + for (const at::Tensor &t : scaled_grads) { // NOLINTNEXTLINE: const_cast here is safe according to pytorch's source // code - _amp_non_finite_check_and_unscale_(const_cast(t), found_inf, + _amp_non_finite_check_and_unscale_(const_cast(t), found_inf, inv_scale); } } @@ -71,14 +71,14 @@ void custom_fallback_dipu__amp_foreach_non_finite_check_and_unscale_( // // Returns: // current_scale -at::Tensor& custom_fallback_dipu__amp_update_scale_(at::Tensor& current_scale, - at::Tensor& growth_tracker, - const at::Tensor& found_inf, +at::Tensor &custom_fallback_dipu__amp_update_scale_(at::Tensor ¤t_scale, + at::Tensor &growth_tracker, + const at::Tensor &found_inf, double growth_factor, double backoff_factor, int64_t growth_interval) { - DIPU_OP_LOG_WARNING_ONCE("custom fallback to separated ops, name=_amp_update_scale_" - << std::endl); + DIPU_OP_LOG_WARNING_ONCE( + "custom fallback to separated ops, name=_amp_update_scale_" << std::endl); TORCH_CHECK(growth_tracker.scalar_type() == at::ScalarType::Int, "growth_tracker must be an int tensor."); TORCH_CHECK(current_scale.scalar_type() == at::ScalarType::Float, diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp index 48d002d75..44c05efcd 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp @@ -25,10 +25,8 @@ namespace autocast { namespace { template -inline bool firstarg_is_eligible( - DeviceType device_type, - const Tensor& arg, - Args... args) { +inline bool firstarg_is_eligible(DeviceType device_type, const Tensor &arg, + Args... args) { return is_eligible(arg, device_type); } @@ -70,8 +68,8 @@ template > { static Ret call(Args... args) { - // DispatchKey::Autocast is not the alias key of all AutocastType as Autograd, - // it's just alias of AutocastCUDA (see c10/core/DispatchKey.h) + // DispatchKey::Autocast is not the alias key of all AutocastType as + // Autograd, it's just alias of AutocastCUDA (see c10/core/DispatchKey.h) c10::impl::ExcludeDispatchKeyGuard no_autocast( get_autocast_dispatch_key_from_device_type(device_type)); return (*F)( diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/EmptyOpsKernel.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/EmptyOpsKernel.cpp index da0b472ae..77fd26fcb 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/EmptyOpsKernel.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/EmptyOpsKernel.cpp @@ -1,80 +1,99 @@ // Copyright (c) 2023, DeepLink. 
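// The hunk below routes pinned-memory allocation through the DIPU host
// allocator. A minimal usage sketch of the behaviour it implements (an
// editorial sketch, assuming a torch_dipu build with these overrides
// registered; the buffer size is illustrative):

#include <ATen/ATen.h>

void pinned_host_buffer_example() {
  // pin_memory(true) makes empty_cpu take GetCPUAllocatorMaybePinned's
  // pinned path, i.e. dipu::getAllocator(at::DeviceType::CPU), so
  // host-to-device copies from this buffer can stay asynchronous.
  at::Tensor host = at::empty(
      {1 << 20}, at::TensorOptions().device(at::kCPU).pinned_memory(true));
  TORCH_CHECK(host.is_pinned(), "expected a pinned host tensor");
}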
#include #include -#include #include +#include #include #include -#include #include +#include +using at::Layout; using c10::device_or_default; using c10::layout_or_default; using c10::StorageImpl; using c10::TensorImpl; -using at::Layout; namespace dipu::native { - static c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { - if (pin_memory) { - return dipu::getAllocator(at::DeviceType::CPU); - } - return c10::GetCPUAllocator(); +static c10::Allocator *GetCPUAllocatorMaybePinned(bool pin_memory) { + if (pin_memory) { + return dipu::getAllocator(at::DeviceType::CPU); } + return c10::GetCPUAllocator(); +} -at::Tensor DIPUATenFunctions::empty(at::IntArrayRef size, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt, c10::optional memory_format_opt) { - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::device_or_default(device_opt).type() == dipu::DIPU_DEVICE_TYPE); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::layout_or_default(layout_opt) == c10::Layout::Strided); +at::Tensor DIPUATenFunctions::empty( + at::IntArrayRef size, c10::optional dtype_opt, + c10::optional layout_opt, c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) { + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::device_or_default(device_opt).type() == + dipu::DIPU_DEVICE_TYPE); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::layout_or_default(layout_opt) == + c10::Layout::Strided); - c10::Allocator *allocator = dipu::getAllocator(dipu::DIPU_DEVICE_TYPE); - constexpr c10::DispatchKeySet dipu_ks({dipu::DIPU_DISPATCH_KEY}); - return at::detail::empty_generic(size, allocator, dipu_ks, c10::dtype_or_default(dtype_opt), memory_format_opt); + c10::Allocator *allocator = dipu::getAllocator(dipu::DIPU_DEVICE_TYPE); + constexpr c10::DispatchKeySet dipu_ks({dipu::DIPU_DISPATCH_KEY}); + return at::detail::empty_generic(size, allocator, dipu_ks, + c10::dtype_or_default(dtype_opt), + memory_format_opt); } - at::Tensor DIPUATenFunctions::empty_cpu(at::IntArrayRef size, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt, c10::optional memory_format_opt) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::device_or_default(device_opt).type() == c10::DeviceType::CPU); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::layout_or_default(layout_opt) == c10::Layout::Strided); +at::Tensor DIPUATenFunctions::empty_cpu( + at::IntArrayRef size, c10::optional dtype_opt, + c10::optional layout_opt, c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::device_or_default(device_opt).type() == + c10::DeviceType::CPU); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::layout_or_default(layout_opt) == + c10::Layout::Strided); - auto pin_memory = c10::pinned_memory_or_default(pin_memory_opt); - auto dtype = c10::dtype_or_default(dtype_opt); - auto allocator = GetCPUAllocatorMaybePinned(pin_memory); - constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); - return at::detail::empty_generic(size, allocator, cpu_ks, dtype, memory_format_opt); - } + auto pin_memory = c10::pinned_memory_or_default(pin_memory_opt); + auto dtype = c10::dtype_or_default(dtype_opt); + auto allocator = GetCPUAllocatorMaybePinned(pin_memory); + constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); + return at::detail::empty_generic(size, allocator, cpu_ks, dtype, + 
memory_format_opt); +} - // use empty_generic, test - at::Tensor DIPUATenFunctions::empty_strided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { - dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); - auto device = c10::device_or_default(device_opt); - AT_ASSERT(device.type() == dipu::DIPU_DEVICE_TYPE); - AT_ASSERT(layout_or_default(layout_opt) == Layout::Strided); - auto dtype = dtype_or_default(dtype_opt); +// use empty_generic, test +at::Tensor DIPUATenFunctions::empty_strided( + at::IntArrayRef size, at::IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, c10::optional device_opt, + c10::optional pin_memory_opt) { + dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); + auto device = c10::device_or_default(device_opt); + AT_ASSERT(device.type() == dipu::DIPU_DEVICE_TYPE); + AT_ASSERT(layout_or_default(layout_opt) == Layout::Strided); + auto dtype = dtype_or_default(dtype_opt); - c10::Allocator *allocator = dipu::getAllocator(dipu::DIPU_DEVICE_TYPE); - constexpr c10::DispatchKeySet dipu_ks({dipu::DIPU_DISPATCH_KEY}); - return at::detail::empty_strided_generic(size, stride, allocator, dipu_ks, dtype); - } + c10::Allocator *allocator = dipu::getAllocator(dipu::DIPU_DEVICE_TYPE); + constexpr c10::DispatchKeySet dipu_ks({dipu::DIPU_DISPATCH_KEY}); + return at::detail::empty_strided_generic(size, stride, allocator, dipu_ks, + dtype); +} - at::Tensor DIPUATenFunctions::empty_strided_cpu(at::IntArrayRef size, at::IntArrayRef stride, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::device_or_default(device_opt).type() == c10::DeviceType::CPU); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::layout_or_default(layout_opt) == c10::Layout::Strided); +at::Tensor DIPUATenFunctions::empty_strided_cpu( + at::IntArrayRef size, at::IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, c10::optional device_opt, + c10::optional pin_memory_opt) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::device_or_default(device_opt).type() == + c10::DeviceType::CPU); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::layout_or_default(layout_opt) == + c10::Layout::Strided); - auto pin_memory = c10::pinned_memory_or_default(pin_memory_opt); - auto dtype = c10::dtype_or_default(dtype_opt); - auto allocator = GetCPUAllocatorMaybePinned(pin_memory); - constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); - return at::detail::empty_strided_generic(size, stride, allocator, cpu_ks, dtype); - } + auto pin_memory = c10::pinned_memory_or_default(pin_memory_opt); + auto dtype = c10::dtype_or_default(dtype_opt); + auto allocator = GetCPUAllocatorMaybePinned(pin_memory); + constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); + return at::detail::empty_strided_generic(size, stride, allocator, cpu_ks, + dtype); +} -} //end ns dipu::native \ No newline at end of file +} // namespace dipu::native \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp index 74e31d7e3..5be79e6e4 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp @@ -3,194 +3,207 @@ namespace dipu::native { inline bool checkDiopiReturnValue() { - static bool enable = std::getenv("DIPU_DISABLE_CHECK_DIOPI_RETURN_VALUE") == nullptr; - return enable; + static bool 
enable = + std::getenv("DIPU_DISABLE_CHECK_DIOPI_RETURN_VALUE") == nullptr; + return enable; } inline bool checkTensorDevice() { - static bool enable = []() { - const char* env_ptr = std::getenv("DIPU_CHECK_TENSOR_DEVICE"); - if (env_ptr == nullptr) { - return false; - } - return std::atoi(env_ptr) > 0 ? true : false; - }(); - return enable; + static bool enable = []() { + const char *env_ptr = std::getenv("DIPU_CHECK_TENSOR_DEVICE"); + if (env_ptr == nullptr) { + return false; + } + return std::atoi(env_ptr) > 0 ? true : false; + }(); + return enable; } inline void synchronizeIfEnable() { - static const char* mode = std::getenv("DIPU_SYNC_EXEC_MODE"); - if (mode != nullptr) { - DIPU_LOG_ONCE << "The synchronous operation is performed after " - <<"the diopi function call because the DIPU_SYNC_EXEC_MODE environment variable is set" << std::endl; - dipu::getCurrentDIPUStream().synchronize(); - } - return; + static const char *mode = std::getenv("DIPU_SYNC_EXEC_MODE"); + if (mode != nullptr) { + DIPU_LOG_ONCE << "The synchronous operation is performed after " + << "the diopi function call because the DIPU_SYNC_EXEC_MODE " + "environment variable is set" + << std::endl; + dipu::getCurrentDIPUStream().synchronize(); + } + return; } inline int dumpOpArgLevel() { - const char* env_ptr = std::getenv("DIPU_DUMP_OP_ARGS"); - int level = env_ptr ? std::atoi(env_ptr) : 0; - return level; -} - -template -static std::string dumpArg(const T& t) { - std::stringstream stream; - stream << t; - return stream.str(); -} - -template -static std::string dumpArg(const c10::optional & opt_t) { - std::stringstream stream; - if (opt_t.has_value()) { - stream << dumpArg(opt_t.value()); + const char *env_ptr = std::getenv("DIPU_DUMP_OP_ARGS"); + int level = env_ptr ? std::atoi(env_ptr) : 0; + return level; +} + +template +static std::string dumpArg(const T &t) { + std::stringstream stream; + stream << t; + return stream.str(); +} + +template +static std::string dumpArg(const c10::optional &opt_t) { + std::stringstream stream; + if (opt_t.has_value()) { + stream << dumpArg(opt_t.value()); + } + return stream.str(); +} + +template +static std::string dumpArg(const c10::OptionalArrayRef &opt_t) { + std::stringstream stream; + if (opt_t.has_value()) { + stream << dumpArg(opt_t.value()); + } + return stream.str(); +} + +template class container> +static std::string dumpArg(const container &t) { + std::stringstream stream; + for (auto iter = t.begin(); iter != t.end(); ++iter) { + stream << dumpArg(*iter) << ", "; + } + return stream.str(); +} + +template <> +std::string dumpArg(const at::Tensor &tensor) { + std::stringstream stream; + if (tensor.defined()) { + stream << "numel: " << tensor.numel() << ",sizes: " << tensor.sizes() + << ", stride: " << tensor.strides() + << ", is_view: " << tensor.is_view() << ", dtype: " << tensor.dtype() + << ", device:" << tensor.device() << ", layout:" << tensor.layout() + << ", requires_grad: " << (tensor.requires_grad() ? "true" : "false") + << ", pinned_memory: " << (tensor.is_pinned() ? 
"true" : "false") + << ", memory_format: " << tensor.suggest_memory_format() + << ", data_ptr: " << tensor.data_ptr(); + if (dumpOpArgLevel() > 2) { + stream << std::endl << tensor; } - return stream.str(); + } else { + stream << "undefined"; + } + return stream.str(); } -template -static std::string dumpArg(const c10::OptionalArrayRef & opt_t) { - std::stringstream stream; - if (opt_t.has_value()) { - stream << dumpArg(opt_t.value()); - } - return stream.str(); +template <> +std::string dumpArg(const at::Scalar &scalar) { + std::stringstream stream; + stream << scalar; + return stream.str(); } -template class container> -static std::string dumpArg(const container & t) { - std::stringstream stream; - for (auto iter = t.begin(); iter != t.end(); ++iter) { - stream << dumpArg(*iter) << ", "; - } - return stream.str(); -} - -template<> -std::string dumpArg(const at::Tensor& tensor) { - std::stringstream stream; - if (tensor.defined()) { - stream << "numel: " << tensor.numel() << ",sizes: " << tensor.sizes() << ", stride: " << tensor.strides() << ", is_view: " << tensor.is_view() << ", dtype: " << tensor.dtype() - << ", device:" << tensor.device() << ", layout:" << tensor.layout() << ", requires_grad: " << (tensor.requires_grad() ? "true" : "false") << ", pinned_memory: " << (tensor.is_pinned() ? "true" : "false") - << ", memory_format: " << tensor.suggest_memory_format() << ", data_ptr: " << tensor.data_ptr(); - if (dumpOpArgLevel() > 2) { - stream << std::endl << tensor; - } - } else { - stream << "undefined"; - } - return stream.str(); +template <> +std::string dumpArg(const c10::string_view &str) { + return dumpArg(std::string(str.data())); } -template<> -std::string dumpArg(const at::Scalar& scalar) { - std::stringstream stream; - stream << scalar; - return stream.str(); +template <> +std::string dumpArg(const at::Generator &generator) { + return ""; } -template<> -std::string dumpArg(const c10::string_view& str) { - return dumpArg(std::string(str.data())); +template +static std::string dumpArg(const std::array &t) { + std::stringstream stream; + for (auto iter = t.begin(); iter != t.end(); ++iter) { + stream << dumpArg(*iter) << " "; + } + return stream.str(); } -template<> -std::string dumpArg(const at::Generator& generator) { - return ""; -} - -template -static std::string dumpArg(const std::array& t) { - std::stringstream stream; - for (auto iter = t.begin(); iter != t.end(); ++iter) { - stream << dumpArg(*iter) << " "; +template <> +std::string dumpArg(const c10::List> &t) { + std::stringstream stream; + stream << "size:" << t.size() << std::endl; + for (int i = 0; i < t.size(); ++i) { + bool has_value = t[i].has_value(); + stream << "\t" << i << "th: has_value:" << has_value << " "; + if (has_value) { + stream << dumpArg(t[i].value()); } - return stream.str(); -} - -template<> -std::string dumpArg(const c10::List>& t) { - std::stringstream stream; - stream << "size:" << t.size() << std::endl; - for (int i = 0; i < t.size(); ++i) { - bool has_value = t[i].has_value(); - stream << "\t" << i << "th: has_value:" << has_value << " "; - if (has_value) { - stream << dumpArg(t[i].value()); - } - stream << std::endl; + stream << std::endl; + } + return stream.str(); +} + +template class container1, + template class container2> +static std::vector infer_reduce_op_shape( + const container1 &input_shape, const container2 &dims, + bool keepdim) { + if (dims.size() <= 0) { + return std::vector(); + } + if (keepdim) { + std::vector output_shape(input_shape.begin(), input_shape.end()); + for (auto 
iter = dims.begin(); iter != dims.end(); ++iter) { + auto dim = *iter; + dim += dim < 0 ? input_shape.size() : 0; + output_shape[dim] = 1; } - return stream.str(); -} - -template class container1, template class container2> -static std::vector infer_reduce_op_shape(const container1 & input_shape, const container2 & dims, bool keepdim) { - if (dims.size() <= 0) { - return std::vector(); - } - if (keepdim) { - std::vector output_shape(input_shape.begin(), input_shape.end()); - for (auto iter = dims.begin(); iter != dims.end(); ++iter) { - auto dim = *iter; - dim += dim < 0 ? input_shape.size() : 0; - output_shape[dim] = 1; - } - return output_shape; - } else { - std::vector output_shape; - output_shape.reserve(input_shape.size() - dims.size()); - for (int i = 0; i < input_shape.size(); ++i) { - bool reduce_dim = false; - for (auto iter = dims.begin(); iter != dims.end(); ++iter) { - auto dim = *iter; - dim += dim < 0 ? input_shape.size() : 0; - if (dim == i) { - reduce_dim = true; - break; - } - } - if (reduce_dim == false) { - output_shape.push_back(input_shape.at(i)); - } - } - return output_shape; - } -} - -static std::string _allclose(const at::Tensor& a, const at::Tensor& b) { - if(a.defined() && b.defined()) { - try { - if(at::allclose(a.cpu(), b.cpu(), 1e-4, 1e-5, true)) { - return "allclose"; - } else { - auto diff = at::abs(a.cpu() - b.cpu()); - auto mae = diff.mean().item(); - auto max_diff = diff.max().item(); - return "not_close, max diff: " + std::to_string(max_diff) + ", MAE: " + std::to_string(mae); - } - } catch (...) { - return "compare_error: not_close"; - } - } else { - if(a.defined() != b.defined()) { - return "not_close, one of tensor inputs is empty"; - } else { - return "allclose"; + return output_shape; + } else { + std::vector output_shape; + output_shape.reserve(input_shape.size() - dims.size()); + for (int i = 0; i < input_shape.size(); ++i) { + bool reduce_dim = false; + for (auto iter = dims.begin(); iter != dims.end(); ++iter) { + auto dim = *iter; + dim += dim < 0 ? input_shape.size() : 0; + if (dim == i) { + reduce_dim = true; + break; } + } + if (reduce_dim == false) { + output_shape.push_back(input_shape.at(i)); + } } -} - -static std::string _allclose(const c10::ArrayRef& a, const c10::ArrayRef& b) { - if (a.size() != b.size()) { - return "not_allclose:"; + return output_shape; + } +} + +static std::string _allclose(const at::Tensor &a, const at::Tensor &b) { + if (a.defined() && b.defined()) { + try { + if (at::allclose(a.cpu(), b.cpu(), 1e-4, 1e-5, true)) { + return "allclose"; + } else { + auto diff = at::abs(a.cpu() - b.cpu()); + auto mae = diff.mean().item(); + auto max_diff = diff.max().item(); + return "not_close, max diff: " + std::to_string(max_diff) + + ", MAE: " + std::to_string(mae); + } + } catch (...) 
{ + return "compare_error: not_close"; } - std::string result; - for (size_t i = 0; i < a.size(); ++i) { - result += std::to_string(i) + "th " + _allclose(a[i], b[i]) + "; "; + } else { + if (a.defined() != b.defined()) { + return "not_close, one of tensor inputs is empty"; + } else { + return "allclose"; } - return result; + } +} + +static std::string _allclose(const c10::ArrayRef &a, + const c10::ArrayRef &b) { + if (a.size() != b.size()) { + return "not_allclose:"; + } + std::string result; + for (size_t i = 0; i < a.size(); ++i) { + result += std::to_string(i) + "th " + _allclose(a[i], b[i]) + "; "; + } + return result; } } // namespace dipu::native \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/PinMemoryKernel.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/PinMemoryKernel.cpp index 9106fa791..4fea19446 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/PinMemoryKernel.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/PinMemoryKernel.cpp @@ -10,7 +10,8 @@ namespace dipu::native { -bool DIPUATenFunctions::is_pinned(const at::Tensor& self, c10::optional device) { +bool DIPUATenFunctions::is_pinned(const at::Tensor &self, + c10::optional device) { // Only CPU tensors can be pinned if (!self.is_cpu()) { return false; @@ -21,14 +22,16 @@ bool DIPUATenFunctions::is_pinned(const at::Tensor& self, c10::optional device) { +at::Tensor DIPUATenFunctions::_pin_memory(const at::Tensor &self, + c10::optional device) { auto allocator = dipu::getAllocator(at::DeviceType::CPU); - auto storage = c10::Storage( - c10::Storage::use_byte_size_t(), - at::detail::computeStorageNbytes(self.sizes(), self.strides(), self.dtype().itemsize()), - allocator, - false); - auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides()); + auto storage = + c10::Storage(c10::Storage::use_byte_size_t(), + at::detail::computeStorageNbytes( + self.sizes(), self.strides(), self.dtype().itemsize()), + allocator, false); + auto tensor = at::cpu::empty({0}, self.options()) + .set_(storage, 0, self.sizes(), self.strides()); tensor.copy_(self); return tensor; } diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/StorageShapeKernel.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/StorageShapeKernel.cpp index b85dfe715..90ad4ad02 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/StorageShapeKernel.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/StorageShapeKernel.cpp @@ -1,116 +1,123 @@ // Copyright (c) 2023, DeepLink. 
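// The hunk below implements storage resizing for DIPU tensors:
// resize_bytes_dipu allocates the new storage and first copies over
// min(old, new) bytes. A minimal sketch of the resulting resize_ semantics
// (an editorial sketch, assuming a torch_dipu build where the DIPU device is
// exposed as XPU per basedef.h; the sizes are illustrative):

#include <ATen/ATen.h>

void resize_keeps_old_bytes_example() {
  at::Tensor t = at::arange(4, at::TensorOptions().device(at::kXPU));
  // Growing past the current storage reallocates it; the first 4 elements
  // survive the device-to-device copy, the remaining 4 are uninitialized.
  t.resize_({8});
}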
#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include #include +#include #include -#include #include +#include +using at::IntArrayRef; +using at::Layout; using c10::device_or_default; using c10::layout_or_default; +using c10::MemoryFormat; using c10::StorageImpl; using c10::TensorImpl; -using c10::MemoryFormat; -using at::Layout; -using at::IntArrayRef; using dipu::devproxy::current_device; namespace dipu::native { - void DIPUATenFunctions::resize_bytes_dipu(StorageImpl* storage, size_t newsize_bytes) { - TORCH_CHECK(storage->resizable(), "Trying to resize dipu storage that is not resizable"); - auto allocator = storage->allocator(); - TORCH_CHECK(allocator != nullptr, "Trying to resize dipu storage without an allocator"); +void DIPUATenFunctions::resize_bytes_dipu(StorageImpl *storage, + size_t newsize_bytes) { + TORCH_CHECK(storage->resizable(), + "Trying to resize dipu storage that is not resizable"); + auto allocator = storage->allocator(); + TORCH_CHECK(allocator != nullptr, + "Trying to resize dipu storage without an allocator"); - auto device = current_device(); - dipu::DIPUStream stream = dipu::getCurrentDIPUStream(); - if (newsize_bytes == 0) { - storage->set_data_ptr_noswap(at::DataPtr(nullptr, at::Device(dipu::DIPU_DEVICE_TYPE, device))); - storage->set_nbytes(0); - return; - } - size_t nbytes = std::min(storage->nbytes(), newsize_bytes); - at::DataPtr data = allocator->allocate(newsize_bytes); // alloc new - if (storage->data_ptr()) { // copy old to new - MemChecker::instance().check(data.get()); - MemChecker::instance().check(storage->data()); - if (storage->data() != nullptr) { - dipu::devproxy::memCopyD2DAsync(stream.rawstream(), nbytes, device, data.get(), - device, storage->data()); - } + auto device = current_device(); + dipu::DIPUStream stream = dipu::getCurrentDIPUStream(); + if (newsize_bytes == 0) { + storage->set_data_ptr_noswap( + at::DataPtr(nullptr, at::Device(dipu::DIPU_DEVICE_TYPE, device))); + storage->set_nbytes(0); + return; + } + size_t nbytes = std::min(storage->nbytes(), newsize_bytes); + at::DataPtr data = allocator->allocate(newsize_bytes); // alloc new + if (storage->data_ptr()) { // copy old to new + MemChecker::instance().check(data.get()); + MemChecker::instance().check(storage->data()); + if (storage->data() != nullptr) { + dipu::devproxy::memCopyD2DAsync(stream.rawstream(), nbytes, device, + data.get(), device, storage->data()); } - // Destructively overwrite data_ptr - storage->set_data_ptr_noswap(std::move(data)); - storage->set_nbytes(newsize_bytes); } + // Destructively overwrite data_ptr + storage->set_data_ptr_noswap(std::move(data)); + storage->set_nbytes(newsize_bytes); +} - static inline TensorImpl* _resize_impl_dipu_(TensorImpl* self, - IntArrayRef size, at::OptionalIntArrayRef stride) { - if (self->sizes() == size && (!stride || self->strides() == stride)) { - return self; - } - // need add guard to support device change. 
-    const auto itemsize = self->dtype().itemsize();
-    const auto storage_offset = self->storage_offset();
-    size_t new_storage_size = 1;
-    if (stride) {
-      self->set_sizes_and_strides(size, *stride);
-      new_storage_size = at::detail::computeStorageNbytes(
-          size, *stride, itemsize, storage_offset);
-    } else {
-      self->set_sizes_contiguous(size);
-      new_storage_size = at::detail::computeStorageNbytesContiguous(
-          size, itemsize, storage_offset);
-    }
-    const c10::Storage& storage = self->unsafe_storage();
-    TORCH_CHECK(storage, "Tensor: invalid null storage");
-    if (self->numel() > 0 && new_storage_size > storage.nbytes()) {
-      DIPUATenFunctions::resize_bytes_dipu(storage.unsafeGetStorageImpl(), new_storage_size);
-    }
+static inline TensorImpl *_resize_impl_dipu_(TensorImpl *self, IntArrayRef size,
+                                             at::OptionalIntArrayRef stride) {
+  if (self->sizes() == size && (!stride || self->strides() == stride)) {
    return self;
  }
+  // A device guard is needed here to support device changes.
+  const auto itemsize = self->dtype().itemsize();
+  const auto storage_offset = self->storage_offset();
+  size_t new_storage_size = 1;
+  if (stride) {
+    self->set_sizes_and_strides(size, *stride);
+    new_storage_size = at::detail::computeStorageNbytes(size, *stride, itemsize,
+                                                        storage_offset);
+  } else {
+    self->set_sizes_contiguous(size);
+    new_storage_size = at::detail::computeStorageNbytesContiguous(
+        size, itemsize, storage_offset);
+  }
+  const c10::Storage &storage = self->unsafe_storage();
+  TORCH_CHECK(storage, "Tensor: invalid null storage");
+  if (self->numel() > 0 && new_storage_size > storage.nbytes()) {
+    DIPUATenFunctions::resize_bytes_dipu(storage.unsafeGetStorageImpl(),
+                                         new_storage_size);
+  }
+  return self;
+}

-  const at::Tensor& DIPUATenFunctions::resize_(const at::Tensor& self, at::IntArrayRef size, c10::optional<at::MemoryFormat> optional_memory_format) {
-    if(self.has_names()) {
-      return at::native::resize_named_tensor_(self, size, optional_memory_format);
-    }
-    auto* self_ = self.unsafeGetTensorImpl();
-    // not support stride now
-    _resize_impl_dipu_(self_, size, /*strides=*/c10::nullopt);
-    if(optional_memory_format.has_value()) {
-      auto memory_format =
-          optional_memory_format.value();
-      TORCH_CHECK(
-          memory_format != MemoryFormat::Preserve,
-          "Unsupported memory format",
-          memory_format);
-      self_->empty_tensor_restride(memory_format);
-    }
-    return self;
+const at::Tensor &DIPUATenFunctions::resize_(
+    const at::Tensor &self, at::IntArrayRef size,
+    c10::optional<at::MemoryFormat> optional_memory_format) {
+  if (self.has_names()) {
+    return at::native::resize_named_tensor_(self, size, optional_memory_format);
  }
+  auto *self_ = self.unsafeGetTensorImpl();
+  // Strides are not supported yet.
+  _resize_impl_dipu_(self_, size, /*strides=*/c10::nullopt);
+  if (optional_memory_format.has_value()) {
+    auto memory_format = optional_memory_format.value();
+    TORCH_CHECK(memory_format != MemoryFormat::Preserve,
+                "Unsupported memory format", memory_format);
+    self_->empty_tensor_restride(memory_format);
+  }
+  return self;
+}

-  at::Tensor& DIPUATenFunctions::set_storage_dipu_(at::Tensor& result, c10::Storage storage,
-                          int64_t storage_offset, at::IntArrayRef size,
-                          at::IntArrayRef stride) {
-    at::native::checkSetStorage(result, storage, storage_offset, size, stride);
+at::Tensor &DIPUATenFunctions::set_storage_dipu_(at::Tensor &result,
+                                                 c10::Storage storage,
+                                                 int64_t storage_offset,
+                                                 at::IntArrayRef size,
+                                                 at::IntArrayRef stride) {
+  at::native::checkSetStorage(result, storage, storage_offset, size, stride);

-    result.unsafeGetTensorImpl()->set_storage_offset(storage_offset);
-    at::OptionalIntArrayRef stride_opt = stride.data() != nullptr ?
-        at::OptionalIntArrayRef(stride) : c10::nullopt;
-    _resize_impl_dipu_(result.unsafeGetTensorImpl(), size, stride_opt);
-    return result;
-  }
+  result.unsafeGetTensorImpl()->set_storage_offset(storage_offset);
+  at::OptionalIntArrayRef stride_opt =
+      stride.data() != nullptr ? at::OptionalIntArrayRef(stride) : c10::nullopt;
+  _resize_impl_dipu_(result.unsafeGetTensorImpl(), size, stride_opt);
+  return result;
+}

-  at::Tensor& DIPUATenFunctions::set_dipu_(at::Tensor& result) {
-    caffe2::TypeMeta dtype = result.dtype();
-    c10::Storage storage(c10::Storage::use_byte_size_t(), 0, dipu::getAllocator(dipu::DIPU_DEVICE_TYPE), true);
-    DIPUATenFunctions::set_storage_dipu_(result, storage, 0, {0}, {});
-    TORCH_INTERNAL_ASSERT(dtype == result.dtype());
-    return result;
-  }
-}
\ No newline at end of file
+at::Tensor &DIPUATenFunctions::set_dipu_(at::Tensor &result) {
+  caffe2::TypeMeta dtype = result.dtype();
+  c10::Storage storage(c10::Storage::use_byte_size_t(), 0,
+                       dipu::getAllocator(dipu::DIPU_DEVICE_TYPE), true);
+  DIPUATenFunctions::set_storage_dipu_(result, storage, 0, {0}, {});
+  TORCH_INTERNAL_ASSERT(dtype == result.dtype());
+  return result;
+}
+}  // namespace dipu::native
\ No newline at end of file
diff --git a/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp b/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp
index 4ee93ecfc..415427bd5 100644
--- a/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp
+++ b/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp
@@ -1,21 +1,22 @@
 #include "DIPUGlobals.h"
-#include "csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h"
+
+#include
+#include
+#include
+
+#include "csrc_dipu/aten/RegisterDIPU.hpp"
 #include "csrc_dipu/runtime/core/DIPUEventPool.h"
 #include "csrc_dipu/runtime/core/DIPUGeneratorImpl.h"
-#include "csrc_dipu/aten/RegisterDIPU.hpp"
-#include
-#include
-#include
+#include "csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h"

 namespace dipu {

-const char* getDipuCommitId() {
-  return DIPU_GIT_HASH;
-}
+const char *getDipuCommitId() { return DIPU_GIT_HASH; }

 static void printPromptAtStartup() {
   auto time = std::time(nullptr);
   std::string time_str = std::ctime(&time);
-  std::cout << time_str.substr(0, time_str.size() - 1) << " dipu | git hash:" << getDipuCommitId() << std::endl;
+  std::cout << time_str.substr(0, time_str.size() - 1)
+            << " dipu | git hash:" << getDipuCommitId() << std::endl;
 }

 static void initResourceImpl() {
@@ -44,27 +45,22 @@ static void releaseAllResourcesImpl() {
 }

 namespace {
- class DIPUIniter {
-  public:
-   DIPUIniter() {
-     initResourceImpl();
-   }
+class DIPUIniter {
+ public:
+  DIPUIniter() { initResourceImpl(); }

-   ~DIPUIniter() {
-     releaseAllResourcesImpl();
-   }
- };
-} // namespace
+  ~DIPUIniter() { releaseAllResourcesImpl(); }
+};
+}  // namespace

 void initResource() {
-  initResourceImpl();
-  /* In some cases(eg: spawn process), the resource cleanup function we registered will not be executed,
-     so we use the destructor of the static variable in the function here just in case. */
-  static DIPUIniter initer;
+  initResourceImpl();
+  /* In some cases (e.g. spawned processes), the resource cleanup function we
+     registered will not be executed, so we use the destructor of the static
+     variable in this function just in case. */
+  static DIPUIniter initer;
 }

-void releaseAllResources() {
-  releaseAllResourcesImpl();
-}
+void releaseAllResources() { releaseAllResourcesImpl(); }

-} // namespace dipu
\ No newline at end of file
+}  // namespace dipu
\ No newline at end of file
diff --git a/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.h b/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.h
index ae487a6d7..fa12d9f1c 100644
--- a/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.h
+++ b/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.h
@@ -5,4 +5,4 @@ void initResource();
 void releaseAllResources();

-};
\ No newline at end of file
+};  // namespace dipu
\ No newline at end of file
diff --git a/dipu/torch_dipu/csrc_dipu/base/basedef.h b/dipu/torch_dipu/csrc_dipu/base/basedef.h
index f88eb5c22..3ce26421c 100644
--- a/dipu/torch_dipu/csrc_dipu/base/basedef.h
+++ b/dipu/torch_dipu/csrc_dipu/base/basedef.h
@@ -3,28 +3,33 @@
 #include
 #include

-#include
+#include

 #define C10_COMPILE_TIME_MAX_DIPUS 16

 #define DIPU_DEVICE_TYPE_MACRO XPU
-#define DIPU_AUTOGRAD_DEVICE_TYPE_MACRO C10_CONCATENATE(Autograd, DIPU_DEVICE_TYPE_MACRO)
-#define DIPU_AUTOCAST_DEVICE_TYPE_MACRO C10_CONCATENATE(Autocast, DIPU_DEVICE_TYPE_MACRO)
+#define DIPU_AUTOGRAD_DEVICE_TYPE_MACRO \
+  C10_CONCATENATE(Autograd, DIPU_DEVICE_TYPE_MACRO)
+#define DIPU_AUTOCAST_DEVICE_TYPE_MACRO \
+  C10_CONCATENATE(Autocast, DIPU_DEVICE_TYPE_MACRO)

 // to do: abstract a layer which does not depend on pytorch
 namespace dipu {
-// XPU is originally intel output-of-tree code https://github.com/intel/intel-extension-for-pytorch ( branch xpu-master )
-// we use this type but PrivateUse1 not to impersonate our DIPU device. because compared with PrivateUse1,
-// XPU has richer support in pytorch trunk and not too much feature in torch to interfere our logic (as XLA).
+// XPU is originally Intel's out-of-tree code,
+// https://github.com/intel/intel-extension-for-pytorch (branch xpu-master).
+// We use this type, rather than PrivateUse1, to impersonate our DIPU device
+// because, compared with PrivateUse1, XPU has richer support in the pytorch
+// trunk and not too many features in torch that interfere with our logic
+// (like XLA).
 const auto DIPU_DEVICE_TYPE = at::DeviceType::DIPU_DEVICE_TYPE_MACRO;

 const auto DIPU_DISPATCH_KEY = c10::DispatchKey::DIPU_DEVICE_TYPE_MACRO;
-const auto DIPU_DISPATCH_AUTOGRAD_KEY = c10::DispatchKey::DIPU_AUTOGRAD_DEVICE_TYPE_MACRO;
+const auto DIPU_DISPATCH_AUTOGRAD_KEY =
+    c10::DispatchKey::DIPU_AUTOGRAD_DEVICE_TYPE_MACRO;

 const auto DIPU_Backend_TYPE = c10::Backend::DIPU_DEVICE_TYPE_MACRO;

 const auto DICL_BACKEND_NAME = "dicl";

-} // end ns dipu
+}  // namespace dipu
diff --git a/dipu/torch_dipu/csrc_dipu/binding/DIPUpybind.h b/dipu/torch_dipu/csrc_dipu/binding/DIPUpybind.h
index 804742df0..b7d4b2618 100644
--- a/dipu/torch_dipu/csrc_dipu/binding/DIPUpybind.h
+++ b/dipu/torch_dipu/csrc_dipu/binding/DIPUpybind.h
@@ -1,75 +1,83 @@
 #pragma once

-#include
+#include
 #include   // for at::ScalarType
-#include
+#include

 namespace pybind11 {
 namespace detail {

 namespace py = pybind11;

-at::ScalarType dtypeToScalarType(PyObject* dtype_obj) {
-  TORCH_INTERNAL_ASSERT(THPDtype_Check(dtype_obj));
-  // PyTorch does not care about aliasing and is compiled with
-  // `-fno-strict-aliasing`.
-  // In PyTorch they would write:
-  //   return reinterpret_cast<THPDtype*>(dtype_obj)->scalar_type;
-  // But we do care about aliasing.
- THPDtype dtype; - std::memcpy(&dtype, dtype_obj, sizeof(dtype)); - return dtype.scalar_type; +at::ScalarType dtypeToScalarType(PyObject *dtype_obj) { + TORCH_INTERNAL_ASSERT(THPDtype_Check(dtype_obj)); + // PyTorch does not care about aliasing and is compiled with + // `-fno-strict-aliasing`. + // In PyTorch they would write: + // return reinterpret_cast(dtype_obj)->scalar_type; + // But we do care about aliasing. + THPDtype dtype; + std::memcpy(&dtype, dtype_obj, sizeof(dtype)); + return dtype.scalar_type; } -PyObject* scalarTypeToDtype(at::ScalarType scalar_type) { - const char* dtype_name = nullptr; - switch (scalar_type) { - case at::ScalarType::Float: dtype_name = "float32"; break; - case at::ScalarType::Double: dtype_name = "float64"; break; - case at::kHalf: dtype_name = "float16"; break; - case at::kBFloat16: dtype_name = "bfloat16"; break; - // ... handle other scalar types here - default: throw std::runtime_error("Unsupported scalar type"); - } - - PyObject* torch_module = PyImport_ImportModule("torch"); - TORCH_INTERNAL_ASSERT(torch_module); - - PyObject* dtype_obj = PyObject_GetAttrString(torch_module, dtype_name); - TORCH_INTERNAL_ASSERT(dtype_obj); - - Py_DECREF(torch_module); // Decrement the refcount for the torch module - - return dtype_obj; // Note: The caller will be responsible for decreasing the refcount of dtype_obj -} - - -bool isDtype(PyObject* obj) { - return THPDtype_Check(obj); +PyObject *scalarTypeToDtype(at::ScalarType scalar_type) { + const char *dtype_name = nullptr; + switch (scalar_type) { + case at::ScalarType::Float: + dtype_name = "float32"; + break; + case at::ScalarType::Double: + dtype_name = "float64"; + break; + case at::kHalf: + dtype_name = "float16"; + break; + case at::kBFloat16: + dtype_name = "bfloat16"; + break; + // ... 
handle other scalar types here + default: + throw std::runtime_error("Unsupported scalar type"); + } + + PyObject *torch_module = PyImport_ImportModule("torch"); + TORCH_INTERNAL_ASSERT(torch_module); + + PyObject *dtype_obj = PyObject_GetAttrString(torch_module, dtype_name); + TORCH_INTERNAL_ASSERT(dtype_obj); + + Py_DECREF(torch_module); // Decrement the refcount for the torch module + + return dtype_obj; // Note: The caller will be responsible for decreasing the + // refcount of dtype_obj } +bool isDtype(PyObject *obj) { return THPDtype_Check(obj); } template <> struct type_caster { -public: - PYBIND11_TYPE_CASTER(at::ScalarType, _("torch.dtype")); - - bool load(py::handle src, bool) { - // Convert Python torch.dtype to at::ScalarType - PyObject* obj = src.ptr(); - if (isDtype(obj)) { - value = dtypeToScalarType(obj); - return true; - } - return false; - } - - static py::handle cast(const at::ScalarType& src, py::return_value_policy /* policy */, py::handle /* parent */) { - // Convert at::ScalarType to Python torch.dtype - return py::handle(scalarTypeToDtype(src)); + public: + PYBIND11_TYPE_CASTER(at::ScalarType, _("torch.dtype")); + + bool load(py::handle src, bool) { + // Convert Python torch.dtype to at::ScalarType + PyObject *obj = src.ptr(); + if (isDtype(obj)) { + value = dtypeToScalarType(obj); + return true; } + return false; + } + + static py::handle cast(const at::ScalarType &src, + py::return_value_policy /* policy */, + py::handle /* parent */) { + // Convert at::ScalarType to Python torch.dtype + return py::handle(scalarTypeToDtype(src)); + } }; -} // namespace detail -} // namespace pybind11 +} // namespace detail +} // namespace pybind11 diff --git a/dipu/torch_dipu/csrc_dipu/binding/ExportProfiler.cpp b/dipu/torch_dipu/csrc_dipu/binding/ExportProfiler.cpp index e6e0c9902..b84c0c6ef 100644 --- a/dipu/torch_dipu/csrc_dipu/binding/ExportProfiler.cpp +++ b/dipu/torch_dipu/csrc_dipu/binding/ExportProfiler.cpp @@ -4,31 +4,35 @@ #include #include -#include #include +#include + #include -#include "exportapi.h" #include "csrc_dipu/profiler/profiler.h" #include "csrc_dipu/profiler/profiler_kineto.h" #include "csrc_dipu/profiler/profiler_python.h" #include "csrc_dipu/runtime/devproxy/deviceproxy.h" +#include "exportapi.h" + namespace py = pybind11; namespace dipu { -void exportProfiler(PyObject* module) { +void exportProfiler(PyObject *module) { auto m = py::handle(module).cast(); m.def("_prepare_profiler", profile::prepareProfiler); m.def("_enable_profiler", profile::enableProfiler, py::arg("config"), - py::arg("activities"), py::arg("scopes") = std::unordered_set()); + py::arg("activities"), + py::arg("scopes") = std::unordered_set()); m.def("_disable_profiler", profile::disableProfiler); m.def("_add_metadata_json", profile::addMetadataJson); m.def("_kineto_step", profile::profilerStep); m.def("_supported_activities", []() { - std::set activities{torch::profiler::impl::ActivityType::CPU}; + std::set activities{ + torch::profiler::impl::ActivityType::CPU}; if (devproxy::getDeviceCount() > 0) { activities.insert(torch::profiler::impl::ActivityType::CUDA); } diff --git a/dipu/torch_dipu/csrc_dipu/binding/ExportRT.cpp b/dipu/torch_dipu/csrc_dipu/binding/ExportRT.cpp index c1da72ad3..0ec7a0fd6 100644 --- a/dipu/torch_dipu/csrc_dipu/binding/ExportRT.cpp +++ b/dipu/torch_dipu/csrc_dipu/binding/ExportRT.cpp @@ -1,21 +1,23 @@ // Copyright (c) 2023, DeepLink. 
#include +#include #include #include #include + #include -#include -#include "exportapi.h" -#include +#include #include +#include #include -#include + #include "DIPUpybind.h" -using dipu::getDIPUStreamFromPool; -using dipu::DIPUStream; +#include "exportapi.h" using dipu::DIPUEvent; +using dipu::DIPUStream; +using dipu::getDIPUStreamFromPool; namespace py = pybind11; namespace dipu { @@ -24,14 +26,16 @@ static constexpr size_t kMega = 1024 * 1024; using dipu::devapis::DIPUDeviceProperties; using dipu::devapis::DIPUDeviceStatus; -static void registerDIPUDeviceProperties(py::module& m) { - py::class_>(m, "_DIPUDeviceProperties") +static void registerDIPUDeviceProperties(py::module &m) { + py::class_>( + m, "_DIPUDeviceProperties") .def_readonly("name", &DIPUDeviceProperties::name) .def_readonly("major", &DIPUDeviceProperties::major) .def_readonly("minor", &DIPUDeviceProperties::minor) - .def_readonly("multi_processor_count", &DIPUDeviceProperties::multiProcessorCount) + .def_readonly("multi_processor_count", + &DIPUDeviceProperties::multiProcessorCount) .def_readonly("total_memory", &DIPUDeviceProperties::totalGlobalMem) - .def("__repr__", [](const DIPUDeviceProperties& prop) { + .def("__repr__", [](const DIPUDeviceProperties &prop) { std::ostringstream stream; stream << "_DIPUDeviceProperties(name='" << prop.name << "', major=" << prop.major << ", minor=" << prop.minor @@ -42,10 +46,11 @@ static void registerDIPUDeviceProperties(py::module& m) { }); } -static void registerDIPUDeviceStatus(py::module& m) { - py::class_>(m, "_DIPUDeviceStatus") +static void registerDIPUDeviceStatus(py::module &m) { + py::class_>( + m, "_DIPUDeviceStatus") .def_readonly("free_memory", &DIPUDeviceStatus::freeGlobalMem) - .def("__repr__", [](const DIPUDeviceStatus& status) { + .def("__repr__", [](const DIPUDeviceStatus &status) { std::ostringstream stream; stream << "DIPUDeviceStatus(used_memory=" << status.freeGlobalMem << ")"; @@ -53,10 +58,10 @@ static void registerDIPUDeviceStatus(py::module& m) { }); } -static void exportDevices(py::module& m) { +static void exportDevices(py::module &m) { registerDIPUDeviceProperties(m); registerDIPUDeviceStatus(m); - // Device Management. + // Device Management. m.attr("dipu_vendor") = dipu::VendorTypeToStr(VENDOR_TYPE); m.attr("dipu_device_type") = DeviceTypeName(DIPU_DEVICE_TYPE, true); m.attr("dicl_backend") = DICL_BACKEND_NAME; @@ -68,88 +73,97 @@ static void exportDevices(py::module& m) { poison_fork(); return devproxy::getDeviceCount(); }); - m.def("_dipu_current_device", []() -> int { - return static_cast(devproxy::current_device()); - }); + m.def("_dipu_current_device", + []() -> int { return static_cast(devproxy::current_device()); }); m.def("_dipu_synchronize", []() -> void { devproxy::syncDevice(); return; }); - m.def("_dipu_getDeviceProperties", [](int device) -> std::shared_ptr { + m.def( + "_dipu_getDeviceProperties", + [](int device) -> std::shared_ptr { return dipu::getDevicePropertiesFromCache(device); - }, py::arg("device")); - - /* - different with device properties, fill_status may cause creation of the device stub on the specified device, - the sub will occupy mem, so caller should always fill status after set device() - and only fill status of current device, otherwise you will create stub an other device. 
+  },
+      py::arg("device"));
+
+  /*
+  Different from device properties, fill_status may cause creation of the
+  device stub on the specified device; the stub will occupy memory, so the
+  caller should always fill status after setting the device, and only fill
+  status of the current device, otherwise you will create a stub on another
+  device.
   */
-  m.def("_dipu_getDeviceStatus", [](int device) -> std::shared_ptr<DIPUDeviceStatus> {
+  m.def(
+      "_dipu_getDeviceStatus",
+      [](int device) -> std::shared_ptr<DIPUDeviceStatus> {
        return dipu::getDeviceStatus(device);
-  }, py::arg("device"));
-
+      },
+      py::arg("device"));
 }

-static void exportStream(py::module& m) {
+static void exportStream(py::module &m) {
   // Stream Management. follow the api in torch/csrc/cuda/Stream.cpp
   pybind11::class_<DIPUStream>(m, "_DIPUStreamBase")
-      .def(py::init([](int priority, c10::StreamId stream_id, c10::DeviceIndex device_index,
-          int64_t device_type, uint64_t stream_ptr) {
-        if (stream_id || device_index || device_type) {
-          if (device_type != 0) {
-            TORCH_CHECK(static_cast<at::DeviceType>(device_type) == dipu::DIPU_DEVICE_TYPE);
-          }
-          return DIPUStream(device_index, stream_id);
-        } else if (stream_ptr) {
-          return dipu::getStreamFromExternal(reinterpret_cast<deviceStream_t>(stream_ptr),
-              devproxy::current_device());
-        } else {
-          return getDIPUStreamFromPool();
-        }
-      }),
-      py::arg("priority") = 0, py::arg("stream_id") = 0, py::arg("device_index") = 0,
-      py::arg("device_type") = 0, py::arg("stream_ptr")=0
-      )
-      .def(py::init([](c10::DeviceIndex device_index, int isdefault) {
-        return dipu::getCurrentDIPUStream(device_index);
-      })
-      )
-      .def("query", &DIPUStream::isStreamEmpty)
-      .def("synchronize",
-          [](DIPUStream& stream) -> void {
-            pybind11::gil_scoped_release no_gil;
-            stream.synchronize();
-          })
-      .def("__eq__", &DIPUStream::operator==)
-      .def("priority_range",
-          // not support priority now, return a mock value.
-          [](DIPUStream& stream) -> py::tuple {
-            py::tuple range = pybind11::make_tuple(0, 0);
-            return range;
-          })
-      // cpp properties
-      .def_property_readonly("stream_id",
-          [](DIPUStream& stream) -> c10::StreamId {
-            return stream.id();
-          })
-      .def_property_readonly("device_index", &DIPUStream::device_index)
-      .def_property_readonly("device_type",
-          [](DIPUStream& stream) -> int64_t {
-            return static_cast<int64_t>(stream.device().type());
-          })
-      .def_property_readonly("dipu_stream",
-          [](DIPUStream& stream) -> uint64_t {
-            return (uint64_t)stream.rawstream();
-          })
-      // use type_caster
-      .def_property_readonly("device",
-          [](DIPUStream& stream) -> at::Device {
-            return stream.device();
-          });
-
-  m.def("_dipu_setStream", [](c10::StreamId stream_id, c10::DeviceIndex device_index) -> void {
-    dipu::setCurrentDIPUStream(DIPUStream(device_index, stream_id));
-  }, py::arg("stream_id") = 0, py::arg("device_index") = 0);
+      .def(py::init([](int priority, c10::StreamId stream_id,
+                       c10::DeviceIndex device_index, int64_t device_type,
+                       uint64_t stream_ptr) {
+        if (stream_id || device_index || device_type) {
+          if (device_type != 0) {
+            TORCH_CHECK(static_cast<at::DeviceType>(device_type) ==
+                        dipu::DIPU_DEVICE_TYPE);
+          }
+          return DIPUStream(device_index, stream_id);
+        } else if (stream_ptr) {
+          return dipu::getStreamFromExternal(
+              reinterpret_cast<deviceStream_t>(stream_ptr),
+              devproxy::current_device());
+        } else {
+          return getDIPUStreamFromPool();
+        }
+      }),
+           py::arg("priority") = 0, py::arg("stream_id") = 0,
+           py::arg("device_index") = 0, py::arg("device_type") = 0,
+           py::arg("stream_ptr") = 0)
+      .def(py::init([](c10::DeviceIndex device_index, int isdefault) {
+        return dipu::getCurrentDIPUStream(device_index);
+      }))
+      .def("query", &DIPUStream::isStreamEmpty)
+      .def("synchronize",
+           [](DIPUStream &stream) -> void {
+             pybind11::gil_scoped_release no_gil;
+             stream.synchronize();
+           })
+      .def("__eq__", &DIPUStream::operator==)
+      .def("priority_range",
+           // Priority is not supported yet; return a mock value.
+           [](DIPUStream &stream) -> py::tuple {
+             py::tuple range = pybind11::make_tuple(0, 0);
+             return range;
+           })
+      // cpp properties
+      .def_property_readonly(
+          "stream_id",
+          [](DIPUStream &stream) -> c10::StreamId { return stream.id(); })
+      .def_property_readonly("device_index", &DIPUStream::device_index)
+      .def_property_readonly(
+          "device_type",
+          [](DIPUStream &stream) -> int64_t {
+            return static_cast<int64_t>(stream.device().type());
+          })
+      .def_property_readonly("dipu_stream",
+                             [](DIPUStream &stream) -> uint64_t {
+                               return (uint64_t)stream.rawstream();
+                             })
+      // use type_caster
+      .def_property_readonly("device", [](DIPUStream &stream) -> at::Device {
+        return stream.device();
+      });
+
+  m.def(
+      "_dipu_setStream",
+      [](c10::StreamId stream_id, c10::DeviceIndex device_index) -> void {
+        dipu::setCurrentDIPUStream(DIPUStream(device_index, stream_id));
+      },
+      py::arg("stream_id") = 0, py::arg("device_index") = 0);

   m.def("_dipu_getCurrentStream", [](c10::DeviceIndex devIdx) -> DIPUStream {
     return dipu::getCurrentDIPUStream(devIdx);
@@ -159,169 +173,153 @@ static void exportStream(py::module &m) {
   });
 }

-static void exportEvent(py::module& m) {
-  // Event
+static void exportEvent(py::module &m) {
+  // Event
   pybind11::class_<DIPUEvent>(m, "_DIPUEventBase")
       // add flag in future
-      .def(py::init([](bool enable_timing, bool blocking, bool interproces) {
-        return DIPUEvent();
-      }),
-      py::arg("enable_timing") = false, py::arg("blocking") = false, py::arg("interprocess") = false
-      )
-      .def("record", static_cast<void (DIPUEvent::*)()>(&DIPUEvent::record), "record event")
-      .def("record", pybind11::overload_cast<const DIPUStream&>
-          (&DIPUEvent::record), "record event on stream")
-      .def("elapsed_time", &dipu::DIPUEvent::elapsed_time)
-      .def("synchronize",
-          [](DIPUEvent& self) {
-            pybind11::gil_scoped_release no_gil;
-            self.synchronize();
-          })
-      .def("query", &DIPUEvent::query)
-      .def("wait",
-          [](DIPUEvent& self, const DIPUStream& stream) {
-            pybind11::gil_scoped_release no_gil;
-            self.wait(stream);
-          })
-
-      .def_property_readonly("dipu_event", [](DIPUEvent& self) {
-        return (uint64_t)self.rawevent();
-      })
-      .def_property_readonly("device", [](DIPUEvent& self) {
+      .def(py::init([](bool enable_timing, bool blocking, bool interprocess) {
+        return DIPUEvent();
+      }),
+           py::arg("enable_timing") = false, py::arg("blocking") = false,
+           py::arg("interprocess") = false)
+      .def("record", static_cast<void (DIPUEvent::*)()>(&DIPUEvent::record),
+           "record event")
+      .def("record",
+           pybind11::overload_cast<const DIPUStream&>(&DIPUEvent::record),
+           "record event on stream")
+      .def("elapsed_time", &dipu::DIPUEvent::elapsed_time)
+      .def("synchronize",
+           [](DIPUEvent &self) {
+             pybind11::gil_scoped_release no_gil;
+             self.synchronize();
+           })
+      .def("query", &DIPUEvent::query)
+      .def("wait",
+           [](DIPUEvent &self, const DIPUStream &stream) {
+             pybind11::gil_scoped_release no_gil;
+             self.wait(stream);
+           })
+
+      .def_property_readonly(
+          "dipu_event",
+          [](DIPUEvent &self) { return (uint64_t)self.rawevent(); })
+      .def_property_readonly("device", [](DIPUEvent &self) {
        auto device = self.device().value();
        return device;
-      });
+      });
 }

-static void exportCommunicator(py::module& m) {
-
-  pybind11::class_<ProcessGroupDICL, c10::intrusive_ptr<ProcessGroupDICL>>(m, "ProcessGroupDICL")
-      .def(
-          py::init([](const c10::intrusive_ptr<::c10d::Store>& store,
-                      int rank,
-                      int size,
-                      const std::chrono::milliseconds& timeout) {
-            return createProcessGroupDICL(store, rank, size, timeout);
-          }),
-          py::arg("store"),
-          py::arg("rank"),
-          py::arg("size"),
-          py::arg("timeout") = kBackendDefaultTimeout,
-          py::call_guard<py::gil_scoped_release>())
-      .def("store", &ProcessGroupDICL::getStore)
-      .def("timeout", [](ProcessGroupDICL& self) {
+static void exportCommunicator(py::module &m) {
+  pybind11::class_<ProcessGroupDICL, c10::intrusive_ptr<ProcessGroupDICL>>(m, "ProcessGroupDICL")
+      .def(py::init([](const c10::intrusive_ptr<::c10d::Store> &store, int rank,
+                       int size, const std::chrono::milliseconds &timeout) {
+        return createProcessGroupDICL(store, rank, size, timeout);
+      }),
+           py::arg("store"), py::arg("rank"), py::arg("size"),
+           py::arg("timeout") = kBackendDefaultTimeout,
+           py::call_guard<py::gil_scoped_release>())
+      .def("store", &ProcessGroupDICL::getStore)
+      .def("timeout", [](ProcessGroupDICL &self) {
        // need to enhance to support timeout
-        return kBackendDefaultTimeout;
-      });
-
+        return kBackendDefaultTimeout;
+      });
   // py::object mdist = py::module::import("torch.distributed");
-  // py::object register_backend = mdist.attr("Backend").attr("register_backend");
-  // The first parameter is the backend name used by user in invoking
+  // py::object register_backend =
+  //     mdist.attr("Backend").attr("register_backend");
+  // The first parameter is the backend name used by the user when invoking
   // torch.distributed.init_process_group().
-  // register_backend(dipu::DICL_BACKEND_NAME, py::cpp_function(createProcessGroupDICL));
+  // register_backend(dipu::DICL_BACKEND_NAME,
+  //                  py::cpp_function(createProcessGroupDICL));
 }

-static void exportMemCaching(py::module& m) {
-  m.def("_dipu_emptyCache", []() {
-    emptyCachedMem();
-  });
+static void exportMemCaching(py::module &m) {
+  m.def("_dipu_emptyCache", []() { emptyCachedMem(); });

-  m.def("init_resource", []() {
-    initResource();
-  });
+  m.def("init_resource", []() { initResource(); });

-  m.def("release_all_resources", []() {
-    releaseAllResources();
-  });
+  m.def("release_all_resources", []() { releaseAllResources(); });

-  m.def("memory_reserved", [](const c10::Device& device)->size_t {
+  m.def("memory_reserved", [](const c10::Device &device) -> size_t {
     return memoryReserved(device);
   });

-  m.def("memory_allocated", [](const c10::Device& device)->size_t {
+  m.def("memory_allocated", [](const c10::Device &device) -> size_t {
     return memoryAllocated(device);
   });

-  m.def("max_memory_reserved", [](const c10::Device& device)->size_t {
+  m.def("max_memory_reserved", [](const c10::Device &device) -> size_t {
     return maxMemoryReserved(device);
   });

-  m.def("max_memory_allocated", [](const c10::Device& device)->size_t {
+  m.def("max_memory_allocated", [](const c10::Device &device) -> size_t {
     return maxMemoryAllocated(device);
   });
 }

-
-static void patchStorage(py::module& m) {
+static void patchStorage(py::module &m) {
   // incremental patch StorageMethods.cpp THPStorage_resize_()
-  m.def("storage_resize_", [](at::Storage stor, int64_t newsize) -> at::Storage {
-    if (stor.device_type() != DIPU_DEVICE_TYPE) {
-      TORCH_CHECK(false,
-          "UntypedStorage.resize_: dipu storage resize not support other device type ",
-          stor.device_type());
-    } else {
-      dipu::native::DIPUATenFunctions::resize_bytes_dipu(stor.unsafeGetStorageImpl(), newsize);
-      return stor;
-    }
-  });
+  m.def("storage_resize_",
+        [](at::Storage stor, int64_t newsize) -> at::Storage {
+          if (stor.device_type() != DIPU_DEVICE_TYPE) {
+            TORCH_CHECK(false,
+                        "UntypedStorage.resize_: dipu storage resize does not "
+                        "support other device type ",
+                        stor.device_type());
+          } else {
+            dipu::native::DIPUATenFunctions::resize_bytes_dipu(
+                stor.unsafeGetStorageImpl(), newsize);
+            return stor;
+          }
+        });
 }

-static void patchTensor(py::module& m) {
-  m.def("is_dipu", [](at::Tensor self) -> bool {
-    return dipu::isDeviceTensor(self);
-  });
+static void patchTensor(py::module &m) {
+  m.def("is_dipu",
+        [](at::Tensor self) -> bool { return
dipu::isDeviceTensor(self); }); } -static void exportGenerator(py::module& m) { - m.def("_manual_seed", [](at::DeviceIndex idx, uint64_t seed) { - manual_seed(idx, seed); - }); +static void exportGenerator(py::module &m) { + m.def("_manual_seed", + [](at::DeviceIndex idx, uint64_t seed) { manual_seed(idx, seed); }); - m.def("_seed", [](at::DeviceIndex idx) { - seed(idx); - }); + m.def("_seed", [](at::DeviceIndex idx) { seed(idx); }); - m.def("_initial_seed", [](at::DeviceIndex idx)->uint64_t { - return initial_seed(idx); - }); + m.def("_initial_seed", + [](at::DeviceIndex idx) -> uint64_t { return initial_seed(idx); }); - m.def("_get_rng_state", [](at::DeviceIndex idx)->at::Tensor { - return get_rng_state(idx); - }); + m.def("_get_rng_state", + [](at::DeviceIndex idx) -> at::Tensor { return get_rng_state(idx); }); m.def("_set_rng_state", [](at::DeviceIndex idx, at::Tensor state) { set_rng_state(idx, state); }); - m.def("_is_in_bad_fork", []()->bool { return is_in_bad_fork(); }); + m.def("_is_in_bad_fork", []() -> bool { return is_in_bad_fork(); }); - m.def("_create_dipu_generator", [](int idx)->at::Generator { + m.def("_create_dipu_generator", [](int idx) -> at::Generator { at::DeviceIndex index = static_cast(idx); return createDIPUGenerator(index); }); } - -static void exportAutocast(py::module& m) { - m.def("get_autocast_dipu_dtype", []()->at::ScalarType { +static void exportAutocast(py::module &m) { + m.def("get_autocast_dipu_dtype", []() -> at::ScalarType { return at::autocast::get_autocast_xpu_dtype(); }); - m.def("is_autocast_dipu_enabled", []()->bool { - return at::autocast::is_xpu_enabled(); - }); - m.def("set_autocast_dipu_enabled", [](bool enabled) { - at::autocast::set_xpu_enabled(enabled); - }); + m.def("is_autocast_dipu_enabled", + []() -> bool { return at::autocast::is_xpu_enabled(); }); + m.def("set_autocast_dipu_enabled", + [](bool enabled) { at::autocast::set_xpu_enabled(enabled); }); m.def("set_autocast_dipu_dtype", [](at::ScalarType dtype) { at::autocast::set_autocast_xpu_dtype(dtype); }); } -extern void patchTorchCsrcDevice(PyObject* module); - +extern void patchTorchCsrcDevice(PyObject *module); -DIPU_API void exportDIPURuntime(PyObject* module) { +DIPU_API void exportDIPURuntime(PyObject *module) { auto m = py::handle(module).cast(); patchTorchCsrcDevice(module); exportDevices(m); @@ -334,4 +332,4 @@ DIPU_API void exportDIPURuntime(PyObject* module) { exportGenerator(m); exportAutocast(m); } -} // end ns dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/binding/ExportTensor.cpp b/dipu/torch_dipu/csrc_dipu/binding/ExportTensor.cpp index 0bba620a7..cab40407d 100644 --- a/dipu/torch_dipu/csrc_dipu/binding/ExportTensor.cpp +++ b/dipu/torch_dipu/csrc_dipu/binding/ExportTensor.cpp @@ -1,82 +1,92 @@ // Copyright (c) 2023, DeepLink. -#include +#include #include #include -#include -#include #include +#include +#include #include + #include "exportapi.h" namespace dipu { -static at::Tensor dispatch_to(const at::Tensor& self, at::Device device, bool non_blocking, bool copy, c10::optional optional_memory_format) { +static at::Tensor dispatch_to( + const at::Tensor &self, at::Device device, bool non_blocking, bool copy, + c10::optional optional_memory_format) { pybind11::gil_scoped_release no_gil; - // NOTE: this is where we record aten::to in the graph during tracing. 
However, the behavior of aten::to - // is different with respect to TensorOptions fields that are not present: aten::to inherits fields that - // are missing from the self argument while the tracer assumes that they should be populated with the - // default values (eg. float for scalar type). By explicitly copying over the tensor options here we fully - // specify all tensor options and thus record the proper trace - return self.to(self.options().device(device).memory_format(optional_memory_format), non_blocking, copy); + // NOTE: this is where we record aten::to in the graph during tracing. + // However, the behavior of aten::to is different with respect to + // TensorOptions fields that are not present: aten::to inherits fields that + // are missing from the self argument while the tracer assumes that they + // should be populated with the default values (eg. float for scalar type). By + // explicitly copying over the tensor options here we fully specify all tensor + // options and thus record the proper trace + return self.to( + self.options().device(device).memory_format(optional_memory_format), + non_blocking, copy); } -static std::shared_ptr splitArgs(PyObject* args) { +static std::shared_ptr splitArgs(PyObject *args) { ssize_t rawSize = PyTuple_Size(args); - PyObject* newArgs = PyTuple_New(rawSize - 1); - std::shared_ptr result(new PyObject*[2], [](PyObject** p){ - // if (p[1]) { // cause segfault, why? - // Py_DECREF(p[1]); - // } - delete[] p; - p = nullptr; - }); + PyObject *newArgs = PyTuple_New(rawSize - 1); + std::shared_ptr result(new PyObject *[2], [](PyObject **p) { + // if (p[1]) { // cause segfault, why? + // Py_DECREF(p[1]); + // } + delete[] p; + p = nullptr; + }); // 0 is self result[0] = PyTuple_GET_ITEM(args, 0); result[1] = newArgs; for (int i = 1; i < rawSize; i++) { auto arg = PyTuple_GET_ITEM(args, i); - PyTuple_SetItem(newArgs, i-1, arg); + PyTuple_SetItem(newArgs, i - 1, arg); } return result; } // first parameter is export module torchdipu_module, not self tensor -static PyObject* THPVariable_dipu(PyObject* module, PyObject* args, PyObject* kwargs) -{ +static PyObject *THPVariable_dipu(PyObject *module, PyObject *args, + PyObject *kwargs) { HANDLE_TH_ERRORS - static torch::PythonArgParser parser({ - "dipu(Device? device=None, bool non_blocking=False, *, MemoryFormat? memory_format=None)", - "dipu(Device? device=None, bool async=False, *, MemoryFormat? memory_format=None)|deprecated" - }); + static torch::PythonArgParser parser( + {"dipu(Device? device=None, bool non_blocking=False, *, MemoryFormat? " + "memory_format=None)", + "dipu(Device? device=None, bool async=False, *, MemoryFormat? " + "memory_format=None)|deprecated"}); auto res = splitArgs(args); - PyObject* self = res[0]; - PyObject* newArgs = res[1]; + PyObject *self = res[0]; + PyObject *newArgs = res[1]; - auto& self_ = THPVariable_Unpack(self); + auto &self_ = THPVariable_Unpack(self); torch::ParsedArgs<3> parsed_args; auto r = parser.parse(self, newArgs, kwargs, parsed_args); - if(r.has_torch_function()) { - return torch::handle_torch_function(r, self, newArgs, kwargs, THPVariableClass, "torch.Tensor"); + if (r.has_torch_function()) { + return torch::handle_torch_function(r, self, newArgs, kwargs, + THPVariableClass, "torch.Tensor"); } auto device = r.isNone(0) ? 
      at::Device(dipu::DIPU_DEVICE_TYPE) : r.device(0);
   auto opt_memory_format = r.memoryformatOptional(2);
-  TORCH_CHECK(device.type() == dipu::DIPU_DEVICE_TYPE, "Invalid device, must be dipu device");
-  return THPVariable_Wrap(dispatch_to(self_, device, r.toBool(1), false, opt_memory_format));
+  TORCH_CHECK(device.type() == dipu::DIPU_DEVICE_TYPE,
+              "Invalid device, must be dipu device");
+  return THPVariable_Wrap(
+      dispatch_to(self_, device, r.toBool(1), false, opt_memory_format));
   END_HANDLE_TH_ERRORS
 }

-// we prefer to use pybind11 to export patch func, cpython is used only patching tensor-func
-// which has complex dynamic parameters not easy to parsed by pybind.
+// We prefer pybind11 for exporting patch functions; CPython is used only for
+// patching tensor functions, whose complex dynamic parameters are not easy to
+// parse with pybind.
 static PyMethodDef TorchTensorMethods[] = {
-    {"dipu", castPyCFunctionWithKeywords(THPVariable_dipu), METH_VARARGS | METH_KEYWORDS, NULL},
-    {nullptr, nullptr, 0, nullptr}
-};
+    {"dipu", castPyCFunctionWithKeywords(THPVariable_dipu),
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {nullptr, nullptr, 0, nullptr}};

-DIPU_API PyMethodDef* exportTensorFunctions() {
-  return TorchTensorMethods;
-}
-} // end ns dipu
\ No newline at end of file
+DIPU_API PyMethodDef *exportTensorFunctions() { return TorchTensorMethods; }
+}  // namespace dipu
\ No newline at end of file
diff --git a/dipu/torch_dipu/csrc_dipu/binding/exportapi.h b/dipu/torch_dipu/csrc_dipu/binding/exportapi.h
index 6b8d8c8c8..80e331bdc 100644
--- a/dipu/torch_dipu/csrc_dipu/binding/exportapi.h
+++ b/dipu/torch_dipu/csrc_dipu/binding/exportapi.h
@@ -2,10 +2,11 @@
 #pragma once

 #include
+
 #include

 namespace dipu {
-DIPU_API PyMethodDef* exportTensorFunctions();
-DIPU_API void exportDIPURuntime(PyObject* module);
-DIPU_API void exportProfiler(PyObject* module);
+DIPU_API PyMethodDef *exportTensorFunctions();
+DIPU_API void exportDIPURuntime(PyObject *module);
+DIPU_API void exportProfiler(PyObject *module);
 }  // namespace dipu
\ No newline at end of file
diff --git a/dipu/torch_dipu/csrc_dipu/binding/patchCsrcDevice.cpp b/dipu/torch_dipu/csrc_dipu/binding/patchCsrcDevice.cpp
index 055cbe20d..9cf9e8202 100644
--- a/dipu/torch_dipu/csrc_dipu/binding/patchCsrcDevice.cpp
+++ b/dipu/torch_dipu/csrc_dipu/binding/patchCsrcDevice.cpp
@@ -1,38 +1,36 @@
 // Copyright (c) 2023, DeepLink.
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-
 #include "exportapi.h"

 namespace dipu {

 static bool PythonDeviceAsCuda = false;

-static at::DeviceType _get_dipu_python_type(const at::Device& device) {
+static at::DeviceType _get_dipu_python_type(const at::Device &device) {
   if (device.type() == DIPU_DEVICE_TYPE && PythonDeviceAsCuda) {
     return at::DeviceType::CUDA;
   }
   return device.type();
 }

-PyObject* _THPDevice_type(THPDevice* self, PyObject* noargs) {
+PyObject *_THPDevice_type(THPDevice *self, PyObject *noargs) {
   HANDLE_TH_ERRORS
   std::ostringstream oss;
   oss << _get_dipu_python_type(self->device);
@@ -41,7 +39,7 @@ PyObject* _THPDevice_type(THPDevice* self, PyObject* noargs) {
   END_HANDLE_TH_ERRORS
 }

-PyObject* _THPDevice_index(THPDevice* self, PyObject* noargs) {
+PyObject *_THPDevice_index(THPDevice *self, PyObject *noargs) {
   HANDLE_TH_ERRORS
   if (self->device.has_index()) {
     return THPUtils_packInt64(self->device.index());
@@ -51,7 +49,7 @@ PyObject* _THPDevice_index(THPDevice* self, PyObject* noargs) {
   END_HANDLE_TH_ERRORS
 }

-PyObject* DIPU_THPDevice_repr(THPDevice* self) {
+PyObject *DIPU_THPDevice_repr(THPDevice *self) {
   std::ostringstream oss;
   oss << "device(type=\'" << _get_dipu_python_type(self->device) << "\'";
   if (self->device.has_index()) {
@@ -64,8 +62,7 @@ PyObject* DIPU_THPDevice_repr(THPDevice* self) {
   return THPUtils_packString(oss.str().c_str());
 }

-
-PyObject* DIPU_THPDevice_str(THPDevice* self) {
+PyObject *DIPU_THPDevice_str(THPDevice *self) {
   std::ostringstream oss;
   oss << _get_dipu_python_type(self->device);
   return THPUtils_packString(oss.str().c_str());
@@ -76,16 +73,17 @@ static struct PyGetSetDef DIPU_THPDevice_properties[] = {
     {"index", (getter)_THPDevice_index, nullptr, nullptr, nullptr},
     {nullptr}};

-
 /*
-why use this method to patch csrc.Device: because
-1. csrc.Device is a final cpython class which not support attributes mock in python layer.
-2. rewrite a new DeviceType to replace THPDeviceType is not work because torch::PythonArgParser
-   will check the type of THPDeviceType when parse Device parameter(see csrc/utils/python_arg_parer.cpp
-   FunctionParameter::check() -> THPDevice_Check())
-so we replace some attributes of THPDeviceType class in c-python layer
-*/
-void patchTorchCsrcDevice(PyObject* module) {
+Why we patch csrc.Device this way:
+1. csrc.Device is a final CPython class, which does not support mocking
+attributes in the Python layer.
+2. Rewriting a new DeviceType to replace THPDeviceType does not work because
+torch::PythonArgParser checks the type of THPDeviceType when parsing a Device
+parameter (see csrc/utils/python_arg_parser.cpp FunctionParameter::check() ->
+THPDevice_Check()). So we replace some attributes of the THPDeviceType class
+in the C-Python layer.
+*/
+void patchTorchCsrcDevice(PyObject *module) {
   // https://docs.python.org/3/c-api/typeobj.html#c.PyTypeObject.tp_dict
   THPDeviceType.tp_dict = nullptr;
   // change Type properties
@@ -93,9 +91,11 @@ void patchTorchCsrcDevice(PyObject* module) {
   THPDeviceType.tp_repr = (reprfunc)DIPU_THPDevice_repr;
   THPDeviceType.tp_str = (reprfunc)DIPU_THPDevice_str;

-  // change THPDeviceType as an overriable class need add some other prperties in PyTypeObject,
-  // It may cause problems and seem un-necessary, so we keep the THPDeviceType as immutable.
-  THPDeviceType.tp_flags = Py_TPFLAGS_DEFAULT; // | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+  // Making THPDeviceType an overridable class would need some other properties
+  // added in PyTypeObject. It may cause problems and seems unnecessary, so we
+  // keep THPDeviceType immutable.
+  THPDeviceType.tp_flags =
+      Py_TPFLAGS_DEFAULT;  // | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;

   if (PyType_Ready(&THPDeviceType) < 0) {
     throw python_error();
@@ -104,13 +104,11 @@ void patchTorchCsrcDevice(PyObject* module) {

   auto m = py::handle(module).cast<py::module>();

-  m.def("_get_python_device_as_cuda", []() -> bool {
-    return PythonDeviceAsCuda;
-  });
+  m.def("_get_python_device_as_cuda",
+        []() -> bool { return PythonDeviceAsCuda; });

-  m.def ("_set_python_device_as_cuda", [](bool as_cuda) -> void {
-    PythonDeviceAsCuda = as_cuda;
-  });
+  m.def("_set_python_device_as_cuda",
+        [](bool as_cuda) -> void { PythonDeviceAsCuda = as_cuda; });

   // not really 'export' new type but change original THPDeviceType is enough
   // if (PyModule_AddObject(module, "device", (PyObject*)&THPDeviceType) != 0) {
diff --git a/dipu/torch_dipu/csrc_dipu/common.h b/dipu/torch_dipu/csrc_dipu/common.h
index 06a94d79d..a7b4706ec 100644
--- a/dipu/torch_dipu/csrc_dipu/common.h
+++ b/dipu/torch_dipu/csrc_dipu/common.h
@@ -4,5 +4,3 @@
 // @deprecated, dipu code shouldn't use common.h, it is going to be deleted
 // todo: vendor need to change to only depend on runtime/device/basedef.h
 #include
-
-
diff --git a/dipu/torch_dipu/csrc_dipu/diopirt/diopi_helper.cpp b/dipu/torch_dipu/csrc_dipu/diopirt/diopi_helper.cpp
index fead6d6da..7db77c75f 100644
--- a/dipu/torch_dipu/csrc_dipu/diopirt/diopi_helper.cpp
+++ b/dipu/torch_dipu/csrc_dipu/diopirt/diopi_helper.cpp
@@ -7,207 +7,218 @@
 namespace dipu {

 namespace diopi_helper {

-::diopiTensorHandle_t toDiopiTensorHandle(at::Tensor& tensor) {
-  return tensor.defined() ? reinterpret_cast<::diopiTensorHandle_t>(&tensor) : nullptr;
+::diopiTensorHandle_t toDiopiTensorHandle(at::Tensor &tensor) {
+  return tensor.defined() ? reinterpret_cast<::diopiTensorHandle_t>(&tensor)
+                          : nullptr;
 }

-::diopiConstTensorHandle_t toDiopiTensorHandle(const at::Tensor& tensor) {
-  return tensor.defined() ? reinterpret_cast<::diopiConstTensorHandle_t>(&tensor) : nullptr;
+::diopiConstTensorHandle_t toDiopiTensorHandle(const at::Tensor &tensor) {
+  return tensor.defined()
+             ? reinterpret_cast<::diopiConstTensorHandle_t>(&tensor)
+             : nullptr;
 }

-::diopiConstTensorHandle_t toDiopiTensorHandle(const at::Tensor* tensor) {
-  return tensor == nullptr ? nullptr : toDiopiTensorHandle(*tensor);
+::diopiConstTensorHandle_t toDiopiTensorHandle(const at::Tensor *tensor) {
+  return tensor == nullptr ? nullptr : toDiopiTensorHandle(*tensor);
 }

-::diopiConstTensorHandle_t toDiopiTensorHandle(const c10::optional<at::Tensor>& tensor) {
-  if (!tensor.has_value()) return nullptr;
-  return toDiopiTensorHandle(tensor.value());
+::diopiConstTensorHandle_t toDiopiTensorHandle(
+    const c10::optional<at::Tensor> &tensor) {
+  if (!tensor.has_value()) return nullptr;
+  return toDiopiTensorHandle(tensor.value());
 }

-::diopiGeneratorHandle_t toDiopiGeneratorHandle(at::Generator& generator) {
-  return generator.defined() ? reinterpret_cast<::diopiGeneratorHandle_t>(&generator) : nullptr;
+::diopiGeneratorHandle_t toDiopiGeneratorHandle(at::Generator &generator) {
+  return generator.defined()
+             ?
reinterpret_cast<::diopiGeneratorHandle_t>(&generator) + : nullptr; } -::diopiGeneratorHandle_t toDiopiGeneratorHandle(c10::optional& generator) { - if (!generator.has_value()) return nullptr; - return toDiopiGeneratorHandle(generator.value()); +::diopiGeneratorHandle_t toDiopiGeneratorHandle( + c10::optional &generator) { + if (!generator.has_value()) return nullptr; + return toDiopiGeneratorHandle(generator.value()); } -::diopiScalar_t toDiopiScalar(const at::Scalar& scalar) { - ::diopiScalar_t result; - switch (scalar.type()) { +::diopiScalar_t toDiopiScalar(const at::Scalar &scalar) { + ::diopiScalar_t result; + switch (scalar.type()) { case c10::ScalarType::Bool: { - result.stype = ::diopiDtype_t::diopi_dtype_int64; - result.ival = static_cast(scalar.toBool()); - return result; + result.stype = ::diopiDtype_t::diopi_dtype_int64; + result.ival = static_cast(scalar.toBool()); + return result; } case c10::ScalarType::Long: { - result.stype = ::diopiDtype_t::diopi_dtype_int64; - result.ival = static_cast(scalar.toLong()); - return result; + result.stype = ::diopiDtype_t::diopi_dtype_int64; + result.ival = static_cast(scalar.toLong()); + return result; } case c10::ScalarType::Double: { - result.stype = ::diopiDtype_t::diopi_dtype_float64; - result.fval = scalar.toDouble(); - return result; + result.stype = ::diopiDtype_t::diopi_dtype_float64; + result.fval = scalar.toDouble(); + return result; } default: { - TORCH_CHECK(false, "invalid scalar type, type is ", scalar.type()); - break; - } + TORCH_CHECK(false, "invalid scalar type, type is ", scalar.type()); + break; } + } } -::diopiScalar_t toDiopiScalar(const at::Scalar& scalar, const c10::ScalarType& type) { - ::diopiScalar_t result; - TORCH_CHECK(c10::canCast(scalar.type(), type)); - if (type == c10::ScalarType::Bool) { - result.stype = ::diopiDtype_t::diopi_dtype_int64; - result.ival = static_cast(scalar.toBool()); - return result; - } else if (c10::isFloatingType(type)) { - result.stype = ::diopiDtype_t::diopi_dtype_float64; - result.fval = scalar.toDouble(); - return result; - } else if (c10::isIntegralType(type, false)) { - result.stype = ::diopiDtype_t::diopi_dtype_int64; - result.ival = static_cast(scalar.toLong()); - return result; - } - TORCH_CHECK(false, "invalid scalar type, type is ", scalar.type()); +::diopiScalar_t toDiopiScalar(const at::Scalar &scalar, + const c10::ScalarType &type) { + ::diopiScalar_t result; + TORCH_CHECK(c10::canCast(scalar.type(), type)); + if (type == c10::ScalarType::Bool) { + result.stype = ::diopiDtype_t::diopi_dtype_int64; + result.ival = static_cast(scalar.toBool()); + return result; + } else if (c10::isFloatingType(type)) { + result.stype = ::diopiDtype_t::diopi_dtype_float64; + result.fval = scalar.toDouble(); + return result; + } else if (c10::isIntegralType(type, false)) { + result.stype = ::diopiDtype_t::diopi_dtype_int64; + result.ival = static_cast(scalar.toLong()); + return result; + } + TORCH_CHECK(false, "invalid scalar type, type is ", scalar.type()); } ::diopiDtype_t toDiopiDtype(c10::ScalarType type) { - switch (type) { + switch (type) { case at::ScalarType::Bool: - return diopi_dtype_bool; + return diopi_dtype_bool; case at::ScalarType::Char: - return diopi_dtype_int8; + return diopi_dtype_int8; case at::ScalarType::Byte: - return diopi_dtype_uint8; + return diopi_dtype_uint8; case at::ScalarType::Short: - return diopi_dtype_int16; + return diopi_dtype_int16; case at::ScalarType::Int: - return diopi_dtype_int32; + return diopi_dtype_int32; case at::ScalarType::Long: - return 
diopi_dtype_int64; + return diopi_dtype_int64; case at::ScalarType::Half: - return diopi_dtype_float16; + return diopi_dtype_float16; case at::ScalarType::BFloat16: - return diopi_dtype_bfloat16; + return diopi_dtype_bfloat16; case at::ScalarType::Float: - return diopi_dtype_float32; + return diopi_dtype_float32; case at::ScalarType::Double: - return diopi_dtype_float64; + return diopi_dtype_float64; case at::ScalarType::ComplexFloat: - return diopi_dtype_complex64; + return diopi_dtype_complex64; case at::ScalarType::ComplexDouble: - return diopi_dtype_complex128; + return diopi_dtype_complex128; default: - TORCH_CHECK(false, "invalid scalar type, type is ", type); - } + TORCH_CHECK(false, "invalid scalar type, type is ", type); + } } caffe2::TypeMeta toATenType(::diopiDtype_t dt) { - switch (dt) { + switch (dt) { case diopi_dtype_bool: - return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); case diopi_dtype_uint8: - return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); case diopi_dtype_int8: - return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); case diopi_dtype_int16: - return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); case diopi_dtype_uint16: - return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); case diopi_dtype_int32: - case diopi_dtype_uint32: - return caffe2::TypeMeta::Make(); + case diopi_dtype_uint32: + return caffe2::TypeMeta::Make(); case diopi_dtype_int64: case diopi_dtype_uint64: - return caffe2::TypeMeta::Make(); - return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); case diopi_dtype_float32: - return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); case diopi_dtype_float64: - return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); case diopi_dtype_float16: - return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); case diopi_dtype_bfloat16: - return caffe2::TypeMeta::Make(); + return caffe2::TypeMeta::Make(); case diopi_dtype_complex64: - return caffe2::TypeMeta::Make>(); + return caffe2::TypeMeta::Make>(); case diopi_dtype_complex128: - return caffe2::TypeMeta::Make>(); + return caffe2::TypeMeta::Make>(); default: - TORCH_CHECK(false, "invalid diopi type, diopi type is ", dt); - } + TORCH_CHECK(false, "invalid diopi type, diopi type is ", dt); + } } int64_t getElemSize(::diopiDtype_t dt) { - switch (dt) { + switch (dt) { case diopi_dtype_int32: case diopi_dtype_uint32: case diopi_dtype_float32: case diopi_dtype_tfloat32: - return 4; + return 4; case diopi_dtype_int64: case diopi_dtype_uint64: case diopi_dtype_float64: case diopi_dtype_complex64: - return 8; + return 8; case diopi_dtype_int16: case diopi_dtype_uint16: case diopi_dtype_float16: case diopi_dtype_bfloat16: - return 2; + return 2; case diopi_dtype_int8: case diopi_dtype_uint8: case diopi_dtype_bool: - return 1; + return 1; case diopi_dtype_complex128: - return 16; + return 16; default: - TORCH_CHECK(false, "invalid diopi type, diopi type is ", dt); - } + TORCH_CHECK(false, "invalid diopi type, diopi type is ", dt); + } } c10::DeviceType toATenDevice(::diopiDevice_t device) { - switch (device) { + switch (device) { case diopi_host: - return c10::DeviceType::CPU; + return c10::DeviceType::CPU; case diopi_device: - return dipu::DIPU_DEVICE_TYPE; + return dipu::DIPU_DEVICE_TYPE; default: - TORCH_CHECK(false, "invalid diopi device, diopi device is ", device); - } + TORCH_CHECK(false, "invalid diopi device, diopi device is ", device); + } } -::diopiSize_t 
toDiopiSize(const at::OptionalIntArrayRef& input) { - ::diopiSize_t diopi_size{nullptr, 0}; - if (input.has_value()) { - diopi_size.data = input.value().data(); - diopi_size.len = input.value().size(); - } - return diopi_size; +::diopiSize_t toDiopiSize(const at::OptionalIntArrayRef &input) { + ::diopiSize_t diopi_size{nullptr, 0}; + if (input.has_value()) { + diopi_size.data = input.value().data(); + diopi_size.len = input.value().size(); + } + return diopi_size; } ::diopiSize_t toDiopiSize(at::IntArrayRef input) { - ::diopiSize_t diopi_size{nullptr, 0}; - diopi_size.data = input.data(); - diopi_size.len = input.size(); - return diopi_size; + ::diopiSize_t diopi_size{nullptr, 0}; + diopi_size.data = input.data(); + diopi_size.len = input.size(); + return diopi_size; } -::diopiRoundMode_t toDiopiRoundMode(const std::string& rounding_mode) { - if (rounding_mode == "none" || rounding_mode == "None" || rounding_mode.size() <= 0) { - return RoundModeNone; - } else if (rounding_mode == "floor") { - return RoundModeFloor; - } else if (rounding_mode == "trunc") { - return RoundModeTrunc; - } - TORCH_CHECK(false, "rounding_mode should be none, 'floor' or 'trunc', but got ", rounding_mode) +::diopiRoundMode_t toDiopiRoundMode(const std::string &rounding_mode) { + if (rounding_mode == "none" || rounding_mode == "None" || + rounding_mode.size() <= 0) { + return RoundModeNone; + } else if (rounding_mode == "floor") { + return RoundModeFloor; + } else if (rounding_mode == "trunc") { + return RoundModeTrunc; + } + TORCH_CHECK(false, + "rounding_mode should be none, 'floor' or 'trunc', but got ", + rounding_mode) } } // namespace diopi_helper diff --git a/dipu/torch_dipu/csrc_dipu/diopirt/diopirt_impl.cpp b/dipu/torch_dipu/csrc_dipu/diopirt/diopirt_impl.cpp index 5cc4092db..9c56eea70 100644 --- a/dipu/torch_dipu/csrc_dipu/diopirt/diopirt_impl.cpp +++ b/dipu/torch_dipu/csrc_dipu/diopirt/diopirt_impl.cpp @@ -1,8 +1,9 @@ // Copyright (c) 2023, DeepLink. 
-#include
+#include "./diopirt_impl.h"
+
 #include
+#include
-#include "./diopirt_impl.h"
 #include "csrc_dipu/profiler/profiler.h"
 
 namespace diopihelper = dipu::diopi_helper;
 
@@ -12,107 +13,126 @@ extern "C" {
 
 static char diopiVersion[256] = {0};
 
-DIOPI_RT_API const char* diopiGetVersion() {
-    static bool inited = false;
-    if (!inited) {
-        inited = true;
-        snprintf(diopiVersion, sizeof(diopiVersion), "DIOPI Version: %d.%d.%d", DIOPI_VER_MAJOR, DIOPI_VER_MINOR, DIOPI_VER_PATCH);
-    }
-    return diopiVersion;
+DIOPI_RT_API const char *diopiGetVersion() {
+  static bool inited = false;
+  if (!inited) {
+    inited = true;
+    snprintf(diopiVersion, sizeof(diopiVersion), "DIOPI Version: %d.%d.%d",
+             DIOPI_VER_MAJOR, DIOPI_VER_MINOR, DIOPI_VER_PATCH);
+  }
+  return diopiVersion;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorData(diopiTensorHandle_t pth, void** pptr) {
-    *pptr = (reinterpret_cast<at::Tensor*>(pth))->data_ptr();
-    return diopiSuccess;
+DIOPI_RT_API diopiError_t diopiGetTensorData(diopiTensorHandle_t pth,
+                                             void **pptr) {
+  *pptr = (reinterpret_cast<at::Tensor *>(pth))->data_ptr();
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorDataConst(diopiConstTensorHandle_t pth, const void** pptr) {
-    *pptr = (reinterpret_cast<const at::Tensor*>(pth))->data_ptr();
-    return diopiSuccess;
+DIOPI_RT_API diopiError_t diopiGetTensorDataConst(diopiConstTensorHandle_t pth,
+                                                  const void **pptr) {
+  *pptr = (reinterpret_cast<const at::Tensor *>(pth))->data_ptr();
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorShape(diopiConstTensorHandle_t pth, diopiSize_t* size) {
-    const at::Tensor* ptr = reinterpret_cast<const at::Tensor*>(pth);
-    *size = diopiSize_t{ptr->sizes().data(), static_cast<int64_t>(ptr->dim())};
-    return diopiSuccess;
+DIOPI_RT_API diopiError_t diopiGetTensorShape(diopiConstTensorHandle_t pth,
+                                              diopiSize_t *size) {
+  const at::Tensor *ptr = reinterpret_cast<const at::Tensor *>(pth);
+  *size = diopiSize_t{ptr->sizes().data(), static_cast<int64_t>(ptr->dim())};
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorStride(diopiConstTensorHandle_t pth, diopiSize_t* stride) {
-    const at::Tensor* ptr = reinterpret_cast<const at::Tensor*>(pth);
-    *stride = diopiSize_t{ptr->strides().data(), static_cast<int64_t>(ptr->dim())};
-    return diopiSuccess;
+DIOPI_RT_API diopiError_t diopiGetTensorStride(diopiConstTensorHandle_t pth,
+                                               diopiSize_t *stride) {
+  const at::Tensor *ptr = reinterpret_cast<const at::Tensor *>(pth);
+  *stride =
+      diopiSize_t{ptr->strides().data(), static_cast<int64_t>(ptr->dim())};
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorDtype(diopiConstTensorHandle_t pth, diopiDtype_t* dtype) {
-    const at::Tensor* ptr = reinterpret_cast<const at::Tensor*>(pth);
-    *dtype = diopihelper::toDiopiDtype(ptr->scalar_type());
-    return diopiSuccess;
+DIOPI_RT_API diopiError_t diopiGetTensorDtype(diopiConstTensorHandle_t pth,
+                                              diopiDtype_t *dtype) {
+  const at::Tensor *ptr = reinterpret_cast<const at::Tensor *>(pth);
+  *dtype = diopihelper::toDiopiDtype(ptr->scalar_type());
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorDevice(diopiConstTensorHandle_t pth, diopiDevice_t* device) {
-    const at::Tensor* ptr = reinterpret_cast<const at::Tensor*>(pth);
-    *device = (ptr->is_cpu() ? diopi_host : diopi_device);
-    return diopiSuccess;
+DIOPI_RT_API diopiError_t diopiGetTensorDevice(diopiConstTensorHandle_t pth,
+                                               diopiDevice_t *device) {
+  const at::Tensor *ptr = reinterpret_cast<const at::Tensor *>(pth);
+  *device = (ptr->is_cpu() ? diopi_host : diopi_device);
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorNumel(diopiConstTensorHandle_t pth, int64_t* numel) {
-    if (pth == nullptr) {
-        *numel = 0;
-        return diopiSuccess;
-    }
-
-    const at::Tensor* ptr = reinterpret_cast<const at::Tensor*>(pth);
-    *numel = ptr->numel();
+DIOPI_RT_API diopiError_t diopiGetTensorNumel(diopiConstTensorHandle_t pth,
+                                              int64_t *numel) {
+  if (pth == nullptr) {
+    *numel = 0;
     return diopiSuccess;
+  }
+
+  const at::Tensor *ptr = reinterpret_cast<const at::Tensor *>(pth);
+  *numel = ptr->numel();
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorElemSize(diopiConstTensorHandle_t pth, int64_t* elemsize) {
-    const at::Tensor* ptr = reinterpret_cast<const at::Tensor*>(pth);
-    diopiDtype_t dtype;
-    auto ret = diopiGetTensorDtype(pth, &dtype);
-    if (ret != diopiSuccess) return ret;
+DIOPI_RT_API diopiError_t diopiGetTensorElemSize(diopiConstTensorHandle_t pth,
+                                                 int64_t *elemsize) {
+  const at::Tensor *ptr = reinterpret_cast<const at::Tensor *>(pth);
+  diopiDtype_t dtype;
+  auto ret = diopiGetTensorDtype(pth, &dtype);
+  if (ret != diopiSuccess) return ret;
 
-    *elemsize = diopihelper::getElemSize(dtype);
-    return diopiSuccess;
+  *elemsize = diopihelper::getElemSize(dtype);
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetStream(diopiContextHandle_t ctx, diopiStreamHandle_t* stream) {
-    *stream = ctx->stream;
-    return diopiSuccess;
+DIOPI_RT_API diopiError_t diopiGetStream(diopiContextHandle_t ctx,
+                                         diopiStreamHandle_t *stream) {
+  *stream = ctx->stream;
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiRequireTensor(
-    diopiContextHandle_t ctx, diopiTensorHandle_t* tensor,
-    const diopiSize_t* size, const diopiSize_t* stride,
-    const diopiDtype_t dtype, const diopiDevice_t device) {
-    // TORCH_CHECK(tensor != nullptr && *tensor == nullptr, "invalid parameter tensor");
-    at::IntArrayRef at_dims(size->data, size->len);
-    caffe2::TypeMeta at_type = diopihelper::toATenType(dtype);
-    c10::DeviceType at_device = diopihelper::toATenDevice(device);
-    auto options = at::TensorOptions(at_device).dtype(at_type);
-    at::Tensor t;
-    if (stride) {
-        at::IntArrayRef at_stride(stride->data, stride->len);
-        t = at::empty_strided(at_dims, at_stride, options);
-    } else {
-        t = at::empty(at_dims, options);
-    }
-
-    ctx->arrays.emplace_back(std::move(t));
-    *tensor = reinterpret_cast<diopiTensorHandle_t>(&(ctx->arrays.back()));
-    return diopiSuccess;
+DIOPI_RT_API diopiError_t diopiRequireTensor(diopiContextHandle_t ctx,
+                                             diopiTensorHandle_t *tensor,
+                                             const diopiSize_t *size,
+                                             const diopiSize_t *stride,
+                                             const diopiDtype_t dtype,
+                                             const diopiDevice_t device) {
+  // TORCH_CHECK(tensor != nullptr && *tensor == nullptr, "invalid parameter
+  // tensor");
+  at::IntArrayRef at_dims(size->data, size->len);
+  caffe2::TypeMeta at_type = diopihelper::toATenType(dtype);
+  c10::DeviceType at_device = diopihelper::toATenDevice(device);
+  auto options = at::TensorOptions(at_device).dtype(at_type);
+  at::Tensor t;
+  if (stride) {
+    at::IntArrayRef at_stride(stride->data, stride->len);
+    t = at::empty_strided(at_dims, at_stride, options);
+  } else {
+    t = at::empty(at_dims, options);
+  }
+
+  ctx->arrays.emplace_back(std::move(t));
+  *tensor = reinterpret_cast<diopiTensorHandle_t>(&(ctx->arrays.back()));
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiRequireBuffer(
-    diopiContextHandle_t ctx, diopiTensorHandle_t* tensor,
-    int64_t num_bytes, diopiDevice_t device) {
-    diopiSize_t size{&num_bytes, 1};
-    return diopiRequireTensor(ctx, tensor, &size, nullptr, diopi_dtype_int8, device);
+DIOPI_RT_API diopiError_t diopiRequireBuffer(diopiContextHandle_t ctx,
+                                             diopiTensorHandle_t *tensor,
+                                             int64_t num_bytes,
+                                             diopiDevice_t device) {
+  diopiSize_t size{&num_bytes, 1};
+  return diopiRequireTensor(ctx, tensor, &size, nullptr, diopi_dtype_int8,
+                            device);
 }
 
-DIOPI_RT_API diopiError_t diopiGeneratorGetState(diopiContextHandle_t ctx, diopiConstGeneratorHandle_t th, diopiTensorHandle_t *state) {
-    const at::Generator* generator = reinterpret_cast<const at::Generator*>(th);
-    dipu::DIPUGeneratorImpl* gen_impl = at::check_generator<dipu::DIPUGeneratorImpl>(*generator);
+DIOPI_RT_API diopiError_t diopiGeneratorGetState(diopiContextHandle_t ctx,
+                                                 diopiConstGeneratorHandle_t th,
+                                                 diopiTensorHandle_t *state) {
+  const at::Generator *generator = reinterpret_cast<const at::Generator *>(th);
+  dipu::DIPUGeneratorImpl *gen_impl =
+      at::check_generator<dipu::DIPUGeneratorImpl>(*generator);
 
   at::Tensor tensor;
   {
@@ -125,10 +145,12 @@ DIOPI_RT_API diopiError_t diopiGeneratorGetState(diopiContextHandle_t ctx, diopi
   return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGeneratorSetState(diopiGeneratorHandle_t th, diopiConstTensorHandle_t new_state) {
-    at::Generator* generator = reinterpret_cast<at::Generator*>(th);
-    dipu::DIPUGeneratorImpl* gen_impl = at::check_generator<dipu::DIPUGeneratorImpl>(*generator);
-    const at::Tensor* ptr = reinterpret_cast<const at::Tensor*>(new_state);
+DIOPI_RT_API diopiError_t diopiGeneratorSetState(
+    diopiGeneratorHandle_t th, diopiConstTensorHandle_t new_state) {
+  at::Generator *generator = reinterpret_cast<at::Generator *>(th);
+  dipu::DIPUGeneratorImpl *gen_impl =
+      at::check_generator<dipu::DIPUGeneratorImpl>(*generator);
+  const at::Tensor *ptr = reinterpret_cast<const at::Tensor *>(new_state);
 
   {
     std::lock_guard<std::mutex> lock(gen_impl->mutex_);
@@ -138,18 +160,19 @@ DIOPI_RT_API diopiError_t diopiGeneratorSetState(diopiGeneratorHandle_t th, diop
   return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiRecordStart(const char* record_name, void** record) {
-    *record = new RecordBlockCreator(record_name);
-    return diopiSuccess;
+DIOPI_RT_API diopiError_t diopiRecordStart(const char *record_name,
+                                           void **record) {
+  *record = new RecordBlockCreator(record_name);
+  return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiRecordEnd(void** record) {
-    TORCH_CHECK(record != nullptr, "invalid parameter record_function");
-    auto dipu_record_block = static_cast<RecordBlockCreator*>(*record);
-    dipu_record_block->end();
-    delete dipu_record_block;
-    *record = nullptr;
-    return diopiSuccess;
+DIOPI_RT_API diopiError_t diopiRecordEnd(void **record) {
+  TORCH_CHECK(record != nullptr, "invalid parameter record_function");
+  auto dipu_record_block = static_cast<RecordBlockCreator *>(*record);
+  dipu_record_block->end();
+  delete dipu_record_block;
+  *record = nullptr;
+  return diopiSuccess;
 }
 
 }  // extern "C"
diff --git a/dipu/torch_dipu/csrc_dipu/diopirt/diopirt_impl.h b/dipu/torch_dipu/csrc_dipu/diopirt/diopirt_impl.h
index 2aa68e82d..0d4acff72 100644
--- a/dipu/torch_dipu/csrc_dipu/diopirt/diopirt_impl.h
+++ b/dipu/torch_dipu/csrc_dipu/diopirt/diopirt_impl.h
@@ -2,6 +2,7 @@
 #pragma once
 
 #include
+
 #include
 #include
 #include
@@ -16,12 +17,12 @@ using deviceStream_t = dipu::deviceStream_t;
 extern "C" {
 
 struct diopiContext {
-    deviceStream_t stream;
-    // 1. use arrays to hold tensor that avoid tensor deleting when leaving scope
-    // 2. The address of each array must be fixed, so use list instead of vector
-    std::list<at::Tensor> arrays;
+  deviceStream_t stream;
+  // 1. use arrays to hold tensors so they are not deleted when leaving scope
+  // 2. each element's address must stay fixed, so use list instead of vector
+  std::list<at::Tensor> arrays;
 
-    explicit diopiContext(const deviceStream_t& s) : stream(s) {}
+  explicit diopiContext(const deviceStream_t &s) : stream(s) {}
 };
 
 }  // extern "C"
 
@@ -30,16 +31,19 @@ namespace dipu {
 
 namespace diopi_helper {
 
-::diopiTensorHandle_t toDiopiTensorHandle(at::Tensor& tensor);
-::diopiConstTensorHandle_t toDiopiTensorHandle(const at::Tensor& tensor);
-::diopiConstTensorHandle_t toDiopiTensorHandle(const at::Tensor* tensor);
-::diopiConstTensorHandle_t toDiopiTensorHandle(const c10::optional<at::Tensor>& tensor);
+::diopiTensorHandle_t toDiopiTensorHandle(at::Tensor &tensor);
+::diopiConstTensorHandle_t toDiopiTensorHandle(const at::Tensor &tensor);
+::diopiConstTensorHandle_t toDiopiTensorHandle(const at::Tensor *tensor);
+::diopiConstTensorHandle_t toDiopiTensorHandle(
+    const c10::optional<at::Tensor> &tensor);
 
-::diopiGeneratorHandle_t toDiopiGeneratorHandle(at::Generator& generator);
-::diopiGeneratorHandle_t toDiopiGeneratorHandle(c10::optional<at::Generator>& generator);
+::diopiGeneratorHandle_t toDiopiGeneratorHandle(at::Generator &generator);
+::diopiGeneratorHandle_t toDiopiGeneratorHandle(
+    c10::optional<at::Generator> &generator);
 
-::diopiScalar_t toDiopiScalar(const at::Scalar& scalar);
-::diopiScalar_t toDiopiScalar(const at::Scalar& scalar, const c10::ScalarType& type);
+::diopiScalar_t toDiopiScalar(const at::Scalar &scalar);
+::diopiScalar_t toDiopiScalar(const at::Scalar &scalar,
+                              const c10::ScalarType &type);
 
 ::diopiDtype_t toDiopiDtype(c10::ScalarType type);
 
@@ -48,9 +52,9 @@ int64_t getElemSize(::diopiDtype_t dt);
 
 c10::DeviceType toATenDevice(::diopiDevice_t device);
 
-::diopiSize_t toDiopiSize(const at::OptionalIntArrayRef& dim);
+::diopiSize_t toDiopiSize(const at::OptionalIntArrayRef &dim);
 
-::diopiRoundMode_t toDiopiRoundMode(const std::string& rounding_mode);
+::diopiRoundMode_t toDiopiRoundMode(const std::string &rounding_mode);
 
 }  // namespace diopi_helper
 
diff --git a/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp b/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp
index d0e140364..cb002939e 100644
--- a/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp
+++ b/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp
@@ -5,28 +5,32 @@ namespace profile {
 
 using libkineto::DeviceActivityInterface;
 
-CorrelationIDManager& CorrelationIDManager::instance() {
+CorrelationIDManager &CorrelationIDManager::instance() {
   static CorrelationIDManager instance;
   return instance;
 }
 
-void CorrelationIDManager::pushCorrelationID(uint64_t id, DeviceActivityInterface::CorrelationFlowType type) {
-    external_ids_[type].emplace_back(id);
-    type_.push_back(type);
+void CorrelationIDManager::pushCorrelationID(
+    uint64_t id, DeviceActivityInterface::CorrelationFlowType type) {
+  external_ids_[type].emplace_back(id);
+  type_.push_back(type);
 }
 
-void CorrelationIDManager::popCorrelationID(DeviceActivityInterface::CorrelationFlowType type) {
-    external_ids_[type].pop_back();
-    type_.pop_back();
+void CorrelationIDManager::popCorrelationID(
+    DeviceActivityInterface::CorrelationFlowType type) {
+  external_ids_[type].pop_back();
+  type_.pop_back();
 }
 
 uint64_t CorrelationIDManager::getCorrelationID() const {
-    DeviceActivityInterface::CorrelationFlowType type = type_.back();
-    return external_ids_[type].back();
+  DeviceActivityInterface::CorrelationFlowType type = type_.back();
+  return external_ids_[type].back();
 }
 
-thread_local std::deque<uint64_t> CorrelationIDManager::external_ids_[DeviceActivityInterface::CorrelationFlowType::End];
-thread_local std::deque<DeviceActivityInterface::CorrelationFlowType> CorrelationIDManager::type_;
+thread_local std::deque<uint64_t> CorrelationIDManager::external_ids_
+    [DeviceActivityInterface::CorrelationFlowType::End];
+thread_local std::deque<DeviceActivityInterface::CorrelationFlowType>
+    CorrelationIDManager::type_;
 
 }  // namespace profile
 }  // namespace dipu
\ No newline at end of file
diff --git a/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.h b/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.h
index f4e2362a4..80816a9f8 100644
--- a/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.h
+++ b/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.h
@@ -1,31 +1,36 @@
 #pragma once
 
-#include
-#include
-
 #include
+#include
+#include
 
 namespace dipu {
 namespace profile {
 
 class CorrelationIDManager {
-public:
-    CorrelationIDManager(const CorrelationIDManager&) = delete;
-    CorrelationIDManager& operator=(const CorrelationIDManager&) = delete;
-
-    // CorrelationIDManager designed as a singleton
-    static CorrelationIDManager& instance();
-
-    void pushCorrelationID(uint64_t id, libkineto::DeviceActivityInterface::CorrelationFlowType type);
-    void popCorrelationID(libkineto::DeviceActivityInterface::CorrelationFlowType type);
-    uint64_t getCorrelationID() const;
-
-private:
-    CorrelationIDManager() = default;
-
-private:
-    thread_local static std::deque<uint64_t> external_ids_[libkineto::DeviceActivityInterface::CorrelationFlowType::End];
-    thread_local static std::deque<libkineto::DeviceActivityInterface::CorrelationFlowType> type_;
+ public:
+  CorrelationIDManager(const CorrelationIDManager &) = delete;
+  CorrelationIDManager &operator=(const CorrelationIDManager &) = delete;
+
+  // CorrelationIDManager designed as a singleton
+  static CorrelationIDManager &instance();
+
+  void pushCorrelationID(
+      uint64_t id,
+      libkineto::DeviceActivityInterface::CorrelationFlowType type);
+  void popCorrelationID(
+      libkineto::DeviceActivityInterface::CorrelationFlowType type);
+  uint64_t getCorrelationID() const;
+
+ private:
+  CorrelationIDManager() = default;
+
+ private:
+  thread_local static std::deque<uint64_t> external_ids_
+      [libkineto::DeviceActivityInterface::CorrelationFlowType::End];
+  thread_local static std::deque<
+      libkineto::DeviceActivityInterface::CorrelationFlowType>
+      type_;
 };
 
 }  // namespace profile
diff --git a/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp b/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp
index 87c3a9142..48c93066f 100644
--- a/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp
+++ b/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp
@@ -1,7 +1,8 @@
 #include "DIPUDeviceActivity.h"
 
-#include
 #include
+#include
+
 #include
 
 #include "CorrelationIDManager.h"
@@ -17,23 +18,28 @@ DIPUDeviceActivity::~DIPUDeviceActivity() {
   disableActivities(std::set<libkineto::ActivityType>());
 }
 
-DIPUDeviceActivity& DIPUDeviceActivity::instance() {
+DIPUDeviceActivity &DIPUDeviceActivity::instance() {
   static DIPUDeviceActivity instance;
   return instance;
 }
 
-void DIPUDeviceActivity::pushCorrelationID(uint64_t id, DeviceActivityInterface::CorrelationFlowType type) {
+void DIPUDeviceActivity::pushCorrelationID(
+    uint64_t id, DeviceActivityInterface::CorrelationFlowType type) {
   CorrelationIDManager::instance().pushCorrelationID(id, type);
 }
 
-void DIPUDeviceActivity::popCorrelationID(DeviceActivityInterface::CorrelationFlowType type) {
+void DIPUDeviceActivity::popCorrelationID(
+    DeviceActivityInterface::CorrelationFlowType type) {
   CorrelationIDManager::instance().popCorrelationID(type);
 }
 
-void DIPUDeviceActivity::enableActivities(const std::set<libkineto::ActivityType>& selectedActivities) {}
+void DIPUDeviceActivity::enableActivities(
+    const std::set<libkineto::ActivityType>
&selectedActivities) {} -void DIPUDeviceActivity::disableActivities(const std::set& selectedActivities) { - if (selectedActivities.find(libkineto::ActivityType::CONCURRENT_KERNEL) != selectedActivities.end()) { +void DIPUDeviceActivity::disableActivities( + const std::set &selectedActivities) { + if (selectedActivities.find(libkineto::ActivityType::CONCURRENT_KERNEL) != + selectedActivities.end()) { setProfileOpen(false); } } @@ -45,13 +51,13 @@ void DIPUDeviceActivity::clearActivities() { } int32_t DIPUDeviceActivity::processActivities( - libkineto::ActivityLogger& logger, - std::function linkedActivity, + libkineto::ActivityLogger &logger, + std::function linkedActivity, int64_t startTime, int64_t endTime) { FlushAllRecords(); auto records = RecordsImpl::get().getAllRecordList(); - for (const auto& record : records) { + for (const auto &record : records) { GenericTraceActivity act; act.startTime = record.begin / 1000; act.endTime = record.end / 1000; @@ -74,8 +80,9 @@ int32_t DIPUDeviceActivity::processActivities( logger.handleGenericActivity(act); } - std::map, libkineto::ResourceInfo> resource_infos = RecordsImpl::get().getResourceInfo(); - for (const auto& kv: resource_infos) { + std::map, libkineto::ResourceInfo> + resource_infos = RecordsImpl::get().getResourceInfo(); + for (const auto &kv : resource_infos) { logger.handleResourceInfo(kv.second, startTime); } @@ -91,6 +98,7 @@ void DIPUDeviceActivity::setMaxBufferSize(int32_t size) {} namespace libkineto { -DeviceActivityInterface* device_activity_singleton = &dipu::profile::DIPUDeviceActivity::instance(); +DeviceActivityInterface *device_activity_singleton = + &dipu::profile::DIPUDeviceActivity::instance(); } // namespace libkineto \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.h b/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.h index e7f02e3af..f7ea1878c 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.h +++ b/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.h @@ -1,42 +1,49 @@ #pragma once +#include +#include #include #include -#include -#include - namespace dipu { namespace profile { class DIPUDeviceActivity : public libkineto::DeviceActivityInterface { -public: - ~DIPUDeviceActivity() override; - DIPUDeviceActivity(const DIPUDeviceActivity&) = delete; - DIPUDeviceActivity& operator=(const DIPUDeviceActivity&) = delete; - - // DIPUDeviceActivity designed as a singleton - static DIPUDeviceActivity& instance(); - - void pushCorrelationID(uint64_t id, libkineto::DeviceActivityInterface::CorrelationFlowType type) override; - void popCorrelationID(libkineto::DeviceActivityInterface::CorrelationFlowType type) override; - - void enableActivities(const std::set& selected_activities) override; - void disableActivities(const std::set& selected_activities) override; - void clearActivities() override; - int32_t processActivities(libkineto::ActivityLogger& logger, - std::function linked_activity, - int64_t start_time, int64_t end_time) override; - - void teardownContext() override; - void setMaxBufferSize(int32_t size) override; - -private: - DIPUDeviceActivity() = default; - -private: - std::unordered_map> cpu_activities_; - std::unordered_map> device_activities_; + public: + ~DIPUDeviceActivity() override; + DIPUDeviceActivity(const DIPUDeviceActivity &) = delete; + DIPUDeviceActivity &operator=(const DIPUDeviceActivity &) = delete; + + // DIPUDeviceActivity designed as a singleton + static DIPUDeviceActivity &instance(); + + void pushCorrelationID( + uint64_t 
id, + libkineto::DeviceActivityInterface::CorrelationFlowType type) override; + void popCorrelationID( + libkineto::DeviceActivityInterface::CorrelationFlowType type) override; + + void enableActivities( + const std::set &selected_activities) override; + void disableActivities( + const std::set &selected_activities) override; + void clearActivities() override; + int32_t processActivities( + libkineto::ActivityLogger &logger, + std::function linked_activity, + int64_t start_time, int64_t end_time) override; + + void teardownContext() override; + void setMaxBufferSize(int32_t size) override; + + private: + DIPUDeviceActivity() = default; + + private: + std::unordered_map> + cpu_activities_; + std::unordered_map> + device_activities_; }; } // namespace profile diff --git a/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp b/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp index 83dbce3dd..80a76856d 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp @@ -1,16 +1,15 @@ #include "collection.h" #include +#include #include +#include #include #include #include #include #include -#include -#include - #include #include #include @@ -30,31 +29,32 @@ namespace profile { constexpr bool kKinetoAvailable{true}; -using torch::profiler::impl::AppendOnlyList; -using torch::profiler::impl::ProfilerConfig; -using torch::profiler::impl::KinetoObserverContext; -using torch::profiler::impl::kineto::DeviceAndResource; using torch::profiler::perf_counters_t; +using torch::profiler::impl::ActivityType; +using torch::profiler::impl::AppendOnlyList; using torch::profiler::impl::approx_time_t; -using torch::profiler::impl::Result; -using torch::profiler::impl::python_tracer::PythonTracerBase; -using torch::profiler::impl::python_tracer::CompressedEvent; -using torch::profiler::impl::RawTensorMetadata; -using torch::profiler::impl::TensorMetadata; +using torch::profiler::impl::ExtraFields; +using torch::profiler::impl::KinetoObserverContext; using torch::profiler::impl::op_input_t; -using torch::profiler::impl::stacksToStr; +using torch::profiler::impl::ProfilerConfig; using torch::profiler::impl::ProfilerState; -using torch::profiler::impl::ExtraFields; -using torch::profiler::impl::kineto::kineto_ids; -using torch::profiler::impl::ActivityType; -using torch::profiler::impl::kineto::interface_trace_t; +using torch::profiler::impl::RawTensorMetadata; +using torch::profiler::impl::Result; +using torch::profiler::impl::stacksToStr; +using torch::profiler::impl::TensorMetadata; using torch::profiler::impl::kineto::ActivityTraceWrapper; +using torch::profiler::impl::kineto::DeviceAndResource; +using torch::profiler::impl::kineto::interface_trace_t; +using torch::profiler::impl::kineto::kineto_ids; +using torch::profiler::impl::python_tracer::CompressedEvent; +using torch::profiler::impl::python_tracer::PythonTracerBase; using result_ptr_t = std::shared_ptr; -using trace_ptr_t = std::unique_ptr; +using trace_ptr_t = + std::unique_ptr; void DIPUInputOutputEncoder::push(c10::ArrayRef values) { - for (const auto& value : values) { + for (const auto &value : values) { if (value.isTensor()) { push(value.toTensor()); } else if (value.isScalar()) { @@ -66,7 +66,7 @@ void DIPUInputOutputEncoder::push(c10::ArrayRef values) { ivalues_.emplace_back(value); } else if (value.isTensorList()) { tags_.emplace_back(Tag::TensorListBegin); - for (const auto& t : value.toTensorList()) { + for (const auto &t : value.toTensorList()) { push(t); } 
tags_.emplace_back(Tag::TERMINATOR); @@ -77,8 +77,8 @@ void DIPUInputOutputEncoder::push(c10::ArrayRef values) { tags_.emplace_back(Tag::TERMINATOR); } -void DIPUInputOutputEncoder::push(const at::Tensor& t) { - if (t.defined() && !t.is_nested()) { // TODO fix nested sizes +void DIPUInputOutputEncoder::push(const at::Tensor &t) { + if (t.defined() && !t.is_nested()) { // TODO fix nested sizes tags_.emplace_back(Tag::Tensor); tensor_metadata_.emplace_back(t); tensor_sizes_strides_.copy(t.sizes()); @@ -93,13 +93,12 @@ void DIPUInputOutputEncoder::push(const at::Tensor& t) { // This is a custom-iterator-like getter to obtain input shapes and dtypes. auto DIPUInputOutputEncoder::getNextShapesAndDtypes() { - return [this, - tag_it = tags_.begin(), + return [this, tag_it = tags_.begin(), tensor_metadata_it = tensor_metadata_.begin(), tensor_size_strides_it = tensor_sizes_strides_.begin(), ivals_it = ivalues_.begin()]() mutable { auto decode_tensor = [&]() -> TensorMetadata { - const auto& raw_metadata = *tensor_metadata_it++; + const auto &raw_metadata = *tensor_metadata_it++; std::vector sizes; std::vector strides; for (C10_UNUSED const auto _ : c10::irange(raw_metadata.dim_)) { @@ -164,14 +163,15 @@ void DIPUInputOutputEncoder::clear() { // | Correlation ID tracking (OpList & EventBlock) | // --------------------------------------------------- template -DIPUThreadLocalSubqueue::TorchOpStorage::EventBlock::EventBlock() { +DIPUThreadLocalSubqueue::TorchOpStorage::EventBlock::EventBlock() { static std::atomic counter_{0}; id_start_ = 1 + ChunkSize * counter_++; } template -std::pair DIPUThreadLocalSubqueue:: - TorchOpStorage::OpList::emplace_back(Args&&... args) { +std::pair +DIPUThreadLocalSubqueue::TorchOpStorage::OpList::emplace_back(Args &&...args) { maybe_grow(); *next_ = {std::forward(args)...}; auto corr_id = buffer_last_->correlation_id(next_); @@ -179,15 +179,15 @@ std::pair DIPUThreadLocalSubqueue:: } uint64_t DIPUThreadLocalSubqueue::TorchOpStorage::OpList::correlationID( - const OpList::Iterator& e) { + const OpList::Iterator &e) { return e.address().first->correlation_id(&*e); } template -uint64_t DIPUThreadLocalSubqueue::TorchOpStorage::EventBlock:: - correlation_id(const T* ptr) const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - ptr >= this->data() && ptr < this->data() + ChunkSize); +uint64_t DIPUThreadLocalSubqueue::TorchOpStorage::EventBlock< + T, ChunkSize>::correlation_id(const T *ptr) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ptr >= this->data() && + ptr < this->data() + ChunkSize); return id_start_ + (ptr - this->data()); } @@ -195,16 +195,12 @@ uint64_t DIPUThreadLocalSubqueue::TorchOpStorage::EventBlock:: // | Collection (Observer logic) | // --------------------------------- std::unique_ptr DIPUThreadLocalSubqueue::begin_op( - const at::RecordFunction& fn) { - KinetoObserverContext::Event* event; + const at::RecordFunction &fn) { + KinetoObserverContext::Event *event; uint64_t corr_id; std::tie(event, corr_id) = torch_ops_.op_events_.emplace_back( - fn.seqNr(), - fn.forwardThreadId(), - fn.scope(), - fn.isAsync(), - fn.debugHandle(), - fn.name()); + fn.seqNr(), fn.forwardThreadId(), fn.scope(), fn.isAsync(), + fn.debugHandle(), fn.name()); if (config_.report_input_shapes) { torch_ops_.inputs_outputs_.push(fn.inputs()); } @@ -218,7 +214,8 @@ std::unique_ptr DIPUThreadLocalSubqueue::begin_op( // backward nodes source range corresponds to the forward node // TODO: consider using C++ stack trace if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - auto 
cs = torch::profiler::impl::prepareCallstack(torch::jit::currentCallstack()); + auto cs = + torch::profiler::impl::prepareCallstack(torch::jit::currentCallstack()); torch_ops_.jit_stack_.emplace_back(callstackStr(cs)); } if (config_.with_modules && @@ -247,12 +244,10 @@ std::unique_ptr DIPUThreadLocalSubqueue::begin_op( namespace { template struct StealOrDefault { - StealOrDefault(T& container) + StealOrDefault(T &container) : container_{container}, it_{container.begin()} {} - ~StealOrDefault() { - container_.get().clear(); - } + ~StealOrDefault() { container_.get().clear(); } typename T::Iterator::value_type operator()() { if (it_.exhausted()) { @@ -267,19 +262,18 @@ struct StealOrDefault { std::reference_wrapper container_; typename T::Iterator it_; }; -} // namespace +} // namespace void DIPUThreadLocalSubqueue::TorchOpStorage::materialize( - std::vector>& out, + std::vector> &out, const std::function time_converter, - const uint64_t tid, - const DeviceAndResource& kineto_info) { + const uint64_t tid, const DeviceAndResource &kineto_info) { // Plumb Autograd info to the top level annotation. auto it = op_events_.begin(); for (C10_UNUSED const auto _ : c10::irange(static_cast(op_events_.size()) - 1)) { - auto& first = it->basic_fields_; - auto& second = (++it)->basic_fields_; + auto &first = it->basic_fields_; + auto &second = (++it)->basic_fields_; if (first.scope_ == at::RecordScope::FUNCTION && second.scope_ == at::RecordScope::BACKWARD_FUNCTION && first.name_.rfind("autograd::engine::evaluate_function: ", 0) == 0) { @@ -293,8 +287,8 @@ void DIPUThreadLocalSubqueue::TorchOpStorage::materialize( // particular, Windows will add a "struct " prefix. const std::string accumulate_grad = "torch::autograd::AccumulateGrad"; const std::string windows_pattern = std::string("struct ") + accumulate_grad; - for (auto& event : op_events_) { - auto& name = event.basic_fields_.name_; + for (auto &event : op_events_) { + auto &name = event.basic_fields_.name_; auto position = name.find(windows_pattern); if (position != std::string::npos) { name.replace(position, windows_pattern.size(), accumulate_grad); @@ -322,8 +316,8 @@ void DIPUThreadLocalSubqueue::TorchOpStorage::materialize( event->allow_tf32_cublas_, std::move(event->counters_)}; - out.emplace_back(Result::create( - time_converter(event->start_time_), tid, kineto_info, std::move(e))); + out.emplace_back(Result::create(time_converter(event->start_time_), tid, + kineto_info, std::move(e))); } op_events_.clear(); @@ -334,41 +328,38 @@ namespace { // See `DIPURecordQueue::getSubqueue()` for an overview of this cache. struct SubQueueThreadCache { uint32_t key_; - DIPUThreadLocalSubqueue* ref_; + DIPUThreadLocalSubqueue *ref_; }; // The astute observer will note that this leaves a dangling reference; nothing -// in the teardown of `DIPURecordQueue` or `DIPUThreadLocalSubqueue` clears this value. -// (And the raw pointer in `SubQueueThreadCache` will not extend the lifetime -// of `*ref_`.) This is safe, however, because `getSubqueue` will check +// in the teardown of `DIPURecordQueue` or `DIPUThreadLocalSubqueue` clears this +// value. (And the raw pointer in `SubQueueThreadCache` will not extend the +// lifetime of `*ref_`.) This is safe, however, because `getSubqueue` will check // `sub_queue_cache_.key_` before attempting to access `ref_`, and if `key_` // does not match the DIPURecordQueue's *unique* `id_` it will evict // `sub_queue_cache_` and fall back to a different mechanism. 
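// ---------------------------------------------------------------------------
// Illustrative sketch of the cache-then-lock pattern described in the comment
// above. The names (Registry, Subqueue, CacheEntry, tls_cache) are
// hypothetical stand-ins for DIPURecordQueue / DIPUThreadLocalSubqueue, not
// the real interfaces; the point is only why the possibly-dangling `ref_` is
// safe: it is dereferenced solely when `key_` matches the id of a still-live
// registry.
#include <atomic>
#include <cstdint>
#include <memory>
#include <mutex>
#include <unordered_map>

struct Subqueue { /* per-thread event buffers */ };

struct CacheEntry {
  std::uint32_t key_ = 0;    // id of the registry that filled this entry
  Subqueue* ref_ = nullptr;  // may dangle once that registry is destroyed
};

thread_local CacheEntry tls_cache;

class Registry {
 public:
  Subqueue* getSubqueue(std::uint64_t tid) {
    if (tls_cache.key_ == id_) {
      return tls_cache.ref_;  // fast path: no lock, pointer known to be valid
    }
    // Slow path: first use on this thread, or the cache was filled by an
    // older registry whose id can never equal ours.
    std::lock_guard<std::mutex> guard(mutex_);
    auto& slot = map_[tid];
    if (!slot) {
      slot = std::make_unique<Subqueue>();
    }
    tls_cache = CacheEntry{id_, slot.get()};
    return slot.get();
  }

 private:
  static inline std::atomic<std::uint32_t> next_id_{0};
  const std::uint32_t id_ = ++next_id_;  // unique per registry, never reused
  std::mutex mutex_;
  std::unordered_map<std::uint64_t, std::unique_ptr<Subqueue>> map_;
};
// ---------------------------------------------------------------------------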
std::atomic queue_id_{0}; thread_local SubQueueThreadCache sub_queue_cache_{0, nullptr}; -std::string toString(const ExtraFields& e) { +std::string toString( + const ExtraFields &e) { if (e.module_.has_value()) { - return fmt::format( - "nn.Module: {}_{}", e.module_->cls_name_.str(), e.module_->id_); + return fmt::format("nn.Module: {}_{}", e.module_->cls_name_.str(), + e.module_->id_); } - return fmt::format( - "{}({}): {}", - e.callsite_.filename_.str(), - e.callsite_.line_no_, - e.callsite_.funcname_.str()); + return fmt::format("{}({}): {}", e.callsite_.filename_.str(), + e.callsite_.line_no_, e.callsite_.funcname_.str()); } auto scopeToType(at::RecordScope scope) { return scope == at::RecordScope::USER_SCOPE - ? libkineto::ActivityType::USER_ANNOTATION - : libkineto::ActivityType::CPU_OP; + ? libkineto::ActivityType::USER_ANNOTATION + : libkineto::ActivityType::CPU_OP; } int64_t torchOpEndNS( - const ExtraFields& e, - const bool finished, - const std::weak_ptr& parent) { + const ExtraFields &e, + const bool finished, const std::weak_ptr &parent) { if (finished && e.end_time_ns_ == std::numeric_limits::min()) { auto p = parent.lock(); if (p) { @@ -379,19 +370,18 @@ int64_t torchOpEndNS( } auto kinetoEventCorrelationID( - const ExtraFields& e, - const std::weak_ptr& parent) { + const ExtraFields &e, + const std::weak_ptr &parent) { if (e.correlation_id_) { return e.correlation_id_; } auto p = parent.lock(); return p ? p->correlationID() : 0; } -} // namespace +} // namespace -DIPUThreadLocalSubqueue::DIPUThreadLocalSubqueue( - const uint64_t tid, - const ProfilerConfig& config) +DIPUThreadLocalSubqueue::DIPUThreadLocalSubqueue(const uint64_t tid, + const ProfilerConfig &config) : tid_{tid}, config_{config}, kineto_info_{kineto_ids()} { libkineto::api().activityProfiler().recordThreadInfo(); if (!config_.experimental_config.performance_events.empty()) { @@ -401,9 +391,8 @@ DIPUThreadLocalSubqueue::DIPUThreadLocalSubqueue( } } -DIPURecordQueue::DIPURecordQueue( - const ProfilerConfig& config, - std::set activities) +DIPURecordQueue::DIPURecordQueue(const ProfilerConfig &config, + std::set activities) : id_(++queue_id_), config_{config}, activities_{std::move(activities)} { if (tracePython()) { python_tracer_ = makeTracer(this); @@ -414,7 +403,7 @@ bool DIPURecordQueue::tracePython() const { return config_.with_stack && activities_.count(ActivityType::CPU); } -DIPUThreadLocalSubqueue* DIPURecordQueue::getSubqueue() { +DIPUThreadLocalSubqueue *DIPURecordQueue::getSubqueue() { // In the most common case, a thread will want to write to the same sub-queue // that it wrote to last call. The only time that isn't true is if: // A) The profiler context has ended and we are in a new one. 
@@ -432,7 +421,8 @@ DIPUThreadLocalSubqueue* DIPURecordQueue::getSubqueue() { auto it = sub_queues_.find(tid); if (it == sub_queues_.end()) { it = sub_queues_ - .emplace(tid, std::make_unique(tid, config_)) + .emplace(tid, + std::make_unique(tid, config_)) .first; } @@ -447,37 +437,30 @@ void DIPURecordQueue::stop() { } namespace { -void mark_finished(std::shared_ptr& r) { +void mark_finished(std::shared_ptr &r) { TORCH_INTERNAL_ASSERT(!r->finished_, r->name()); r->finished_ = true; TORCH_INTERNAL_ASSERT(r->endTimeNS() >= r->start_time_ns_, r->name()); } -static constexpr const char* indexKey = "Ev Idx"; +static constexpr const char *indexKey = "Ev Idx"; -void passEventsToKineto( - const std::vector>& results, - uint64_t start_time_us, - uint64_t end_time_us) { +void passEventsToKineto(const std::vector> &results, + uint64_t start_time_us, uint64_t end_time_us) { using namespace torch::profiler::impl::kineto; TraceWrapper cpu_trace(start_time_us, "PyTorch Profiler"); // Generate Kineto events for each event recorded by the PyTorch profiler. for (const auto i : c10::irange(results.size())) { - const auto& e = results[i]; - const auto* activity = cpu_trace.addCPUActivity( - e->name(), - e->kinetoType(), - e->kineto_info_, - e->correlationID(), - e->start_time_ns_ / 1000, - e->endTimeNS() / 1000); + const auto &e = results[i]; + const auto *activity = cpu_trace.addCPUActivity( + e->name(), e->kinetoType(), e->kineto_info_, e->correlationID(), + e->start_time_ns_ / 1000, e->endTimeNS() / 1000); TORCH_INTERNAL_ASSERT(activity || !kKinetoAvailable); if (activity) { addMetadata(activity, indexKey, std::to_string(i)); } - } // Kineto adds the events that it collected. @@ -513,11 +496,10 @@ class TransferEvents { using activity_t = torch::profiler::impl::kineto::activity_t; public: - TransferEvents( - std::vector>& results, - trace_ptr_t& trace) + TransferEvents(std::vector> &results, + trace_ptr_t &trace) : results_{results} { - auto* trace_activities_ptr = trace->get()->activities(); + auto *trace_activities_ptr = trace->get()->activities(); TORCH_INTERNAL_ASSERT(trace_activities_ptr != nullptr); trace_activities_ = *trace_activities_ptr; reassociate(); @@ -526,7 +508,7 @@ class TransferEvents { } private: - static long long extractIndex(const std::string& metadata_json) { + static long long extractIndex(const std::string &metadata_json) { static const auto prefix = fmt::format("\"{}\": ", indexKey); auto pos = metadata_json.find(prefix); return (pos == std::string::npos) ? unmatchedIndex : [&]() { @@ -536,7 +518,7 @@ class TransferEvents { }(); } - std::shared_ptr lookup(const itrace_t* key) { + std::shared_ptr lookup(const itrace_t *key) { if (key == nullptr) { return nullptr; } @@ -562,24 +544,24 @@ class TransferEvents { // Match profiler events with the corresponding kineto events. Kineto may // have moved or copied the activities, so we have to recover the // relationship between `libkineto::ITraceActivity` and `Result`. - for (const auto* activity : trace_activities_) { + for (const auto *activity : trace_activities_) { TORCH_INTERNAL_ASSERT(activity != nullptr); auto e = lookup(activity); if (e != nullptr) { TORCH_INTERNAL_ASSERT(e->kineto_activity_ == nullptr); - e->kineto_activity_ = static_cast(activity); + e->kineto_activity_ = static_cast(activity); } } if (results_.get().size() != kineto_events_.size()) { - TORCH_WARN(fmt::format( - "Failed to recover relationship between all profiler and kineto events: " - "{} vs. 
{} reassociated.", - results_.get().size(), - kineto_events_.size())); + TORCH_WARN( + fmt::format("Failed to recover relationship between all " + "profiler and kineto events: " + "{} vs. {} reassociated.", + results_.get().size(), kineto_events_.size())); } } - std::shared_ptr resultFromActivity(const itrace_t* activity) { + std::shared_ptr resultFromActivity(const itrace_t *activity) { TORCH_INTERNAL_ASSERT(activity != nullptr); // Kineto is inconsistent with types, so we have to cast to int32. @@ -589,7 +571,7 @@ class TransferEvents { auto event = Result::create( activity->timestamp() * 1000, - noTID, // Placeholder + noTID, // Placeholder device_and_resource, ExtraFields{ activity->name(), @@ -607,17 +589,16 @@ class TransferEvents { return event; } - std::shared_ptr toResult(const itrace_t* activity) { + std::shared_ptr toResult(const itrace_t *activity) { auto e = lookup(activity); // Until we are very sure that we can reassociate kineto and profiler // events we need to be very defensive. const auto type = activity->type(); - if (e == nullptr && - (type == libkineto::ActivityType::CPU_OP || - type == libkineto::ActivityType::CPU_INSTANT_EVENT || - type == libkineto::ActivityType::USER_ANNOTATION || - type == libkineto::ActivityType::PYTHON_FUNCTION)) { + if (e == nullptr && (type == libkineto::ActivityType::CPU_OP || + type == libkineto::ActivityType::CPU_INSTANT_EVENT || + type == libkineto::ActivityType::USER_ANNOTATION || + type == libkineto::ActivityType::PYTHON_FUNCTION)) { TORCH_WARN_ONCE( "Detected an event which was likely passed to kineto by the PyTorch " "profiler, but is not present in the set of known events: ", @@ -637,31 +618,30 @@ class TransferEvents { } void extractEventsFromTrace() { - for (const auto* activity : trace_activities_) { + for (const auto *activity : trace_activities_) { auto e = toResult(activity); - const auto* linked_activity = activity->linkedActivity(); + const auto *linked_activity = activity->linkedActivity(); if (e && linked_activity) { e->visit(c10::overloaded( - [&](ExtraFields& i) { + [&](ExtraFields &i) { i.linked_activity_ = toResult(linked_activity); }, - [](auto&) { TORCH_INTERNAL_ASSERT(false); })); + [](auto &) { TORCH_INTERNAL_ASSERT(false); })); } } } - void setKinetoTID( - std::shared_ptr& r, - std::shared_ptr parent) { + void setKinetoTID(std::shared_ptr &r, + std::shared_ptr parent) { r->visit(c10::overloaded( - [&](ExtraFields& i) { + [&](ExtraFields &i) { TORCH_INTERNAL_ASSERT(r->start_tid_ == noTID); r->start_tid_ = parent ? parent->start_tid_ : at::RecordFunction::currentThreadId(); }, - [](auto&) {})); + [](auto &) {})); - for (auto& child : r->children_) { + for (auto &child : r->children_) { setKinetoTID(child, r); } } @@ -669,10 +649,10 @@ class TransferEvents { void setParents() { // First pass: Collect start events and set parent to linked event. 
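// ---------------------------------------------------------------------------
// Minimal sketch of the two-pass flow linking performed below, with a
// simplified Event standing in for the profiler's Result type; field and
// function names are illustrative only. Pass 1 indexes every flow-start event
// by its flow id; pass 2 parents the remaining events of that flow under the
// recorded start.
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

struct Event {
  std::uint64_t flow_id = 0;
  bool flow_start = false;
  std::weak_ptr<Event> parent;
};

void link_flows(std::vector<std::shared_ptr<Event>>& events) {
  // Pass 1: remember which event starts each flow.
  std::unordered_map<std::uint64_t, std::shared_ptr<Event>> flow_map;
  for (auto& e : events) {
    if (e->flow_start) {
      flow_map.emplace(e->flow_id, e);
    }
  }
  // Pass 2: attach every non-start event to its flow's start event.
  for (auto& e : events) {
    if (e->flow_start) continue;
    auto it = flow_map.find(e->flow_id);
    if (it != flow_map.end()) {
      e->parent = it->second;
    }
  }
}
// ---------------------------------------------------------------------------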
ska::flat_hash_map> flow_map; - for (auto& e : results_.get()) { + for (auto &e : results_.get()) { TORCH_INTERNAL_ASSERT(e != nullptr); e->visit(c10::overloaded( - [&](const ExtraFields& i) { + [&](const ExtraFields &i) { if (i.flow.type == libkineto::kLinkAsyncCpuGpu && i.flow.start) { auto inserted = flow_map.insert({i.flow.id, e}); TORCH_INTERNAL_ASSERT(inserted.second); @@ -680,13 +660,13 @@ class TransferEvents { TORCH_INTERNAL_ASSERT(e->parent_.expired()); e->parent_ = i.linked_activity_; }, - [](const auto&) {})); + [](const auto &) {})); } // Second pass - for (auto& e : results_.get()) { + for (auto &e : results_.get()) { e->visit(c10::overloaded( - [&](const ExtraFields& i) { + [&](const ExtraFields &i) { // Flow takes priority over linked event. const auto it = flow_map.find(i.flow.id); if (it != flow_map.end() && @@ -701,11 +681,11 @@ class TransferEvents { mark_finished(e); } }, - [](const auto&) {})); + [](const auto &) {})); } // Set TIDs now that we have established lineage. - for (auto& e : results_.get()) { + for (auto &e : results_.get()) { if (e->parent_.expired()) { setKinetoTID(e, nullptr); } @@ -715,21 +695,17 @@ class TransferEvents { static constexpr long long unmatchedIndex = -1; static constexpr auto noTID = std::numeric_limits::max(); std::reference_wrapper>> results_; - std::vector trace_activities_; - ska::flat_hash_map> kineto_events_; + std::vector trace_activities_; + ska::flat_hash_map> kineto_events_; }; ActivityTraceWrapper stopTrace() { - return ActivityTraceWrapper{ - libkineto::api().activityProfiler().stopTrace() - }; + return ActivityTraceWrapper{libkineto::api().activityProfiler().stopTrace()}; } -trace_ptr_t addKinetoEvents( - std::vector>& results, - uint64_t start_time_us, - uint64_t end_time_us, - const ProfilerConfig& config) { +trace_ptr_t addKinetoEvents(std::vector> &results, + uint64_t start_time_us, uint64_t end_time_us, + const ProfilerConfig &config) { using namespace torch::profiler::impl::kineto; passEventsToKineto(results, start_time_us, end_time_us); @@ -745,26 +721,25 @@ trace_ptr_t addKinetoEvents( } struct ResultGreater { - bool operator()(const result_ptr_t& a, const result_ptr_t& b) const { + bool operator()(const result_ptr_t &a, const result_ptr_t &b) const { return a->endTimeNS() > b->endTimeNS(); } }; -void set_in_tree_building( - std::vector& results, - const bool value) { - for (result_ptr_t& r : results) { +void set_in_tree_building(std::vector &results, + const bool value) { + for (result_ptr_t &r : results) { r->visit(c10::overloaded( - [value](ExtraFields& i) { + [value](ExtraFields &i) { i.in_tree_building_ = value; }, - [&](auto&) { + [&](auto &) { // pass })); } } -void build_tree(std::vector>& sorted_events) { +void build_tree(std::vector> &sorted_events) { set_in_tree_building(sorted_events, true); using op_fields = ExtraFields; @@ -772,28 +747,29 @@ void build_tree(std::vector>& sorted_events) { std::priority_queue, ResultGreater> end_events_; - auto push_event = [&stacks, &end_events_](std::shared_ptr& event) { + auto push_event = [&stacks, &end_events_](std::shared_ptr &event) { // Kineto builds subtrees using correlation ids and flows, so some Kineto // events are already marked finished before the main tree building // algorithm. It's fine to ignore them; the root event of these subtrees // not a Kineto op and will be handled normally. 
- if (c10::holds_alternative>( + if (c10::holds_alternative< + ExtraFields>( event->extra_fields_) && event->finished_) { return; } TORCH_INTERNAL_ASSERT(event->parent_.expired()); - for (const auto& child : event->children_) { + for (const auto &child : event->children_) { TORCH_INTERNAL_ASSERT(child->finished_); } TORCH_INTERNAL_ASSERT(!event->finished_); auto parent_it = stacks.find(event->start_tid_); if (parent_it == stacks.end()) { - auto fwd_tid = event->visit(c10::overloaded( - [](const op_fields& i) { return i.forward_tid_; }, - [](const auto&) -> uint64_t { return 0; })); + auto fwd_tid = event->visit( + c10::overloaded([](const op_fields &i) { return i.forward_tid_; }, + [](const auto &) -> uint64_t { return 0; })); if (fwd_tid) { parent_it = stacks.find(fwd_tid); } @@ -841,7 +817,7 @@ void build_tree(std::vector>& sorted_events) { }; // Stack replay loop. - for (auto& event : sorted_events) { + for (auto &event : sorted_events) { while (!end_events_.empty() && end_events_.top()->endTimeNS() < event->start_time_ns_) { pop_event(end_events_.top()); @@ -864,33 +840,33 @@ void build_tree(std::vector>& sorted_events) { * of its children's adjusted durations (keeping its start time the same) * (adjust all child durations recursively) */ -int64_t adjust_durations_dfs(std::shared_ptr& r) { +int64_t adjust_durations_dfs(std::shared_ptr &r) { if (SOFT_ASSERT(r != nullptr)) { int64_t original_duration = r->endTimeNS() - r->start_time_ns_; - int64_t children_total_duration = std::accumulate( - r->children_.begin(), - r->children_.end(), - 0, - [](int64_t acc, std::shared_ptr& child) { - return acc + adjust_durations_dfs(child); - }); + int64_t children_total_duration = + std::accumulate(r->children_.begin(), r->children_.end(), 0, + [](int64_t acc, std::shared_ptr &child) { + return acc + adjust_durations_dfs(child); + }); if (children_total_duration > original_duration) { r->visit(c10::overloaded( - [&r, &children_total_duration](ExtraFields& i) { + [&r, &children_total_duration]( + ExtraFields &i) { i.end_time_ns_ = r->start_time_ns_ + children_total_duration; }, - [&children_total_duration](ExtraFields& i) { + [&children_total_duration]( + ExtraFields &i) { i.duration_ns_ = children_total_duration; }, - [](ExtraFields& _) { + [](ExtraFields &_) { // Pass- Allocation events can't have children }, - [&](auto&) { - SOFT_ASSERT( - false, - "unexpected event type in mobile profiler adjust_durations_dfs: ", - r->name()); + [&](auto &) { + SOFT_ASSERT(false, + "unexpected event type in mobile profiler " + "adjust_durations_dfs: ", + r->name()); })); return children_total_duration; } else { @@ -908,41 +884,39 @@ int64_t adjust_durations_dfs(std::shared_ptr& r) { that the last one ends at the same time as r * 3) Return r's final end time */ -int64_t adjust_timestamps_dfs( - std::shared_ptr& r, - int64_t new_start_time) { +int64_t adjust_timestamps_dfs(std::shared_ptr &r, + int64_t new_start_time) { if (SOFT_ASSERT(r != nullptr)) { if (r->start_time_ns_ != new_start_time) { // Adjust start time (keeping duration constant) r->visit(c10::overloaded( - [&r, &new_start_time](ExtraFields& i) { + [&r, &new_start_time]( + ExtraFields &i) { i.end_time_ns_ = new_start_time + (i.end_time_ns_ - r->start_time_ns_); }, - [](ExtraFields& i) { + [](ExtraFields &i) { // Pass- We don't need to manually adjust end time for Vulkan events }, - [](ExtraFields& _) { + [](ExtraFields &_) { // Pass- No duration or end time to adjust }, - [&](auto&) { - SOFT_ASSERT( - false, - "unexpected event type in mobile profiler 
adjust_timestamps_dfs: ", - r->name()); + [&](auto &) { + SOFT_ASSERT(false, + "unexpected event type in mobile profiler " + "adjust_timestamps_dfs: ", + r->name()); })); r->start_time_ns_ = new_start_time; } int64_t children_total_duration = std::accumulate( - r->children_.begin(), - r->children_.end(), - 0, - [](int64_t acc, std::shared_ptr& child) { + r->children_.begin(), r->children_.end(), 0, + [](int64_t acc, std::shared_ptr &child) { return acc + (child->endTimeNS() - child->start_time_ns_); }); int64_t child_start_time = r->endTimeNS() - children_total_duration; - for (std::shared_ptr& child : r->children_) { + for (std::shared_ptr &child : r->children_) { child_start_time = adjust_timestamps_dfs(child, child_start_time); } } @@ -955,50 +929,46 @@ int64_t adjust_timestamps_dfs( * - Parent event timelines fully contain their child timelines * - No overlaps in timelines for nodes at the same depth */ -void adjust_timestamps(std::vector>& out) { +void adjust_timestamps(std::vector> &out) { if (out.empty()) { return; } int64_t min_start_time = out[0]->start_time_ns_; - for (std::shared_ptr& r : out) { + for (std::shared_ptr &r : out) { // Only begin traversal for root nodes. if (r->parent_.expired()) { adjust_durations_dfs(r); min_start_time = adjust_timestamps_dfs( - r, - std::max( - r->tag() != torch::profiler::impl::EventType::Vulkan - ? r->start_time_ns_ - : std::numeric_limits::min(), - min_start_time)); + r, std::max(r->tag() != torch::profiler::impl::EventType::Vulkan + ? r->start_time_ns_ + : std::numeric_limits::min(), + min_start_time)); } } } -} // namespace - -std::pair< - std::vector>, - std::unique_ptr> -DIPURecordQueue::getRecords( - std::function time_converter, - uint64_t start_time_us, - uint64_t end_time_us) { +} // namespace + +std::pair>, + std::unique_ptr> +DIPURecordQueue::getRecords(std::function time_converter, + uint64_t start_time_us, uint64_t end_time_us) { auto converter = [&](approx_time_t t) { return t == std::numeric_limits::min() - ? std::numeric_limits::min() - : time_converter(t); + ? 
std::numeric_limits::min() + : time_converter(t); }; std::vector> out; std::vector python_enters; - for (auto& subqueue_it : sub_queues_) { - auto& queue = *subqueue_it.second; - auto materialize = [&](auto& events) { - for (auto& i : events) { + for (auto &subqueue_it : sub_queues_) { + auto &queue = *subqueue_it.second; + auto materialize = [&](auto &events) { + for (auto &i : events) { time_t start_time_ns; if constexpr (std::is_same< std::remove_reference_t, - ExtraFields>::value) { + ExtraFields>::value) { start_time_ns = i.start_time_us_ * 1000; } else { start_time_ns = converter(i.start_time_); @@ -1012,39 +982,40 @@ DIPURecordQueue::getRecords( events.clear(); }; - queue.torch_ops_.materialize( - out, converter, queue.tid(), queue.kineto_info()); + queue.torch_ops_.materialize(out, converter, queue.tid(), + queue.kineto_info()); materialize(queue.backend_events_); - for (auto& i : queue.allocations_) { + for (auto &i : queue.allocations_) { out.emplace_back(Result::create( /*start_time_ns_=*/converter(i.start_time_), /*start_tid_=*/queue.tid(), /*kineto_info_=*/queue.kineto_info(), - /*extra_fields_=*/ExtraFields(i))); + /*extra_fields_=*/ + ExtraFields(i))); } materialize(queue.ooms_); - for (auto& i : queue.py_calls_) { + for (auto &i : queue.py_calls_) { python_enters.push_back( {i.first, queue.tid(), queue.kineto_info(), converter(i.second)}); } } if (python_tracer_) { - for (const auto& i : python_tracer_->getEvents( - converter, python_enters, end_time_us * 1000)) { + for (const auto &i : python_tracer_->getEvents(converter, python_enters, + end_time_us * 1000)) { out.push_back(i); } python_tracer_.reset(); } if (config_.experimental_config.adjust_timestamps) { - std::stable_sort(out.begin(), out.end(), [](const auto& a, const auto& b) { + std::stable_sort(out.begin(), out.end(), [](const auto &a, const auto &b) { return a->start_time_ns_ < b->start_time_ns_; }); build_tree(out); adjust_timestamps(out); - for (auto& r : out) { + for (auto &r : out) { r->parent_.reset(); // Reset these so that second build_tree can happen r->finished_ = false; @@ -1054,7 +1025,7 @@ DIPURecordQueue::getRecords( auto trace = addKinetoEvents(out, start_time_us, end_time_us, config_); - std::stable_sort(out.begin(), out.end(), [](const auto& a, const auto& b) { + std::stable_sort(out.begin(), out.end(), [](const auto &a, const auto &b) { return a->start_time_ns_ < b->start_time_ns_; }); @@ -1066,5 +1037,5 @@ DIPURecordQueue::getRecords( return {out, std::move(trace)}; } -} // namespace profile -} // namespace dipu \ No newline at end of file +} // namespace profile +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/profiler/collection.h b/dipu/torch_dipu/csrc_dipu/profiler/collection.h index 7e42612ca..7f5e6a057 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/collection.h +++ b/dipu/torch_dipu/csrc_dipu/profiler/collection.h @@ -12,11 +12,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include @@ -43,61 +43,70 @@ class DIPUInputOutputEncoder final { enum class Tag { Tensor = 0, UndefinedTensor, - TensorListBegin, // TODO: generalize to other lists. + TensorListBegin, // TODO: generalize to other lists. 
Scalar, Other, TERMINATOR }; - void push(const at::Tensor& t); + void push(const at::Tensor &t); - torch::profiler::impl::AppendOnlyList tags_; - torch::profiler::impl::AppendOnlyList + torch::profiler::impl::AppendOnlyList< + Tag, torch::profiler::impl::IO_ENCODER_DEFAULT_BLOCK_SIZE> + tags_; + torch::profiler::impl::AppendOnlyList< + torch::profiler::impl::RawTensorMetadata, + torch::profiler::impl::IO_ENCODER_DEFAULT_BLOCK_SIZE> tensor_metadata_; - torch::profiler::impl::AppendOnlyList tensor_sizes_strides_; - torch::profiler::impl::AppendOnlyList ivalues_; + torch::profiler::impl::AppendOnlyList< + int64_t, torch::profiler::impl::IO_ENCODER_DEFAULT_BLOCK_SIZE> + tensor_sizes_strides_; + torch::profiler::impl::AppendOnlyList< + c10::IValue, torch::profiler::impl::IO_ENCODER_DEFAULT_BLOCK_SIZE> + ivalues_; }; class DIPUThreadLocalSubqueue { public: - DIPUThreadLocalSubqueue(const uint64_t tid, const torch::profiler::impl::ProfilerConfig& config); + DIPUThreadLocalSubqueue(const uint64_t tid, + const torch::profiler::impl::ProfilerConfig &config); - std::unique_ptr begin_op(const at::RecordFunction& fn); + std::unique_ptr begin_op( + const at::RecordFunction &fn); template - void emplace_backend_event(Args&&... args) { + void emplace_backend_event(Args &&...args) { backend_events_.emplace_back(std::forward(args)...); } template - void emplace_vulkan_event(Args&&... args) { + void emplace_vulkan_event(Args &&...args) { vulkan_events_.emplace_back(std::forward(args)...); } template - void emplace_allocation_event(Args&&... args) { + void emplace_allocation_event(Args &&...args) { allocations_.emplace_back(std::forward(args)...); } template - void emplace_ooms_event(Args&&... args) { + void emplace_ooms_event(Args &&...args) { ooms_.emplace_back(std::forward(args)...); } template - void emplace_py_call(Args&&... args) { + void emplace_py_call(Args &&...args) { py_calls_.emplace_back(std::forward(args)...); } - uint64_t tid() const { - return tid_; - } + uint64_t tid() const { return tid_; } - const torch::profiler::impl::kineto::DeviceAndResource& kineto_info() const { + const torch::profiler::impl::kineto::DeviceAndResource &kineto_info() const { return kineto_info_; } - inline void disable_perf_profiler(torch::profiler::perf_counters_t& counters) const { + inline void disable_perf_profiler( + torch::profiler::perf_counters_t &counters) const { perf_profiler_->Disable(counters); } @@ -114,79 +123,106 @@ class DIPUThreadLocalSubqueue { struct TorchOpStorage { // NB: This is a destructive operation. void materialize( - std::vector>& out, - const std::function time_converter, + std::vector> &out, + const std::function + time_converter, const uint64_t tid, - const torch::profiler::impl::kineto::DeviceAndResource& kineto_info); + const torch::profiler::impl::kineto::DeviceAndResource &kineto_info); template class EventBlock : public std::array { public: EventBlock(); - uint64_t correlation_id(const T* ptr) const; + uint64_t correlation_id(const T *ptr) const; private: uint64_t id_start_; }; using event_t = torch::profiler::impl::KinetoObserverContext::Event; - class OpList : public torch::profiler::impl::AppendOnlyList { + class OpList + : public torch::profiler::impl::AppendOnlyList { public: template - std::pair emplace_back(Args&&... 
args); - static uint64_t correlationID(const OpList::Iterator& e); + std::pair emplace_back(Args &&...args); + static uint64_t correlationID(const OpList::Iterator &e); } op_events_; // report_input_shapes DIPUInputOutputEncoder inputs_outputs_; // with_stack (JIT) - torch::profiler::impl::AppendOnlyList jit_stack_; + torch::profiler::impl::AppendOnlyList + jit_stack_; // with_modules - torch::profiler::impl::AppendOnlyList jit_modules_; + torch::profiler::impl::AppendOnlyList + jit_modules_; // with_flops - torch::profiler::impl::AppendOnlyList extra_args_; + torch::profiler::impl::AppendOnlyList + extra_args_; // ProfilerState::KINETO_GPU_FALLBACK - torch::profiler::impl::AppendOnlyList gpu_fallback_; + torch::profiler::impl::AppendOnlyList + gpu_fallback_; } torch_ops_; // reportBackendEventToActiveKinetoProfiler - torch::profiler::impl::AppendOnlyList, BlockSize> backend_events_; + torch::profiler::impl::AppendOnlyList< + torch::profiler::impl::ExtraFields< + torch::profiler::impl::EventType::Backend>, + BlockSize> + backend_events_; // _reportVulkanEventToProfiler - torch::profiler::impl::AppendOnlyList::raw_event_t, BlockSize> + torch::profiler::impl::AppendOnlyList< + torch::profiler::impl::ExtraFields< + torch::profiler::impl::EventType::Vulkan>::raw_event_t, + BlockSize> vulkan_events_; // reportMemoryUsage - torch::profiler::impl::AppendOnlyList allocations_; + torch::profiler::impl::AppendOnlyList + allocations_; // reportOOMs - torch::profiler::impl::AppendOnlyList, BlockSize> ooms_; + torch::profiler::impl::AppendOnlyList< + torch::profiler::impl::ExtraFields< + torch::profiler::impl::EventType::OutOfMemory>, + BlockSize> + ooms_; // with_stack (Python) - torch::profiler::impl::AppendOnlyList, BlockSize> + torch::profiler::impl::AppendOnlyList< + std::pair, + BlockSize> py_calls_; }; class DIPURecordQueue { public: - DIPURecordQueue(const torch::profiler::impl::ProfilerConfig& config, std::set activities); + DIPURecordQueue(const torch::profiler::impl::ProfilerConfig &config, + std::set activities); bool tracePython() const; - DIPUThreadLocalSubqueue* getSubqueue(); + DIPUThreadLocalSubqueue *getSubqueue(); void stop(); // NB: This is a destructive operation. 
std::pair< std::vector>, std::unique_ptr> - getRecords( - std::function time_converter, - uint64_t start_time_us, - uint64_t end_time_us); + getRecords(std::function + time_converter, + uint64_t start_time_us, uint64_t end_time_us); private: uint32_t id_; @@ -195,8 +231,9 @@ class DIPURecordQueue { ska::flat_hash_map> sub_queues_; std::mutex sub_queue_mutex_; - std::unique_ptr python_tracer_; + std::unique_ptr + python_tracer_; }; -} // namespace profile -} // namespace dipu +} // namespace profile +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/profiler/patch.cpp b/dipu/torch_dipu/csrc_dipu/profiler/patch.cpp index 2c0621009..315fa7e0b 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/patch.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/patch.cpp @@ -1,17 +1,16 @@ +#include +#include #include #include -#include -#include -#include -#include -#include #include #include #include - -#include -#include +#include +#include +#include +#include +#include namespace torch { namespace profiler { @@ -50,8 +49,8 @@ ApproximateClockToUnixTimeConverter::measurePairs() { return out; } -std::function ApproximateClockToUnixTimeConverter:: - makeConverter() { +std::function +ApproximateClockToUnixTimeConverter::makeConverter() { auto end_times = measurePairs(); // Compute the real time that passes for each tick of the approximate clock. @@ -99,42 +98,34 @@ namespace linux_perf { /* * Syscall wrapper for perf_event_open(2) */ -inline long perf_event_open( - struct perf_event_attr* hw_event, - pid_t pid, - int cpu, - int group_fd, - unsigned long flags) { +inline long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, + int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } // TODO sync with Kineto level abstract events in profiler/events.h static const std::unordered_map< - std::string, - std::pair> - EventTable{ - {"cycles", - std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)}, - {"instructions", - std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS)}, - - // Non Standard events for testing - {"pagefaults", - std::make_pair(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS)}, - {"backend-stall-cycles", - std::make_pair( - PERF_TYPE_HARDWARE, - PERF_COUNT_HW_STALLED_CYCLES_BACKEND)}, - {"frontend-stall-cycles", - std::make_pair( - PERF_TYPE_HARDWARE, - PERF_COUNT_HW_STALLED_CYCLES_FRONTEND)}}; + std::string, std::pair> + EventTable{{"cycles", + std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)}, + {"instructions", + std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS)}, + + // Non Standard events for testing + {"pagefaults", + std::make_pair(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS)}, + {"backend-stall-cycles", + std::make_pair(PERF_TYPE_HARDWARE, + PERF_COUNT_HW_STALLED_CYCLES_BACKEND)}, + {"frontend-stall-cycles", + std::make_pair(PERF_TYPE_HARDWARE, + PERF_COUNT_HW_STALLED_CYCLES_FRONTEND)}}; PerfEvent::~PerfEvent() { if (fd_ > -1) { close(fd_); } - fd_ = -1; // poison + fd_ = -1; // poison } void PerfEvent::Init() { @@ -153,7 +144,7 @@ void PerfEvent::Init() { attr.config = it->second.second; attr.disabled = 1; attr.inherit = 1; - attr.exclude_kernel = 1; // TBD + attr.exclude_kernel = 1; // TBD attr.exclude_hv = 1; /* * These can be used to calculate estimated totals if the PMU is overcommitted @@ -162,15 +153,15 @@ void PerfEvent::Init() { attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; - pid_t pid = getpid(); // this pid - int cpu = -1; 
// all cpus + pid_t pid = getpid(); // this pid + int cpu = -1; // all cpus int group_fd = -1; unsigned long flags = 0; fd_ = static_cast(perf_event_open(&attr, pid, cpu, group_fd, flags)); if (fd_ == -1) { - TORCH_CHECK( - false, "perf_event_open() failed, error: ", std::strerror(errno)); + TORCH_CHECK(false, + "perf_event_open() failed, error: ", std::strerror(errno)); } Reset(); } @@ -178,21 +169,14 @@ void PerfEvent::Init() { uint64_t PerfEvent::ReadCounter() const { PerfCounter counter{}; long n = read(fd_, &counter, sizeof(PerfCounter)); - TORCH_CHECK( - n == sizeof(counter), - "Read failed for Perf event fd, event : ", - name_, - ", error: ", - std::strerror(errno)); + TORCH_CHECK(n == sizeof(counter), + "Read failed for Perf event fd, event : ", name_, + ", error: ", std::strerror(errno)); TORCH_CHECK( counter.time_enabled == counter.time_running, "Hardware performance counter time multiplexing is not handled yet", - ", name: ", - name_, - ", enabled: ", - counter.time_enabled, - ", running: ", - counter.time_running); + ", name: ", name_, ", enabled: ", counter.time_enabled, + ", running: ", counter.time_running); return counter.value; } @@ -201,16 +185,13 @@ uint64_t PerfEvent::ReadCounter() const { * ------------ */ -void PerfProfiler::Configure(std::vector& event_names) { - TORCH_CHECK( - event_names.size() <= MAX_EVENTS, - "Too many events to configure, configured: ", - event_names.size(), - ", max allowed:", - MAX_EVENTS); +void PerfProfiler::Configure(std::vector &event_names) { + TORCH_CHECK(event_names.size() <= MAX_EVENTS, + "Too many events to configure, configured: ", event_names.size(), + ", max allowed:", MAX_EVENTS); std::unordered_set s(event_names.begin(), event_names.end()); - TORCH_CHECK( - s.size() == event_names.size(), "Duplicate event names are not allowed!") + TORCH_CHECK(s.size() == event_names.size(), + "Duplicate event names are not allowed!") for (auto name : event_names) { events_.emplace_back(name); events_.back().Init(); @@ -228,24 +209,23 @@ void PerfProfiler::Enable() { start_values_.emplace(events_.size(), 0); - auto& sv = start_values_.top(); + auto &sv = start_values_.top(); for (int i = 0; i < events_.size(); ++i) { sv[i] = events_[i].ReadCounter(); } StartCounting(); } -void PerfProfiler::Disable(perf_counters_t& vals) { +void PerfProfiler::Disable(perf_counters_t &vals) { StopCounting(); - TORCH_CHECK( - vals.size() == events_.size(), - "Can not fit all perf counters in the supplied container"); - TORCH_CHECK( - !start_values_.empty(), "PerfProfiler must be enabled before disabling"); + TORCH_CHECK(vals.size() == events_.size(), + "Can not fit all perf counters in the supplied container"); + TORCH_CHECK(!start_values_.empty(), + "PerfProfiler must be enabled before disabling"); /* Always connecting this disable event to the last enable event i.e. using * whatever is on the top of the start counter value stack. 
*/ - perf_counters_t& sv = start_values_.top(); + perf_counters_t &sv = start_values_.top(); for (int i = 0; i < events_.size(); ++i) { vals[i] = CalcDelta(sv[i], events_[i].ReadCounter()); } @@ -257,11 +237,11 @@ void PerfProfiler::Disable(perf_counters_t& vals) { } } -} // namespace linux_perf +} // namespace linux_perf namespace kineto { -TraceWrapper::TraceWrapper(const int64_t start_time, const std::string& name) +TraceWrapper::TraceWrapper(const int64_t start_time, const std::string &name) #ifdef USE_KINETO : cpu_trace_(std::make_unique()) { cpu_trace_->span.startTime = start_time; @@ -271,21 +251,18 @@ TraceWrapper::TraceWrapper(const int64_t start_time, const std::string& name) #else { } -#endif // USE_KINETO +#endif // USE_KINETO TraceWrapper::~TraceWrapper() = default; -activity_t* TraceWrapper::addCPUActivity( - const std::string& name, - const libkineto::ActivityType type, - const DeviceAndResource device_and_resource, - const uint64_t correlation_id, - const int64_t start_time, - const int64_t end_time) { +activity_t *TraceWrapper::addCPUActivity( + const std::string &name, const libkineto::ActivityType type, + const DeviceAndResource device_and_resource, const uint64_t correlation_id, + const int64_t start_time, const int64_t end_time) { #ifdef USE_KINETO TORCH_CHECK((bool)(*this), "Cannot add event to non-existent trace."); cpu_trace_->emplace_activity(cpu_trace_->span, type, name); - auto& act = libkineto::CpuTraceBuffer::toRef(cpu_trace_->activities.back()); + auto &act = libkineto::CpuTraceBuffer::toRef(cpu_trace_->activities.back()); act.device = device_and_resource.device; act.resource = device_and_resource.resource; act.id = correlation_id; @@ -296,14 +273,14 @@ activity_t* TraceWrapper::addCPUActivity( return cpu_trace_->activities.back().get(); #else return nullptr; -#endif // USE_KINETO +#endif // USE_KINETO } void TraceWrapper::transferCpuTrace(int64_t end_time) { #ifdef USE_KINETO cpu_trace_->span.endTime = end_time; libkineto::api().activityProfiler().transferCpuTrace(std::move(cpu_trace_)); -#endif // USE_KINETO +#endif // USE_KINETO } TraceWrapper::operator bool() const { @@ -311,11 +288,11 @@ TraceWrapper::operator bool() const { return cpu_trace_ != nullptr; #else return false; -#endif // USE_KINETO +#endif // USE_KINETO } ActivityTraceWrapper::ActivityTraceWrapper( - std::unique_ptr&& trace) + std::unique_ptr &&trace) : trace_(std::move(trace)) {} ActivityTraceWrapper::operator bool() const { @@ -323,46 +300,41 @@ ActivityTraceWrapper::operator bool() const { return trace_ != nullptr; #else return false; -#endif // USE_KINETO +#endif // USE_KINETO } -void ActivityTraceWrapper::save(const std::string& path) { +void ActivityTraceWrapper::save(const std::string &path) { #ifdef USE_KINETO TORCH_CHECK(!saved_, "Trace is already saved."); TORCH_CHECK(trace_ != nullptr, "Missing trace.") trace_->save(path); saved_ = true; #else - TORCH_CHECK( - false, - "Saving a trace requires using torch.profiler with Kineto support (USE_KINETO=1)"); -#endif // USE_KINETO + TORCH_CHECK(false, + "Saving a trace requires using torch.profiler with Kineto " + "support (USE_KINETO=1)"); +#endif // USE_KINETO } -void addMetadata( - const activity_t* activity, - const std::string& key, - const std::string& value) { - const_cast(activity)->addMetadata(key, value); +void addMetadata(const activity_t *activity, const std::string &key, + const std::string &value) { + const_cast(activity)->addMetadata(key, value); } const DeviceAndResource kineto_ids() { #ifdef USE_KINETO - return { - 
/*device=*/libkineto::processId(), - /*resource=*/libkineto::systemThreadId()}; + return {/*device=*/libkineto::processId(), + /*resource=*/libkineto::systemThreadId()}; #else return {}; -#endif // USE_KINETO +#endif // USE_KINETO } struct RegisterLibKinetoClient { - RegisterLibKinetoClient() { - libkineto::api(); - } + RegisterLibKinetoClient() { libkineto::api(); } } register_libkineto_client; -} // namespace kineto +} // namespace kineto namespace { static constexpr TensorImplAddress NoTensorImpl{nullptr}; @@ -379,43 +351,41 @@ struct RawTensorInfo { }; struct RawTensors { - std::vector& get() { - return tensors_; - } + std::vector &get() { return tensors_; } - void operator()(TensorMetadata& t) { - tensors_.emplace_back(RawTensorInfo{ - t.impl(), t.data_, t.device_, false, t.allocation_id_, t.id_}); + void operator()(TensorMetadata &t) { + tensors_.emplace_back(RawTensorInfo{t.impl(), t.data_, t.device_, false, + t.allocation_id_, t.id_}); } - void operator()(c10::optional& t) { + void operator()(c10::optional &t) { if (t.has_value()) { (*this)(*t); } } - void operator()(ExtraFields& a) { + void operator()(ExtraFields &a) { const StorageImplData ptr{a.ptr_}; const auto is_free = a.alloc_size_ < 0; - tensors_.emplace_back(RawTensorInfo{ - NoTensorImpl, ptr, a.device(), is_free, a.allocation_id_, a.id_}); + tensors_.emplace_back(RawTensorInfo{NoTensorImpl, ptr, a.device(), is_free, + a.allocation_id_, a.id_}); } - void operator()(std::vector& t) { - for (auto& ti : t) { + void operator()(std::vector &t) { + for (auto &ti : t) { (*this)(ti); } } template - void operator()(T&) {} + void operator()(T &) {} std::vector tensors_; }; -} // namespace +} // namespace void calculateUniqueTensorIDs( - std::vector>& sorted_results) { + std::vector> &sorted_results) { // This task is equivilent to https://leetcode.com/problems/number-of-islands/ // We first cluster events with a greedy index assignment, and then merge // groups that overlap. @@ -429,18 +399,18 @@ void calculateUniqueTensorIDs( // The python tracer caches values, so it's only safe to use the first case. 
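// [editor's note] The traversal below dispatches on each Result's variant
// payload through c10::overloaded. As a self-contained sketch of that
// visitation pattern (std::visit plus the classic C++17 "overloaded" helper;
// the names and payload types here are illustrative, not from this codebase):
#include <iostream>
#include <string>
#include <variant>
#include <vector>

template <class... Ts>
struct overloaded : Ts... {
  using Ts::operator()...;
};
template <class... Ts>
overloaded(Ts...) -> overloaded<Ts...>;  // deduction guide (C++17)

int main() {
  std::vector<std::variant<int, std::string>> events{42,
                                                     std::string{"py_call"}};
  for (auto &e : events) {
    std::visit(overloaded{
                   [](int op) { std::cout << "torch op #" << op << '\n'; },
                   [](const std::string &py) {
                     std::cout << "python event " << py << '\n';
                   }},
               e);
  }
}
// The profiler's version does the same thing with lambdas keyed on
// ExtraFields<EventType::TorchOp>, ExtraFields<EventType::PyCall>, and a
// generic fallback.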
ska::flat_hash_set seen_modules; ska::flat_hash_set seen_optimizers; - for (auto& result : sorted_results) { + for (auto &result : sorted_results) { result->visit(c10::overloaded( - [&](ExtraFields& torch_op) { - for (auto& i : torch_op.inputs_) { + [&](ExtraFields &torch_op) { + for (auto &i : torch_op.inputs_) { c10::visit(raw_tensors, i); } }, - [&](ExtraFields& py_call) { + [&](ExtraFields &py_call) { // torch.nn.Module if (py_call.module_.has_value() && seen_modules.insert(py_call.module_->self_).second) { - for (auto& p : py_call.module_->parameters_) { + for (auto &p : py_call.module_->parameters_) { raw_tensors(p.metadata_); raw_tensors(p.grad_metadata_); } @@ -449,16 +419,16 @@ void calculateUniqueTensorIDs( // torch.optim.Optimizer if (py_call.optimizer_.has_value() && seen_optimizers.insert(py_call.optimizer_->self_).second) { - for (auto& p : py_call.optimizer_->parameters_) { + for (auto &p : py_call.optimizer_->parameters_) { raw_tensors(p.metadata_); raw_tensors(p.grad_metadata_); - for (auto& state_i : p.state_) { + for (auto &state_i : p.state_) { raw_tensors(state_i.second); } } } }, - [&](auto& i) { raw_tensors(i); })); + [&](auto &i) { raw_tensors(i); })); } tensors = std::move(raw_tensors.tensors_); } @@ -469,7 +439,7 @@ void calculateUniqueTensorIDs( size_t counter{1}; using key_t = std::pair; ska::flat_hash_map versions; - for (auto& t : tensors) { + for (auto &t : tensors) { auto inserted = versions.insert({{t.storage_, t.device_}, counter}); counter += inserted.second; t.allocation_id_ref_.get().emplace(AllocationID(inserted.first->second)); @@ -483,19 +453,17 @@ void calculateUniqueTensorIDs( // -------------------------------------------------------------------------- { ska::flat_hash_set tensor_set; - for (const auto& t : tensors) { + for (const auto &t : tensors) { if (t.impl_ != NoTensorImpl) { tensor_set.insert(*t.allocation_id_ref_.get()); } } tensors.erase( - std::remove_if( - tensors.begin(), - tensors.end(), - [&tensor_set](const auto& i) { - auto it = tensor_set.find(*i.allocation_id_ref_.get()); - return it == tensor_set.end(); - }), + std::remove_if(tensors.begin(), tensors.end(), + [&tensor_set](const auto &i) { + auto it = tensor_set.find(*i.allocation_id_ref_.get()); + return it == tensor_set.end(); + }), tensors.end()); } @@ -505,7 +473,7 @@ void calculateUniqueTensorIDs( ska::flat_hash_set same_group_set; { ska::flat_hash_map impl_map; - for (const auto& t : tensors) { + for (const auto &t : tensors) { // Storage allocations / frees don't have an associated TensorImpl, so // we don't want all storages to merge through nullptr. if (!t.impl_) { @@ -527,13 +495,13 @@ void calculateUniqueTensorIDs( ska::flat_hash_map id_map; { std::vector unique_pairs; - for (const auto& i : same_group_set) { + for (const auto &i : same_group_set) { unique_pairs.push_back(i); } std::sort(unique_pairs.begin(), unique_pairs.end()); size_t current_id{0}; - for (const auto& i : unique_pairs) { + for (const auto &i : unique_pairs) { auto inserted = id_map.insert({i.first, current_id}); current_id += inserted.second; id_map.insert({i.second, inserted.first->second}); @@ -542,12 +510,12 @@ void calculateUniqueTensorIDs( // Write back to Tensor IDs. 
// -------------------------------------------------------------------------- - for (const auto& t : tensors) { + for (const auto &t : tensors) { const auto id = id_map.at(*t.allocation_id_ref_.get()); t.id_ref_.get().emplace(TensorID(id)); } } -} // namespace impl -} // namespace profiler -} // namespace torch \ No newline at end of file +} // namespace impl +} // namespace profiler +} // namespace torch \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp b/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp index e1d7c3bb8..8c2111e55 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp @@ -1,9 +1,9 @@ #include "profiler.h" +#include #include #include -#include #include #include @@ -14,384 +14,362 @@ namespace profile { static const int32_t DEFAULT_FLUSH_READY_INTERVAL = 1000; class DeviceEvent final { -private: - deviceEvent_t evt_; + private: + deviceEvent_t evt_; -public: - DeviceEvent() { - dipu::devproxy::createEvent(&evt_); - } + public: + DeviceEvent() { dipu::devproxy::createEvent(&evt_); } - ~DeviceEvent() { - dipu::devproxy::destroyEvent(evt_); - } + ~DeviceEvent() { dipu::devproxy::destroyEvent(evt_); } - deviceEvent_t get() const { - return evt_; - } + deviceEvent_t get() const { return evt_; } - DeviceEvent(const DeviceEvent&) = delete; - DeviceEvent& operator=(const DeviceEvent&) = delete; - DeviceEvent(DeviceEvent&&) = default; - DeviceEvent& operator=(DeviceEvent&&) = default; + DeviceEvent(const DeviceEvent &) = delete; + DeviceEvent &operator=(const DeviceEvent &) = delete; + DeviceEvent(DeviceEvent &&) = default; + DeviceEvent &operator=(DeviceEvent &&) = default; }; class StreamTimeOffsetTracker final { - DeviceEvent begin_; - deviceStream_t stream_; - size_t beginOffset_; - float ratio_ = 0.f; - -public: - explicit StreamTimeOffsetTracker(deviceStream_t stream) { - stream_ = stream; - devproxy::recordEvent(begin_.get(), stream_); - devproxy::waitEvent(begin_.get()); - beginOffset_ = torch::profiler::impl::getTime(); - } - - ~StreamTimeOffsetTracker() = default; - - void sync() { - DeviceEvent end; - float time; - dipu::devproxy::recordEvent(end.get(), stream_); - dipu::devproxy::waitEvent(end.get()); - dipu::devproxy::eventElapsedTime(&time, begin_.get(), end.get()); - size_t endOffset = torch::profiler::impl::getTime(); - ratio_ = 1.0f * (endOffset - beginOffset_) / time; - } - - const DeviceEvent& begin() const { - return begin_; - } - - size_t offset() const { - return beginOffset_; - } - - float ratio() const { - return ratio_; - } + DeviceEvent begin_; + deviceStream_t stream_; + size_t beginOffset_; + float ratio_ = 0.f; + + public: + explicit StreamTimeOffsetTracker(deviceStream_t stream) { + stream_ = stream; + devproxy::recordEvent(begin_.get(), stream_); + devproxy::waitEvent(begin_.get()); + beginOffset_ = torch::profiler::impl::getTime(); + } + + ~StreamTimeOffsetTracker() = default; + + void sync() { + DeviceEvent end; + float time; + dipu::devproxy::recordEvent(end.get(), stream_); + dipu::devproxy::waitEvent(end.get()); + dipu::devproxy::eventElapsedTime(&time, begin_.get(), end.get()); + size_t endOffset = torch::profiler::impl::getTime(); + ratio_ = 1.0f * (endOffset - beginOffset_) / time; + } + + const DeviceEvent &begin() const { return begin_; } + + size_t offset() const { return beginOffset_; } + + float ratio() const { return ratio_; } }; -RecordsImpl& RecordsImpl::get() { - static RecordsImpl instance; - return instance; +RecordsImpl 
&RecordsImpl::get() { + static RecordsImpl instance; + return instance; } void RecordsImpl::abandon() { - std::lock_guard lck(mtx_); - for (auto &kv : allRecordLists_) { - kv.second->clear(); - } - resourceInfo_.clear(); + std::lock_guard lck(mtx_); + for (auto &kv : allRecordLists_) { + kv.second->clear(); + } + resourceInfo_.clear(); } -void RecordsImpl::addRecord(const Record& record) { - if (pRecords == nullptr) { - std::lock_guard lk(mtx_); - int32_t tid = libkineto::systemThreadId(); - allRecordLists_[tid] = std::make_unique(); - pRecords = allRecordLists_[tid].get(); - } - pRecords->emplace_back(record); +void RecordsImpl::addRecord(const Record &record) { + if (pRecords == nullptr) { + std::lock_guard lk(mtx_); + int32_t tid = libkineto::systemThreadId(); + allRecordLists_[tid] = std::make_unique(); + pRecords = allRecordLists_[tid].get(); + } + pRecords->emplace_back(record); } -void RecordsImpl::recordStream(int device, int streamId, const std::string& postfix) { - std::lock_guard lck(mtx_); - if (resourceInfo_.find({device, streamId}) == resourceInfo_.end()) { - resourceInfo_.emplace( - std::make_pair(device, streamId), - libkineto::ResourceInfo(device, streamId, streamId, fmt::format( - "stream {} {}", streamId, postfix))); - } +void RecordsImpl::recordStream(int device, int streamId, + const std::string &postfix) { + std::lock_guard lck(mtx_); + if (resourceInfo_.find({device, streamId}) == resourceInfo_.end()) { + resourceInfo_.emplace(std::make_pair(device, streamId), + libkineto::ResourceInfo( + device, streamId, streamId, + fmt::format("stream {} {}", streamId, postfix))); + } } RecordsImpl::records_t RecordsImpl::getAllRecordList() const { - std::lock_guard lck(mtx_); - records_t allrecords; - for (const auto &kv : allRecordLists_) { - if (!kv.second || kv.second->empty()) { - continue; - } - - for (const auto & r : *(kv.second)) { - allrecords.push_back(r); - } + std::lock_guard lck(mtx_); + records_t allrecords; + for (const auto &kv : allRecordLists_) { + if (!kv.second || kv.second->empty()) { + continue; + } + + for (const auto &r : *(kv.second)) { + allrecords.push_back(r); } - return allrecords; + } + return allrecords; } -std::map, libkineto::ResourceInfo> RecordsImpl::getResourceInfo() const { - std::lock_guard lck(mtx_); - return resourceInfo_; +std::map, libkineto::ResourceInfo> +RecordsImpl::getResourceInfo() const { + std::lock_guard lck(mtx_); + return resourceInfo_; } -thread_local RecordsImpl::records_t* RecordsImpl::pRecords = nullptr; +thread_local RecordsImpl::records_t *RecordsImpl::pRecords = nullptr; class DeviceRecordsImpl final { -private: - // mutex for records and tracker - std::mutex mtx_; - std::list records_; - std::vector ready_records_; - std::unique_ptr pTracker_; - -private: - DeviceRecordsImpl() {} - - static bool enableFlushReadyEvent() { - static bool enable_flush_ready = (std::getenv("DIPU_DISABLE_FLUSH_READY_EVENT") == nullptr); - return enable_flush_ready; - } - - static int32_t flushReadyEventInterval() { - static int32_t flush_ready_event_interval = []() -> int32_t { - const char* str = std::getenv("DIPU_FLUSH_READY_EVENT_INTERVAL"); - return str == nullptr ? 
DEFAULT_FLUSH_READY_INTERVAL : std::stoi(str); - }(); - return flush_ready_event_interval; - } - - deviceEvent_t beginEvent() const { - TORCH_CHECK(pTracker_, "dipu profiler error with pTracker is not inited"); - return pTracker_->begin().get(); - } - - size_t getTime(const DeviceEvent& evt, - float scale = 1., size_t shift = 0) { - float time; - dipu::devproxy::waitEvent(evt.get()); - dipu::devproxy::eventElapsedTime(&time, beginEvent(), evt.get()); - return static_cast(time * scale) + shift; - } - -public: - ~DeviceRecordsImpl() { - reset(); - } - -public: - void ensureSetup(deviceStream_t stream) { - if (!pTracker_) { - std::lock_guard lk(mtx_); - if (!pTracker_) { - pTracker_.reset(new StreamTimeOffsetTracker(stream)); - } - } - } - - void addDeviceRecord(DeviceRecord record) { - std::lock_guard lk(mtx_); - TORCH_CHECK(pTracker_, "dipu profiler error with pTracker is not inited"); - records_.push_back(record); - if (enableFlushReadyEvent() && (records_.size() % flushReadyEventInterval() == 0)) { - flushReady(); - } - } - - void flushReady() { - while (records_.size() > 0) { - auto& r = records_.front(); - auto start_status = dipu::devproxy::getEventStatus(r.start->get()); - auto end_status = dipu::devproxy::getEventStatus(r.stop->get()); - auto origin_status = dipu::devproxy::getEventStatus(beginEvent()); - if (start_status != devapis::EventStatus::READY || - end_status != devapis::EventStatus::READY || - origin_status != devapis::EventStatus::READY) { - break; - } - float t1 = 0.0f; - float t2 = 0.0f; - dipu::devproxy::eventElapsedTime(&t1, beginEvent(), r.start->get()); - dipu::devproxy::eventElapsedTime(&t2, r.start->get(), r.stop->get()); - ready_records_.push_back(Record({r.name, r.opId, - static_cast(t1 * 1e3), - static_cast((t1 + t2) * 1e3), - r.deviceId, r.streamId, true, - r.linkCorrelationId, r.extraInfo})); - records_.pop_front(); - } - } - - void flush() { - std::lock_guard lk(mtx_); - if (records_.size() > 0) { - TORCH_CHECK(pTracker_, "dipu profiler error with pTracker is not inited"); - auto& trakcer = *pTracker_; - trakcer.sync(); - float ratio = trakcer.ratio(); - size_t offset = trakcer.offset(); - - for (auto& r : ready_records_) { - r.begin = static_cast(r.begin * 1e-3 * ratio) + offset; - r.end = static_cast(r.end * 1e-3 * ratio) + offset; - RecordsImpl::get().addRecord(r); - } - ready_records_.clear(); - - for (auto& r : records_) { - RecordsImpl::get().addRecord( - Record({r.name, r.opId, getTime(*r.start, ratio, offset), - getTime(*r.stop, ratio, offset), - r.deviceId, r.streamId, true, - r.linkCorrelationId, r.extraInfo})); - } - records_.clear(); - } + private: + // mutex for records and tracker + std::mutex mtx_; + std::list records_; + std::vector ready_records_; + std::unique_ptr pTracker_; + + private: + DeviceRecordsImpl() {} + + static bool enableFlushReadyEvent() { + static bool enable_flush_ready = + (std::getenv("DIPU_DISABLE_FLUSH_READY_EVENT") == nullptr); + return enable_flush_ready; + } + + static int32_t flushReadyEventInterval() { + static int32_t flush_ready_event_interval = []() -> int32_t { + const char *str = std::getenv("DIPU_FLUSH_READY_EVENT_INTERVAL"); + return str == nullptr ? 
DEFAULT_FLUSH_READY_INTERVAL : std::stoi(str); + }(); + return flush_ready_event_interval; + } + + deviceEvent_t beginEvent() const { + TORCH_CHECK(pTracker_, "dipu profiler error with pTracker is not inited"); + return pTracker_->begin().get(); + } + + size_t getTime(const DeviceEvent &evt, float scale = 1., size_t shift = 0) { + float time; + dipu::devproxy::waitEvent(evt.get()); + dipu::devproxy::eventElapsedTime(&time, beginEvent(), evt.get()); + return static_cast(time * scale) + shift; + } + + public: + ~DeviceRecordsImpl() { reset(); } + + public: + void ensureSetup(deviceStream_t stream) { + if (!pTracker_) { + std::lock_guard lk(mtx_); + if (!pTracker_) { + pTracker_.reset(new StreamTimeOffsetTracker(stream)); + } + } + } + + void addDeviceRecord(DeviceRecord record) { + std::lock_guard lk(mtx_); + TORCH_CHECK(pTracker_, "dipu profiler error with pTracker is not inited"); + records_.push_back(record); + if (enableFlushReadyEvent() && + (records_.size() % flushReadyEventInterval() == 0)) { + flushReady(); + } + } + + void flushReady() { + while (records_.size() > 0) { + auto &r = records_.front(); + auto start_status = dipu::devproxy::getEventStatus(r.start->get()); + auto end_status = dipu::devproxy::getEventStatus(r.stop->get()); + auto origin_status = dipu::devproxy::getEventStatus(beginEvent()); + if (start_status != devapis::EventStatus::READY || + end_status != devapis::EventStatus::READY || + origin_status != devapis::EventStatus::READY) { + break; + } + float t1 = 0.0f; + float t2 = 0.0f; + dipu::devproxy::eventElapsedTime(&t1, beginEvent(), r.start->get()); + dipu::devproxy::eventElapsedTime(&t2, r.start->get(), r.stop->get()); + ready_records_.push_back( + Record({r.name, r.opId, static_cast(t1 * 1e3), + static_cast((t1 + t2) * 1e3), r.deviceId, r.streamId, + true, r.linkCorrelationId, r.extraInfo})); + records_.pop_front(); + } + } + + void flush() { + std::lock_guard lk(mtx_); + if (records_.size() > 0) { + TORCH_CHECK(pTracker_, "dipu profiler error with pTracker is not inited"); + auto &trakcer = *pTracker_; + trakcer.sync(); + float ratio = trakcer.ratio(); + size_t offset = trakcer.offset(); + + for (auto &r : ready_records_) { + r.begin = static_cast(r.begin * 1e-3 * ratio) + offset; + r.end = static_cast(r.end * 1e-3 * ratio) + offset; + RecordsImpl::get().addRecord(r); + } + ready_records_.clear(); + + for (auto &r : records_) { + RecordsImpl::get().addRecord( + Record({r.name, r.opId, getTime(*r.start, ratio, offset), + getTime(*r.stop, ratio, offset), r.deviceId, r.streamId, + true, r.linkCorrelationId, r.extraInfo})); + } + records_.clear(); } + } - void reset() { - std::lock_guard lck(mtx_); - records_.clear(); - ready_records_.clear(); - pTracker_.reset(); - } + void reset() { + std::lock_guard lck(mtx_); + records_.clear(); + ready_records_.clear(); + pTracker_.reset(); + } - void abandon() { - reset(); - } + void abandon() { reset(); } - static DeviceRecordsImpl& get() { - static DeviceRecordsImpl instance; - return instance; - } + static DeviceRecordsImpl &get() { + static DeviceRecordsImpl instance; + return instance; + } }; bool gEnableFlag = false; -bool isEnable() { - return gEnableFlag; -} +bool isEnable() { return gEnableFlag; } -void setProfileOpen(bool profileFlag) { - gEnableFlag = profileFlag; -} +void setProfileOpen(bool profileFlag) { gEnableFlag = profileFlag; } -void FlushAllRecords() { - DeviceRecordsImpl::get().flush(); -} +void FlushAllRecords() { DeviceRecordsImpl::get().flush(); } static size_t kInitModuleId = 10000; std::atomic 
moduleId(kInitModuleId); -size_t generateId() { - return ++moduleId; -} +size_t generateId() { return ++moduleId; } -void resetId() { - moduleId = kInitModuleId; -} +void resetId() { moduleId = kInitModuleId; } void abandonAllRecords() { - RecordsImpl::get().abandon(); - DeviceRecordsImpl::get().abandon(); - resetId(); + RecordsImpl::get().abandon(); + DeviceRecordsImpl::get().abandon(); + resetId(); } -RecordCreator::RecordCreator(const string_t& name, size_t opId, uint64_t linkCorrelationId, - const ExtraRecordInfo& extraInfo) { - if (isEnable()) { - name_ = name; - opId_ = opId; - begin_ = torch::profiler::impl::getTime(); - end_ = false; - linkCorrelationId_ = linkCorrelationId; - extraInfo_ = extraInfo; - } +RecordCreator::RecordCreator(const string_t &name, size_t opId, + uint64_t linkCorrelationId, + const ExtraRecordInfo &extraInfo) { + if (isEnable()) { + name_ = name; + opId_ = opId; + begin_ = torch::profiler::impl::getTime(); + end_ = false; + linkCorrelationId_ = linkCorrelationId; + extraInfo_ = extraInfo; + } } -RecordCreator::~RecordCreator() { - end(); -} +RecordCreator::~RecordCreator() { end(); } void RecordCreator::end() { - if (!end_) { - RecordsImpl::get().addRecord( - Record{name_, opId_, begin_, static_cast(torch::profiler::impl::getTime()), - static_cast(libkineto::processId()), static_cast(libkineto::systemThreadId()), - false, linkCorrelationId_, extraInfo_}); - } - end_ = true; + if (!end_) { + RecordsImpl::get().addRecord( + Record{name_, opId_, begin_, + static_cast(torch::profiler::impl::getTime()), + static_cast(libkineto::processId()), + static_cast(libkineto::systemThreadId()), false, + linkCorrelationId_, extraInfo_}); + } + end_ = true; } - -DeviceRecordCreator::DeviceRecordCreator(string_t name, deviceStream_t stream, int streamId, size_t opId, - uint64_t linkCorrelationId, const ExtraRecordInfo& extraInfo) { - if (isEnable()) { - DeviceRecordsImpl::get().ensureSetup(stream); - name_ = name; - opId_ = opId; - extraInfo_ = extraInfo; - stream_ = stream; - streamId_ = streamId; - pStart_.reset(new DeviceEvent()); - pStop_.reset(new DeviceEvent()); - dipu::devproxy::recordEvent(pStart_->get(), stream_); - linkCorrelationId_ = linkCorrelationId; - end_ = false; - } +DeviceRecordCreator::DeviceRecordCreator(string_t name, deviceStream_t stream, + int streamId, size_t opId, + uint64_t linkCorrelationId, + const ExtraRecordInfo &extraInfo) { + if (isEnable()) { + DeviceRecordsImpl::get().ensureSetup(stream); + name_ = name; + opId_ = opId; + extraInfo_ = extraInfo; + stream_ = stream; + streamId_ = streamId; + pStart_.reset(new DeviceEvent()); + pStop_.reset(new DeviceEvent()); + dipu::devproxy::recordEvent(pStart_->get(), stream_); + linkCorrelationId_ = linkCorrelationId; + end_ = false; + } } -DeviceRecordCreator::~DeviceRecordCreator() { - end(); -} +DeviceRecordCreator::~DeviceRecordCreator() { end(); } void DeviceRecordCreator::end() { - if (!end_) { - TORCH_CHECK(pStart_, "dipu profiler error with pStart_ is not inited"); - TORCH_CHECK(pStop_, "dipu profiler error with pStop_ is not inited"); - dipu::devproxy::recordEvent(pStop_->get(), stream_); - auto deviceId = dipu::devproxy::current_device(); - DeviceRecordsImpl::get().addDeviceRecord(DeviceRecord{ - pStart_, pStop_, static_cast(deviceId), - static_cast(streamId_), name_, opId_, - linkCorrelationId_, extraInfo_}); - RecordsImpl::get().recordStream(deviceId, streamId_); - } - end_ = true; + if (!end_) { + TORCH_CHECK(pStart_, "dipu profiler error with pStart_ is not inited"); + TORCH_CHECK(pStop_, 
"dipu profiler error with pStop_ is not inited"); + dipu::devproxy::recordEvent(pStop_->get(), stream_); + auto deviceId = dipu::devproxy::current_device(); + DeviceRecordsImpl::get().addDeviceRecord( + DeviceRecord{pStart_, pStop_, static_cast(deviceId), + static_cast(streamId_), name_, opId_, + linkCorrelationId_, extraInfo_}); + RecordsImpl::get().recordStream(deviceId, streamId_); + } + end_ = true; } -static std::string extraceFunction(const std::string& functionName) { - auto start = functionName.find_first_not_of(":"); - if (start == std::string::npos) { - return ""; - } - - auto end = functionName.find_first_of("("); - if (end == std::string::npos) { - end = functionName.size(); - } - - if (end <= start) { - return ""; - } - return functionName.substr(start, end - start); +static std::string extraceFunction(const std::string &functionName) { + auto start = functionName.find_first_not_of(":"); + if (start == std::string::npos) { + return ""; + } + + auto end = functionName.find_first_of("("); + if (end == std::string::npos) { + end = functionName.size(); + } + + if (end <= start) { + return ""; + } + return functionName.substr(start, end - start); } -RecordBlockCreator::RecordBlockCreator(string_t name, const ExtraRecordInfo& extraInfo, - deviceStream_t stream, int streamId, bool enProfile) { - if (enProfile && isEnable()) { - size_t opId = generateId(); - uint64_t correlationId = CorrelationIDManager::instance().getCorrelationID(); - name = extraceFunction(name); - pHostRecord_.reset(new RecordCreator("LaunchKernel_" + name, opId, correlationId, extraInfo)); - pDeviceRecord_.reset(new DeviceRecordCreator(name, stream, streamId, opId, correlationId, extraInfo)); - } +RecordBlockCreator::RecordBlockCreator(string_t name, + const ExtraRecordInfo &extraInfo, + deviceStream_t stream, int streamId, + bool enProfile) { + if (enProfile && isEnable()) { + size_t opId = generateId(); + uint64_t correlationId = + CorrelationIDManager::instance().getCorrelationID(); + name = extraceFunction(name); + pHostRecord_.reset(new RecordCreator("LaunchKernel_" + name, opId, + correlationId, extraInfo)); + pDeviceRecord_.reset(new DeviceRecordCreator(name, stream, streamId, opId, + correlationId, extraInfo)); + } } void RecordBlockCreator::end() { - if (!finish_) { - pHostRecord_.reset(); - pDeviceRecord_.reset(); - } - finish_ = true; + if (!finish_) { + pHostRecord_.reset(); + pDeviceRecord_.reset(); + } + finish_ = true; } -RecordBlockCreator::~RecordBlockCreator() { - end(); -} +RecordBlockCreator::~RecordBlockCreator() { end(); } } // namespace profile diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler.h b/dipu/torch_dipu/csrc_dipu/profiler/profiler.h index 25dd68b51..6766cd7b8 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/profiler.h +++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler.h @@ -1,22 +1,21 @@ #pragma once -#include -#include - -#include -#include -#include +#include #include +#include #include +#include +#include #include +#include +#include #include -#include #include -#include #include #include -#include +#include +#include #include "CorrelationIDManager.h" @@ -40,134 +39,139 @@ void FlushAllRecords(); void abandonAllRecords(); struct ExtraRecordInfo { - string_t scope; - size_t opSeqId; - string_t attrs; - - ExtraRecordInfo() : scope(""), opSeqId(0), attrs("") {} - - ExtraRecordInfo& setScope(const string_t& scopeName) { - scope = scopeName; - return *this; - } - - ExtraRecordInfo& setSeqId(size_t seqId) { - opSeqId = seqId; - return *this; - } - - ExtraRecordInfo& 
setAttrs(const string_t& sAttrs) { - attrs = sAttrs; - return *this; - } + string_t scope; + size_t opSeqId; + string_t attrs; + + ExtraRecordInfo() : scope(""), opSeqId(0), attrs("") {} + + ExtraRecordInfo &setScope(const string_t &scopeName) { + scope = scopeName; + return *this; + } + + ExtraRecordInfo &setSeqId(size_t seqId) { + opSeqId = seqId; + return *this; + } + + ExtraRecordInfo &setAttrs(const string_t &sAttrs) { + attrs = sAttrs; + return *this; + } }; struct Record { - string_t name; - size_t opId; - // clock real time in nanosecond - size_t begin; - size_t end; - size_t pid; - size_t threadIdx; - bool isKernel = false; - uint64_t linkCorrelationId = 0; - ExtraRecordInfo extraInfo; + string_t name; + size_t opId; + // clock real time in nanosecond + size_t begin; + size_t end; + size_t pid; + size_t threadIdx; + bool isKernel = false; + uint64_t linkCorrelationId = 0; + ExtraRecordInfo extraInfo; }; class RecordsImpl final { -private: - using records_t = std::list; - using mutex_t = std::mutex; + private: + using records_t = std::list; + using mutex_t = std::mutex; - mutable mutex_t mtx_; - // tid -> record list - std::unordered_map> allRecordLists_; - thread_local static records_t* pRecords; + mutable mutex_t mtx_; + // tid -> record list + std::unordered_map> allRecordLists_; + thread_local static records_t *pRecords; - std::map, libkineto::ResourceInfo> resourceInfo_; + std::map, libkineto::ResourceInfo> resourceInfo_; -private: - RecordsImpl() = default; + private: + RecordsImpl() = default; -public: - ~RecordsImpl() = default; + public: + ~RecordsImpl() = default; - static RecordsImpl& get(); - void addRecord(const Record& record); - void recordStream(int device, int streamId, const std::string& postfix = ""); - void abandon(); + static RecordsImpl &get(); + void addRecord(const Record &record); + void recordStream(int device, int streamId, const std::string &postfix = ""); + void abandon(); - records_t getAllRecordList() const; - std::map, libkineto::ResourceInfo> getResourceInfo() const; + records_t getAllRecordList() const; + std::map, libkineto::ResourceInfo> + getResourceInfo() const; }; class RecordCreator final { -private: - string_t name_; - size_t opId_; - size_t begin_; - bool end_ = true; - uint64_t linkCorrelationId_ = 0; - ExtraRecordInfo extraInfo_; - -public: - explicit RecordCreator(const string_t& name, size_t opId, uint64_t linkCorrelationId, - const ExtraRecordInfo& extraInfo = ExtraRecordInfo()); - - ~RecordCreator(); - -private: - void end(); + private: + string_t name_; + size_t opId_; + size_t begin_; + bool end_ = true; + uint64_t linkCorrelationId_ = 0; + ExtraRecordInfo extraInfo_; + + public: + explicit RecordCreator(const string_t &name, size_t opId, + uint64_t linkCorrelationId, + const ExtraRecordInfo &extraInfo = ExtraRecordInfo()); + + ~RecordCreator(); + + private: + void end(); }; class DeviceEvent; struct DeviceRecord { - std::shared_ptr start, stop; - size_t deviceId; - size_t streamId; - string_t name; - size_t opId; - uint64_t linkCorrelationId = 0; - ExtraRecordInfo extraInfo; + std::shared_ptr start, stop; + size_t deviceId; + size_t streamId; + string_t name; + size_t opId; + uint64_t linkCorrelationId = 0; + ExtraRecordInfo extraInfo; }; class DeviceRecordCreator final { -private: - string_t name_; - size_t opId_; - deviceStream_t stream_; - int streamId_; - std::shared_ptr pStart_, pStop_; - bool end_ = true; - uint64_t linkCorrelationId_ = 0; - ExtraRecordInfo extraInfo_; - -public: - DeviceRecordCreator(string_t name, 
deviceStream_t stream, int streamId, size_t opId, uint64_t linkCorrelationId, - const ExtraRecordInfo& extraInfo = ExtraRecordInfo()); - - ~DeviceRecordCreator(); - -private: - void end(); + private: + string_t name_; + size_t opId_; + deviceStream_t stream_; + int streamId_; + std::shared_ptr pStart_, pStop_; + bool end_ = true; + uint64_t linkCorrelationId_ = 0; + ExtraRecordInfo extraInfo_; + + public: + DeviceRecordCreator(string_t name, deviceStream_t stream, int streamId, + size_t opId, uint64_t linkCorrelationId, + const ExtraRecordInfo &extraInfo = ExtraRecordInfo()); + + ~DeviceRecordCreator(); + + private: + void end(); }; class RecordBlockCreator { -public: - explicit RecordBlockCreator(string_t name, const ExtraRecordInfo& extraInfo = ExtraRecordInfo(), - deviceStream_t stream = dipu::getCurrentDIPUStream(), - int streamId = dipu::getCurrentDIPUStream().id(), bool enProfile = isEnable()); - - void end(); - - ~RecordBlockCreator(); - -private: - std::unique_ptr pHostRecord_ = nullptr; - std::unique_ptr pDeviceRecord_ = nullptr; - bool finish_ = false; + public: + explicit RecordBlockCreator( + string_t name, const ExtraRecordInfo &extraInfo = ExtraRecordInfo(), + deviceStream_t stream = dipu::getCurrentDIPUStream(), + int streamId = dipu::getCurrentDIPUStream().id(), + bool enProfile = isEnable()); + + void end(); + + ~RecordBlockCreator(); + + private: + std::unique_ptr pHostRecord_ = nullptr; + std::unique_ptr pDeviceRecord_ = nullptr; + bool finish_ = false; }; } // namespace profile diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp b/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp index 9b25a39b6..6de8f16e3 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp @@ -1,5 +1,7 @@ #include "profiler_kineto.h" +#include + #include #include #include @@ -7,16 +9,16 @@ #include #include #include -#include -#include +#include #include #include #include +#include #include -#include #include "csrc_dipu/runtime/devproxy/deviceproxy.h" #include "csrc_dipu/utils/Log.h" + #include "collection.h" #include "profiler.h" @@ -24,9 +26,7 @@ namespace dipu { namespace profile { namespace { -inline int64_t getTimeUs() { - return torch::profiler::impl::getTime() / 1000; -} +inline int64_t getTimeUs() { return torch::profiler::impl::getTime() / 1000; } const std::set kCpuTypes{ libkineto::ActivityType::CPU_OP, @@ -37,53 +37,51 @@ const std::set kCpuTypes{ libkineto::ActivityType::PYTHON_FUNCTION, }; +using torch::autograd::profiler::experimental_event_t; +using torch::autograd::profiler::KinetoEvent; +using torch::autograd::profiler::post_process_t; +using torch::autograd::profiler::ProfilerResult; using torch::profiler::impl::ActiveProfilerType; using torch::profiler::impl::dtypesToStr; using torch::profiler::impl::EventType; using torch::profiler::impl::ExtraFields; using torch::profiler::impl::op_input_t; -using torch::profiler::impl::ProfilerStateBase; using torch::profiler::impl::ProfilerState; +using torch::profiler::impl::ProfilerStateBase; using torch::profiler::impl::PyExtraFieldsBase; using torch::profiler::impl::Result; using torch::profiler::impl::shapesToStr; using torch::profiler::impl::stacksToStr; using torch::profiler::impl::TensorMetadata; -using torch::profiler::impl::TensorMetadata; -using torch::autograd::profiler::KinetoEvent; -using torch::autograd::profiler::post_process_t; -using torch::autograd::profiler::experimental_event_t; -using torch::autograd::profiler::ProfilerResult; 
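// [editor's note] The reshuffled #include and using-declaration blocks above
// are clang-format's include/using sorting at work. The dipu/.clang-format
// added by this patch is not visible in this excerpt; a plausible minimal
// configuration reproducing the style seen throughout (2-space indent,
// ` public:` at a -1 offset, 80-column wrapping, right-bound `&`/`*`) would
// be something like the following keys (an illustration, not the project's
// actual file):
//
//   BasedOnStyle: Google
//   DerivePointerAlignment: false
//   PointerAlignment: Right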
-auto shapesAndDtypes(const std::vector& inputs) { +auto shapesAndDtypes(const std::vector &inputs) { std::vector> shapes; std::vector dtypes; - for (const auto& i : inputs) { - c10::visit( - c10::overloaded( - [&](const TensorMetadata& t) { - shapes.emplace_back(t.sizes_); - dtypes.emplace_back(scalarTypeToTypeMeta(t.dtype_).name()); - }, - [&](const std::vector&) { - shapes.emplace_back(); - dtypes.emplace_back("TensorList"); - }, - [&](const c10::IValue&) { - shapes.emplace_back(); - dtypes.emplace_back("Scalar"); - }, - [&](const auto&) { - shapes.emplace_back(); - dtypes.emplace_back(); - }), - i); + for (const auto &i : inputs) { + c10::visit(c10::overloaded( + [&](const TensorMetadata &t) { + shapes.emplace_back(t.sizes_); + dtypes.emplace_back(scalarTypeToTypeMeta(t.dtype_).name()); + }, + [&](const std::vector &) { + shapes.emplace_back(); + dtypes.emplace_back("TensorList"); + }, + [&](const c10::IValue &) { + shapes.emplace_back(); + dtypes.emplace_back("Scalar"); + }, + [&](const auto &) { + shapes.emplace_back(); + dtypes.emplace_back(); + }), + i); } return std::make_pair(shapes, dtypes); } struct MetadataBase { - MetadataBase(const std::shared_ptr& result) + MetadataBase(const std::shared_ptr &result) : kineto_activity_{result->kineto_activity_} { if (c10::holds_alternative>( result->extra_fields_)) { @@ -100,69 +98,65 @@ struct MetadataBase { } } - void addMetadata(const std::string& key, const std::string& value) { + void addMetadata(const std::string &key, const std::string &value) { if (kineto_activity_ && !value.empty() && value != "\"\"") { torch::profiler::impl::kineto::addMetadata(kineto_activity_, key, value); } } - bool hasKinetoActivity() const { - return kineto_activity_ != nullptr; - } + bool hasKinetoActivity() const { return kineto_activity_ != nullptr; } private: - const torch::profiler::impl::kineto::activity_t* kineto_activity_{nullptr}; + const torch::profiler::impl::kineto::activity_t *kineto_activity_{nullptr}; }; struct AddTensorboardFields : public MetadataBase { - AddTensorboardFields( - const std::shared_ptr& result, - KinetoEvent& kineto_event) + AddTensorboardFields(const std::shared_ptr &result, + KinetoEvent &kineto_event) : MetadataBase(result) { result->visit(*this); const auto module_hierarchy = kineto_event.moduleHierarchy(); addMetadata("Module Hierarchy", stacksToStr(module_hierarchy.vec(), ".")); addMetadata("Call stack", stacksToStr(kineto_event.stack().vec(), ";")); - result->visit_if_base([&, this](const auto& i) -> void { + result->visit_if_base([&, this](const auto &i) -> void { this->addMetadata("Python id", std::to_string(i.id_)); c10::optional parent_id; std::shared_ptr parent = result->parent_.lock(); while (parent && !parent_id.has_value()) { parent->visit_if_base( - [&](const auto& j) { parent_id = std::to_string(j.id_); }); + [&](const auto &j) { parent_id = std::to_string(j.id_); }); parent = parent->parent_.lock(); } this->addMetadata("Python parent id", parent_id.value_or("null")); }); } - void operator()(const ExtraFields& py_call) { + void operator()(const ExtraFields &py_call) { if (py_call.module_.has_value()) { addMetadata("Python module id", std::to_string(py_call.module_->id_)); } } template - void operator()(const T&) {} + void operator()(const T &) {} }; struct AddGenericMetadata : public MetadataBase { - AddGenericMetadata( - std::shared_ptr& result, - const torch::profiler::impl::ProfilerConfig* config) + AddGenericMetadata(std::shared_ptr &result, + const torch::profiler::impl::ProfilerConfig *config) : 
MetadataBase(result), config_(config) { result->visit(*this); if (config->experimental_config.verbose) { result->visit_if_base( - [&, this](const auto& i) -> void { + [&, this](const auto &i) -> void { this->addMetadata("Python thread", std::to_string(i.python_tid_)); }); } } - void operator()(ExtraFields& op_event) { + void operator()(ExtraFields &op_event) { const auto shapes_and_dtypes = shapesAndDtypes(op_event.inputs_); if (!shapes_and_dtypes.first.empty()) { addMetadata("Input Dims", shapesToStr(shapes_and_dtypes.first)); @@ -173,11 +167,10 @@ struct AddGenericMetadata : public MetadataBase { } if (config_ && !config_->experimental_config.performance_events.empty()) { - auto& event_names = config_->experimental_config.performance_events; + auto &event_names = config_->experimental_config.performance_events; for (auto i = 0; i < op_event.perf_event_counters_->size(); ++i) { - addMetadata( - event_names[i], - std::to_string((*op_event.perf_event_counters_)[i])); + addMetadata(event_names[i], + std::to_string((*op_event.perf_event_counters_)[i])); } } @@ -189,13 +182,13 @@ struct AddGenericMetadata : public MetadataBase { } } - void operator()(ExtraFields& backend_event) { + void operator()(ExtraFields &backend_event) { if (!backend_event.backend_.empty()) { addMetadata("Backend", "\"" + backend_event.backend_ + "\""); } } - void operator()(const ExtraFields& alloc) { + void operator()(const ExtraFields &alloc) { addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_)); addMetadata("Device Id", std::to_string(alloc.device_index_)); addMetadata("Addr", std::to_string(reinterpret_cast(alloc.ptr_))); @@ -204,7 +197,7 @@ struct AddGenericMetadata : public MetadataBase { addMetadata("Total Reserved", std::to_string(alloc.total_reserved_)); } - void operator()(const ExtraFields& alloc) { + void operator()(const ExtraFields &alloc) { addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_)); addMetadata("Device Id", std::to_string(alloc.device_index_)); addMetadata("Bytes", std::to_string(alloc.alloc_size_)); @@ -213,11 +206,11 @@ struct AddGenericMetadata : public MetadataBase { } template - void operator()(const T&) {} + void operator()(const T &) {} private: /* To get names of the performance events */ - const torch::profiler::impl::ProfilerConfig* config_; + const torch::profiler::impl::ProfilerConfig *config_; }; // Assumption: Total threads number will not exceed 2^16-1, and total ops will // not exceed 2^48 -1. 
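// [editor's note] Given the 2^16-1 / 2^48-1 bounds stated just above, the
// forward-thread key presumably packs a 16-bit thread id above a 48-bit
// sequence number. A sketch of that packing, under that assumption (the real
// shift layout lives in the elided body of getForwardThreadKey):
#include <cassert>
#include <cstdint>

inline uint64_t packForwardKey(uint64_t tid, uint64_t seqNr) {
  assert(tid < (1ULL << 16) && seqNr < (1ULL << 48));
  return (tid << 48) | seqNr;  // high 16 bits: tid, low 48 bits: seqNr
}

inline uint64_t tidOf(uint64_t key) { return key >> 48; }
inline uint64_t seqOf(uint64_t key) { return key & ((1ULL << 48) - 1); }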
@@ -227,64 +220,48 @@ static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) { struct DIPUKinetoThreadLocalState : public ProfilerStateBase { explicit DIPUKinetoThreadLocalState( - const torch::profiler::impl::ProfilerConfig& config, + const torch::profiler::impl::ProfilerConfig &config, std::set activities) : ProfilerStateBase(config), start_time_(getTimeUs()), record_queue_(config, std::move(activities)) {} ~DIPUKinetoThreadLocalState() override = default; - static DIPUKinetoThreadLocalState* get(bool global) { - auto* state = ProfilerStateBase::get(global); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - state == nullptr || - state->profilerType() == ActiveProfilerType::KINETO); - return static_cast(state); + static DIPUKinetoThreadLocalState *get(bool global) { + auto *state = ProfilerStateBase::get(global); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state == nullptr || + state->profilerType() == + ActiveProfilerType::KINETO); + return static_cast(state); } ActiveProfilerType profilerType() override { return ActiveProfilerType::KINETO; } - void reportMemoryUsage( - void* ptr, - int64_t alloc_size, - size_t total_allocated, - size_t total_reserved, - c10::Device device) override { + void reportMemoryUsage(void *ptr, int64_t alloc_size, size_t total_allocated, + size_t total_reserved, c10::Device device) override { if (config_.profile_memory && !config_.disabled()) { record_queue_.getSubqueue()->emplace_allocation_event( - torch::profiler::impl::getApproximateTime(), - ptr, - alloc_size, - total_allocated, - total_reserved, - device.type(), - device.index()); + torch::profiler::impl::getApproximateTime(), ptr, alloc_size, + total_allocated, total_reserved, device.type(), device.index()); } } - void reportOutOfMemory( - int64_t alloc_size, - size_t total_allocated, - size_t total_reserved, - c10::Device device) override { + void reportOutOfMemory(int64_t alloc_size, size_t total_allocated, + size_t total_reserved, c10::Device device) override { if (config_.profile_memory && !config_.disabled()) { record_queue_.getSubqueue()->emplace_ooms_event( - torch::profiler::impl::getApproximateTime(), - alloc_size, - total_allocated, - total_reserved, - device.type(), - device.index()); + torch::profiler::impl::getApproximateTime(), alloc_size, + total_allocated, total_reserved, device.type(), device.index()); } } - const post_process_t& getEventPostProcessingCallback() const { + const post_process_t &getEventPostProcessingCallback() const { return event_post_process_cb_; } - void setEventPostProcessingCallback(post_process_t&& cb) { + void setEventPostProcessingCallback(post_process_t &&cb) { event_post_process_cb_ = std::move(cb); } @@ -303,33 +280,31 @@ struct DIPUKinetoThreadLocalState : public ProfilerStateBase { // `kineto_events_` does not include Python events. Instead it exposes them // via the `stacks` property. 
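// [editor's note] The call below is the standard erase-remove idiom:
// std::remove_if compacts the elements to keep at the front of the range and
// returns the new logical end; erase() then discards the tail. A minimal
// standalone example of the same filtering:
#include <algorithm>
#include <vector>

void dropOdd(std::vector<int> &v) {
  v.erase(std::remove_if(v.begin(), v.end(),
                         [](int x) { return x % 2 != 0; }),
          v.end());
}
// Here the predicate is i.isPythonFunction(), so Python events are filtered
// out of kineto_events_ while remaining reachable through the event tree.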
kineto_events_.erase( - std::remove_if( - kineto_events_.begin(), - kineto_events_.end(), - [](const auto& i) { return i.isPythonFunction(); }), + std::remove_if(kineto_events_.begin(), kineto_events_.end(), + [](const auto &i) { return i.isPythonFunction(); }), kineto_events_.end()); return std::move(records_and_trace.second); } template - void invokeCallback(T& t) { + void invokeCallback(T &t) { if (event_post_process_cb_) { event_post_process_cb_(t.debug_handle_, t.jit_stack_, t.jit_modules_); } } - void materializeOpEvents(std::vector>& events) { - for (auto& e : events) { + void materializeOpEvents(std::vector> &events) { + for (auto &e : events) { if (e->parent_.expired()) { event_tree_.push_back(e); } if (e->finished_) { e->visit(c10::overloaded( - [this](ExtraFields& i) { invokeCallback(i); }, - [this](ExtraFields& i) { invokeCallback(i); }, - [](auto&) {})); + [this](ExtraFields &i) { invokeCallback(i); }, + [this](ExtraFields &i) { invokeCallback(i); }, + [](auto &) {})); kineto_events_.emplace_back(e, config_.experimental_config.verbose); AddTensorboardFields add_tb(e, kineto_events_.back()); @@ -342,18 +317,17 @@ struct DIPUKinetoThreadLocalState : public ProfilerStateBase { } void generateForwardBackwardLink( - const KinetoEvent& kineto_event, - uint64_t& fwd_bwd_link_id, - libkineto::GenericTraceActivity& activity, - std::unordered_map& - tidSeq2activity) { + const KinetoEvent &kineto_event, uint64_t &fwd_bwd_link_id, + libkineto::GenericTraceActivity &activity, + std::unordered_map + &tidSeq2activity) { if (kineto_event.fwdThreadId() > 0) { // act is backward op. - uint64_t key = getForwardThreadKey( - kineto_event.fwdThreadId(), kineto_event.sequenceNr()); + uint64_t key = getForwardThreadKey(kineto_event.fwdThreadId(), + kineto_event.sequenceNr()); auto iter = tidSeq2activity.find(key); if (iter != tidSeq2activity.end()) { - libkineto::GenericTraceActivity* fwd = iter->second; + libkineto::GenericTraceActivity *fwd = iter->second; fwd->flow.start = true; activity.flow.id = fwd->flow.id = fwd_bwd_link_id; activity.flow.type = fwd->flow.type = libkineto::kLinkFwdBwd; @@ -361,8 +335,8 @@ struct DIPUKinetoThreadLocalState : public ProfilerStateBase { } } else if (kineto_event.startThreadId() != 0) { // act is forward op. - uint64_t key = getForwardThreadKey( - kineto_event.startThreadId(), kineto_event.sequenceNr()); + uint64_t key = getForwardThreadKey(kineto_event.startThreadId(), + kineto_event.sequenceNr()); // Assumption: Among all ops with same sequence number, // the one with biggest start time is most likely launching backward op. auto iter = tidSeq2activity.find(key); @@ -388,11 +362,11 @@ struct DIPUKinetoThreadLocalState : public ProfilerStateBase { std::vector event_tree_; // Optional, if event post-processing is enabled. 
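// (Invoked as event_post_process_cb_(debug_handle_, jit_stack_, jit_modules_)
// from invokeCallback during materializeOpEvents above; installed through
// setEventPostProcessingCallback.)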
post_process_t event_post_process_cb_; -}; +}; template std::unique_ptr onFunctionEnter( - const at::RecordFunction& fn) { + const at::RecordFunction &fn) { auto state_ptr = DIPUKinetoThreadLocalState::get(use_global_state_ptr); if (!state_ptr) { return nullptr; @@ -402,16 +376,15 @@ std::unique_ptr onFunctionEnter( // @lint-ignore CLANGTIDY clang-diagnostic-unused-parameter template -void onFunctionExit( - const at::RecordFunction& fn, - at::ObserverContext* ctx_ptr) { +void onFunctionExit(const at::RecordFunction &fn, + at::ObserverContext *ctx_ptr) { auto state_ptr = DIPUKinetoThreadLocalState::get(use_global_state_ptr); if (!state_ptr) { return; } - const auto& config = state_ptr->config(); - auto* kineto_ctx_ptr = - static_cast(ctx_ptr); + const auto &config = state_ptr->config(); + auto *kineto_ctx_ptr = + static_cast(ctx_ptr); TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); kineto_ctx_ptr->event_->end_time_ = torch::profiler::impl::getApproximateTime(); @@ -421,7 +394,7 @@ void onFunctionExit( } kineto_ctx_ptr->event_->basic_fields_.end_tid_ = at::RecordFunction::currentThreadId(); - + if (fn.scope() == at::RecordScope::USER_SCOPE) { libkineto::api().activityProfiler().popUserCorrelationId(); } else { @@ -430,14 +403,13 @@ void onFunctionExit( } template -void pushProfilingCallbacks(const std::unordered_set& scopes) { +void pushProfilingCallbacks(const std::unordered_set &scopes) { auto registration_state_ptr = DIPUKinetoThreadLocalState::get(use_global_callback); TORCH_INTERNAL_ASSERT(registration_state_ptr, "Expected profiler state set"); auto recordFunctionCallback = - at::RecordFunctionCallback( - onFunctionEnter, - onFunctionExit) + at::RecordFunctionCallback(onFunctionEnter, + onFunctionExit) .needsInputs(registration_state_ptr->config().report_input_shapes) .scopes(scopes); @@ -447,11 +419,12 @@ void pushProfilingCallbacks(const std::unordered_set& scopes) { registration_state_ptr->setCallbackHandle(handle); } -} // namespace +} // namespace -static void prepareTrace(const bool cpuOnly, - const std::set& activities, - const torch::profiler::impl::ExperimentalConfig& config) { +static void prepareTrace( + const bool cpuOnly, + const std::set &activities, + const torch::profiler::impl::ExperimentalConfig &config) { if (!libkineto::api().isProfilerRegistered()) { libkineto_init(/*cpuOnly=*/cpuOnly, /*logOnError=*/true); libkineto::api().suppressLogMessages(); @@ -472,28 +445,27 @@ static void prepareTrace(const bool cpuOnly, } void prepareProfiler( - const torch::profiler::impl::ProfilerConfig& config, - const std::set& activities) { - TORCH_CHECK( - config.state == ProfilerState::KINETO || - config.state == ProfilerState::KINETO_GPU_FALLBACK, - "Supported only in Kineto profiler"); + const torch::profiler::impl::ProfilerConfig &config, + const std::set &activities) { + TORCH_CHECK(config.state == ProfilerState::KINETO || + config.state == ProfilerState::KINETO_GPU_FALLBACK, + "Supported only in Kineto profiler"); bool cpuOnly = (devproxy::getDeviceCount() <= 0); prepareTrace(cpuOnly, activities, config.experimental_config); if (!config.experimental_config.performance_events.empty()) { /* For now only CPU activity is supported */ - TORCH_CHECK( - activities.count(torch::profiler::impl::ActivityType::CPU), - "Cannot run cpu hardware profiler without CPU activities, please only use CPU activity type"); + TORCH_CHECK(activities.count(torch::profiler::impl::ActivityType::CPU), + "Cannot run cpu hardware profiler without CPU activities, " + "please only use CPU activity type"); /* 
* Sending a warning and passing the non-standard event to the backend * Backend can abort if the event is not supported. * TODO Should we gracefully drop the invalid event if we have atleast one * valid? */ - auto is_standard_event = [](const std::string& event) -> bool { + auto is_standard_event = [](const std::string &event) -> bool { for (auto e : torch::profiler::ProfilerPerfEvents) { if (!std::strcmp(event.c_str(), e)) { return true; @@ -502,7 +474,7 @@ void prepareProfiler( return false; }; - for (const auto& e : config.experimental_config.performance_events) { + for (const auto &e : config.experimental_config.performance_events) { if (!is_standard_event(e)) { TORCH_WARN("Forwarding a non-standard CPU performance event : ", e); } @@ -510,10 +482,11 @@ void prepareProfiler( } } void enableProfiler( - const torch::profiler::impl::ProfilerConfig& config, - const std::set& activities, - const std::unordered_set& scopes) { - const auto has_cpu = activities.count(torch::profiler::impl::ActivityType::CPU); + const torch::profiler::impl::ProfilerConfig &config, + const std::set &activities, + const std::unordered_set &scopes) { + const auto has_cpu = + activities.count(torch::profiler::impl::ActivityType::CPU); TORCH_CHECK( DIPUKinetoThreadLocalState::get(/*global=*/config.global()) == nullptr, "Profiler is already enabled", @@ -521,7 +494,8 @@ void enableProfiler( TORCH_CHECK(config.state == ProfilerState::KINETO || config.global()); TORCH_CHECK(!activities.empty(), "No activities specified."); - TORCH_INTERNAL_ASSERT(has_cpu || !config.global(), "Ondemand profiling must enable CPU tracing"); + TORCH_INTERNAL_ASSERT(has_cpu || !config.global(), + "Ondemand profiling must enable CPU tracing"); DIPUKinetoThreadLocalState::push( std::make_shared(config, activities)); @@ -534,8 +508,9 @@ void enableProfiler( if (!config.global()) { libkineto::api().activityProfiler().startTrace(); } - - const auto has_device = activities.count(torch::profiler::impl::ActivityType::CUDA); + + const auto has_device = + activities.count(torch::profiler::impl::ActivityType::CUDA); if (has_device) { setProfileOpen(true); } @@ -543,11 +518,9 @@ void enableProfiler( std::unique_ptr disableProfiler() { auto state_ptr = ProfilerStateBase::pop(); - const auto& config = state_ptr->config(); - TORCH_CHECK( - state_ptr && - (config.state == ProfilerState::KINETO), - "Can't disable Kineto profiler when it's not running"); + const auto &config = state_ptr->config(); + TORCH_CHECK(state_ptr && (config.state == ProfilerState::KINETO), + "Can't disable Kineto profiler when it's not running"); state_ptr->removeCallback(); @@ -565,19 +538,19 @@ std::unique_ptr disableProfiler() { auto trace = kineto_state_ptr->finalizeTrace(); result = std::make_unique( kineto_state_ptr->start_time_, - std::move(kineto_state_ptr->kineto_events_), - std::move(trace), + std::move(kineto_state_ptr->kineto_events_), std::move(trace), std::move(kineto_state_ptr->event_tree_)); } return result; } -void addMetadataJson(const std::string& key, const std::string& value) { +void addMetadataJson(const std::string &key, const std::string &value) { if (libkineto::api().isProfilerInitialized()) { libkineto::api().activityProfiler().addMetadata(key, value); } else { - DIPU_LOG << "Profiler is not initialized: skipping profiling metadata" << std::endl; + DIPU_LOG << "Profiler is not initialized: skipping profiling metadata" + << std::endl; } } @@ -585,7 +558,8 @@ void profilerStep() { if (libkineto::api().isProfilerInitialized()) { 
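// [editor's note] step() marks an iteration boundary for kineto's
// schedule-based (on-demand) tracing; presumably this is what the Python-side
// profiler step ultimately reaches when the DIPU profiler is active.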
libkineto::api().activityProfiler().step(); } else { - DIPU_LOG << "Profiler is not initialized: skipping step() invocation" << std::endl; + DIPU_LOG << "Profiler is not initialized: skipping step() invocation" + << std::endl; } } diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.h b/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.h index 64fe31b13..8f92e01a4 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.h +++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.h @@ -1,29 +1,29 @@ #pragma once #include -#include #include +#include #include #include -#include #include +#include namespace dipu { namespace profile { void prepareProfiler( - const torch::profiler::impl::ProfilerConfig& config, - const std::set& activities); + const torch::profiler::impl::ProfilerConfig &config, + const std::set &activities); void enableProfiler( - const torch::profiler::impl::ProfilerConfig& config, - const std::set& activities, - const std::unordered_set& scopes = {}); + const torch::profiler::impl::ProfilerConfig &config, + const std::set &activities, + const std::unordered_set &scopes = {}); std::unique_ptr disableProfiler(); -void addMetadataJson(const std::string& key, const std::string& value); +void addMetadataJson(const std::string &key, const std::string &value); void profilerStep(); diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler_python.cpp b/dipu/torch_dipu/csrc_dipu/profiler/profiler_python.cpp index b7ad4a698..7d3f2cd83 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/profiler_python.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler_python.cpp @@ -9,9 +9,6 @@ #include #include -#include -#include - #include #include #include @@ -28,6 +25,9 @@ #include #include +#include +#include + #include "collection.h" namespace py = pybind11; @@ -35,25 +35,25 @@ namespace py = pybind11; namespace dipu { namespace profile { -using torch::profiler::impl::PyFrameState; +using torch::profiler::impl::AppendOnlyList; +using torch::profiler::impl::approx_time_t; using torch::profiler::impl::EventType; using torch::profiler::impl::ExtraFields; -using torch::profiler::impl::RawTensorMetadata; -using torch::profiler::impl::TensorMetadata; -using torch::profiler::impl::python_tracer::PythonTracerBase; -using torch::profiler::impl::python_tracer::CompressedEvent; -using torch::profiler::impl::kineto::DeviceAndResource; -using torch::profiler::impl::approx_time_t; -using torch::profiler::impl::AppendOnlyList; -using torch::profiler::impl::PyModuleSelf; -using torch::profiler::impl::PyModuleCls; -using torch::profiler::impl::PyMethod; +using torch::profiler::impl::getApproximateTime; using torch::profiler::impl::NNModuleInfo; -using torch::profiler::impl::PyOptimizerSelf; -using torch::profiler::impl::PyOptimizerCls; using torch::profiler::impl::OptimizerInfo; +using torch::profiler::impl::PyFrameState; +using torch::profiler::impl::PyMethod; +using torch::profiler::impl::PyModuleCls; +using torch::profiler::impl::PyModuleSelf; +using torch::profiler::impl::PyOptimizerCls; +using torch::profiler::impl::PyOptimizerSelf; +using torch::profiler::impl::RawTensorMetadata; using torch::profiler::impl::Result; -using torch::profiler::impl::getApproximateTime; +using torch::profiler::impl::TensorMetadata; +using torch::profiler::impl::kineto::DeviceAndResource; +using torch::profiler::impl::python_tracer::CompressedEvent; +using torch::profiler::impl::python_tracer::PythonTracerBase; namespace { enum CallType { PyCall = 0, PyModuleCall, PyCCall, PyOptimizerCall }; @@ -66,28 +66,28 @@ 
static constexpr uint64_t NoTID = std::numeric_limits::max(); // ============================================================================ struct CodeLocation { CodeLocation() = default; - explicit CodeLocation(PyFrameObject* frame) + explicit CodeLocation(PyFrameObject *frame) : line_number_{PyFrame_GetLineNumber(frame)} { auto code = THPCodeObjectPtr(PyFrame_GetCode(frame)); filename_ = THPUtils_unpackStringView(code->co_filename).data(); name_ = THPUtils_unpackStringView(code->co_name).data(); } - bool operator==(const CodeLocation& other) const { + bool operator==(const CodeLocation &other) const { return filename_ == other.filename_ && name_ == other.name_ && - line_number_ == other.line_number_; + line_number_ == other.line_number_; } - const char* filename_{nullptr}; - const char* name_{nullptr}; + const char *filename_{nullptr}; + const char *name_{nullptr}; int line_number_{0}; }; template -PyCodeObject* getCode(); +PyCodeObject *getCode(); template <> -PyCodeObject* getCode() { +PyCodeObject *getCode() { static auto module_call_code = []() { pybind11::gil_scoped_acquire gil; auto res = py::module::import("torch.nn") @@ -96,13 +96,13 @@ PyCodeObject* getCode() { .attr("__code__") .ptr(); TORCH_INTERNAL_ASSERT(PyCode_Check(res)); - return (PyCodeObject*)res; + return (PyCodeObject *)res; }(); return module_call_code; }; template <> -PyCodeObject* getCode() { +PyCodeObject *getCode() { static auto optimizer_step_code = []() { pybind11::gil_scoped_acquire gil; auto res = py::module::import("torch.optim") @@ -111,18 +111,18 @@ PyCodeObject* getCode() { .attr("__code__") .ptr(); TORCH_INTERNAL_ASSERT(PyCode_Check(res)); - return (PyCodeObject*)res; + return (PyCodeObject *)res; }(); return optimizer_step_code; }; -} // namespace -} // namespace profile -} // namespace dipu +} // namespace +} // namespace profile +} // namespace dipu template <> struct std::hash { - size_t operator()(const dipu::profile::CodeLocation& x) { + size_t operator()(const dipu::profile::CodeLocation &x) { return c10::get_hash(x.filename_, x.name_, x.line_number_); } }; @@ -146,7 +146,7 @@ class CallTypeHelper final { std::index_sequence); template - static void map(T& t, FunctorT& f, Args&&... args) { + static void map(T &t, FunctorT &f, Args &&...args) { f(std::get(t), args...); c10::guts::if_constexpr( [&](auto _) { map(_(t), f, std::forward(args)...); }); @@ -156,7 +156,7 @@ class CallTypeHelper final { using tuple_type = decltype(make_tuple_impl(std::make_index_sequence{})); template - static void map(tuple_type& t, FunctorT& f, Args&&... 
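The getCode specializations above resolve a Python function once (the torch.nn and torch.optim imports) and cache the resulting code object in a function-local static initialized by an immediately invoked lambda, under the GIL. The idiom in isolation; expensiveLookup stands in for the GIL-guarded import chain:

#include <cstdio>

int expensiveLookup() {
  // Stands in for the py::module::import(...).attr(...) chain above.
  std::puts("lookup runs exactly once");
  return 42;
}

int cachedCode() {
  // C++11 guarantees thread-safe, one-time initialization of this static,
  // so the lambda (and the lookup) runs at most once per process.
  static const int code = [] { return expensiveLookup(); }();
  return code;
}

int main() {
  std::printf("%d\n", cachedCode());
  std::printf("%d\n", cachedCode());  // cached: no second lookup
}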
args) { + static void map(tuple_type &t, FunctorT &f, Args &&...args) { map<0>(t, f, std::forward(args)...); } }; @@ -226,7 +226,7 @@ template struct ExtendedPyCallConfig { using key_t = Key; using cls_t = Cls; - using ephemeral_t = PyFrameObject*; + using ephemeral_t = PyFrameObject *; struct ClsAndParameters { cls_t cls_; @@ -245,21 +245,19 @@ struct ExtendedPyCallConfig { }; template <> -struct Config : ExtendedPyCallConfig< - PyModuleSelf, - PyModuleCls, - NNModuleInfo::ParameterInfo> {}; +struct Config + : ExtendedPyCallConfig {}; template <> -struct Config : ExtendedPyCallConfig< - PyOptimizerSelf, - PyOptimizerCls, - OptimizerInfo::ParameterInfo> {}; +struct Config + : ExtendedPyCallConfig {}; template <> struct Config { using key_t = PyMethod; - using ephemeral_t = PyObject*; + using ephemeral_t = PyObject *; using cache_t = ska::flat_hash_map; static constexpr EventType event_type = EventType::PyCCall; }; @@ -273,14 +271,13 @@ class Callsite { static constexpr CallType call_type = C; using key_t = typename Config::key_t; - static_assert( - std::is_trivially_copyable::value, - "Key should be trivial, as it is passed by value."); + static_assert(std::is_trivially_copyable::value, + "Key should be trivial, as it is passed by value."); template - Callsite(U value, PyFrameObject* f_back) : value_(value), caller_(f_back) {} + Callsite(U value, PyFrameObject *f_back) : value_(value), caller_(f_back) {} - bool operator==(const Callsite& other) const { + bool operator==(const Callsite &other) const { return value_ == other.value_ && caller_ == other.caller_; } @@ -299,20 +296,19 @@ using PyOptimizerCallKey = Config::key_t; class ValueCache { public: ValueCache() = default; - ValueCache(const ValueCache&) = delete; + ValueCache(const ValueCache &) = delete; template - void store(const typename Config::key_t&, typename Config::ephemeral_t); + void store(const typename Config::key_t &, + typename Config::ephemeral_t); template - auto load(const Callsite& callsite, size_t python_tid) const { + auto load(const Callsite &callsite, size_t python_tid) const { auto caller = load(callsite.caller_); TORCH_INTERNAL_ASSERT(!caller.module_info_.has_value()); return ExtraFields::event_type>{ - /*end_time_ns=*/std::numeric_limits::min(), - python_tid, - caller.frame_state_, - load(callsite.value_)}; + /*end_time_ns=*/std::numeric_limits::min(), python_tid, + caller.frame_state_, load(callsite.value_)}; } c10::optional recordIfTensor(py::handle p); @@ -323,7 +319,7 @@ class ValueCache { private: template typename ExtraFields::event_type>::args_t load( - const typename Config::key_t&) const; + const typename Config::key_t &) const; template using State = typename Config::cache_t; @@ -333,10 +329,9 @@ class ValueCache { template typename Config::cls_t set_class( - ValueCache* value_cache, - typename Config::cache_t& cache, - const typename Config::key_t& key, - const typename Config::ephemeral_t& frame) { + ValueCache *value_cache, typename Config::cache_t &cache, + const typename Config::key_t &key, + const typename Config::ephemeral_t &frame) { if (C10_UNLIKELY(!cache.location_.has_value())) { auto code = THPCodeObjectPtr(PyFrame_GetCode(frame)); TORCH_INTERNAL_ASSERT(code.get() == getCode()); @@ -344,7 +339,7 @@ typename Config::cls_t set_class( value_cache->store(*cache.location_, no_ephemeral_t()); } - auto cls_handle = py::handle((PyObject*)key).attr("__class__"); + auto cls_handle = py::handle((PyObject *)key).attr("__class__"); auto cls = typename Config::cls_t(cls_handle.ptr()); if 
(cache.cls_names_.find(cls) == cache.cls_names_.end()) { cache.cls_names_[cls] = @@ -353,70 +348,65 @@ typename Config::cls_t set_class( return cls; } -TensorMetadata toTensorMetadata(PyObject* self) { +TensorMetadata toTensorMetadata(PyObject *self) { TORCH_INTERNAL_ASSERT(THPVariable_CheckExact(self)); - const auto& t = THPVariable_Unpack(self); + const auto &t = THPVariable_Unpack(self); RawTensorMetadata m{t}; return TensorMetadata{ - m, - t.sizes().vec(), + m, t.sizes().vec(), m.layout_ == at::kStrided ? t.strides().vec() : std::vector()}; } c10::optional ValueCache::recordIfTensor(py::handle p) { return THPVariable_CheckExact(p.ptr()) - ? c10::optional{toTensorMetadata(p.ptr())} - : c10::nullopt; + ? c10::optional{toTensorMetadata(p.ptr())} + : c10::nullopt; } std::vector> ValueCache::unpackTensorMap( py::dict tensor_map) { std::vector> out; - for (auto& it : tensor_map) { - auto* value = it.second.ptr(); + for (auto &it : tensor_map) { + auto *value = it.second.ptr(); if (py::isinstance(it.first) && THPVariable_CheckExact(value)) { - out.emplace_back( - py::cast(it.first), toTensorMetadata(value)); + out.emplace_back(py::cast(it.first), + toTensorMetadata(value)); } } return out; } template <> -void ValueCache::store(const PyCallKey& key, no_ephemeral_t) { - auto& locations = std::get(state_); +void ValueCache::store(const PyCallKey &key, no_ephemeral_t) { + auto &locations = std::get(state_); if (C10_UNLIKELY(locations.find(key) == locations.end())) { - locations[key] = { - key.line_number_, - at::StringView(key.filename_), - at::StringView(key.name_)}; + locations[key] = {key.line_number_, at::StringView(key.filename_), + at::StringView(key.name_)}; } } template <> ExtraFields::args_t ValueCache::load( - const PyCallKey& key) const { + const PyCallKey &key) const { return {std::get(state_).at(key), c10::nullopt}; } template <> void ValueCache::store( - const PyModuleCallKey& key, + const PyModuleCallKey &key, Config::ephemeral_t frame) { - auto& cache = std::get(state_); - if (C10_UNLIKELY( - cache.cls_and_parameters_.find(key) == - cache.cls_and_parameters_.end())) { + auto &cache = std::get(state_); + if (C10_UNLIKELY(cache.cls_and_parameters_.find(key) == + cache.cls_and_parameters_.end())) { auto cls = set_class(this, cache, key, frame); - py::dict params = py::handle((PyObject*)key).attr("_parameters"); + py::dict params = py::handle((PyObject *)key).attr("_parameters"); std::vector params_; - for (auto& it : params) { - auto* p = it.second.ptr(); + for (auto &it : params) { + auto *p = it.second.ptr(); if (py::isinstance(it.first) && THPVariable_CheckExact(p)) { params_.push_back( - {it.first.cast(), - toTensorMetadata(p), + {it.first.cast(), toTensorMetadata(p), recordIfTensor(py::getattr(it.second, "grad", py::none()))}); } } @@ -426,13 +416,13 @@ void ValueCache::store( template <> ExtraFields::args_t ValueCache::load( - const PyModuleCallKey& key) const { - auto& cache = std::get(state_); + const PyModuleCallKey &key) const { + auto &cache = std::get(state_); TORCH_INTERNAL_ASSERT(cache.location_.has_value()); - const auto& cls_and_parameters = cache.cls_and_parameters_.at(key); - const auto& cls = cls_and_parameters.cls_; - NNModuleInfo info{ - key, cls, cache.cls_names_.at(cls), cls_and_parameters.parameters_}; + const auto &cls_and_parameters = cache.cls_and_parameters_.at(key); + const auto &cls = cls_and_parameters.cls_; + NNModuleInfo info{key, cls, cache.cls_names_.at(cls), + cls_and_parameters.parameters_}; return { 
/*frame_state_=*/std::get(state_).at(*cache.location_), /*module_info_=*/std::move(info), @@ -441,18 +431,17 @@ ExtraFields::args_t ValueCache::load( template <> void ValueCache::store( - const PyOptimizerCallKey& key, + const PyOptimizerCallKey &key, Config::ephemeral_t frame) { - auto& cache = std::get(state_); - if (C10_UNLIKELY( - cache.cls_and_parameters_.find(key) == - cache.cls_and_parameters_.end())) { + auto &cache = std::get(state_); + if (C10_UNLIKELY(cache.cls_and_parameters_.find(key) == + cache.cls_and_parameters_.end())) { auto cls = set_class(this, cache, key, frame); - const py::handle self{(PyObject*)key}; + const py::handle self{(PyObject *)key}; std::vector params; - for (const auto& i : (py::list)self.attr("param_groups")) { - for (auto& param : py::cast(i).attr("get")("params")) { + for (const auto &i : (py::list)self.attr("param_groups")) { + for (auto ¶m : py::cast(i).attr("get")("params")) { if (THPVariable_CheckExact(param.ptr())) { // While `self.state` is permitted to store data in an arbitrary way, // all generic optimizers (SGD, Adam, etc) use param as the key since @@ -472,13 +461,14 @@ void ValueCache::store( } template <> -ExtraFields::args_t ValueCache::load< - CallType::PyOptimizerCall>(const PyOptimizerCallKey& key) const { - auto& cache = std::get(state_); - const auto& cls_and_parameters = cache.cls_and_parameters_.at(key); +ExtraFields::args_t +ValueCache::load( + const PyOptimizerCallKey &key) const { + auto &cache = std::get(state_); + const auto &cls_and_parameters = cache.cls_and_parameters_.at(key); auto cls = cls_and_parameters.cls_; - OptimizerInfo info{ - key, cls, cache.cls_names_.at(cls), cls_and_parameters.parameters_}; + OptimizerInfo info{key, cls, cache.cls_names_.at(cls), + cls_and_parameters.parameters_}; return { /*frame_state_=*/std::get(state_).at(*cache.location_), /*module_info_=*/c10::nullopt, @@ -487,9 +477,8 @@ ExtraFields::args_t ValueCache::load< template <> void ValueCache::store( - const PyCCallKey& key, - Config::ephemeral_t arg) { - auto& names = std::get(state_); + const PyCCallKey &key, Config::ephemeral_t arg) { + auto &names = std::get(state_); if (C10_UNLIKELY(names.find(key) == names.end())) { names[key] = at::StringView(py::repr(arg)); } @@ -497,7 +486,7 @@ void ValueCache::store( template <> ExtraFields::args_t ValueCache::load( - const PyCCallKey& key) const { + const PyCCallKey &key) const { return std::get(state_).at(key); } @@ -510,9 +499,9 @@ void ValueCache::trimPrefixes() { .cast>(); }(); - for (auto& it : std::get(state_)) { + for (auto &it : std::get(state_)) { std::string filename = it.second.filename_.str(); - for (const auto& p : prefixes) { + for (const auto &p : prefixes) { if (filename.compare(0, p.size(), p) == 0) { filename.erase(0, p.size()); it.second.filename_ = at::StringView(filename); @@ -535,15 +524,14 @@ TraceKey nextKey() { template struct TraceKeyCacheState { struct Hash { - size_t operator()(const Callsite& key) { + size_t operator()(const Callsite &key) { return c10::get_hash(key.value_, key.caller_); } }; - TraceKey intern( - Callsite callsite, - typename Config::ephemeral_t ephemeral, - ValueCache& value_cache) { + TraceKey intern(Callsite callsite, + typename Config::ephemeral_t ephemeral, + ValueCache &value_cache) { auto it = state_.find(callsite); if (C10_UNLIKELY(it == state_.end())) { value_cache.store(callsite.value_, ephemeral); @@ -553,10 +541,9 @@ struct TraceKeyCacheState { return it->second; } - auto lookup(Callsite& callsite, ValueCache& value_cache) const { - return 
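TraceKeyCacheState::intern below hands out a small integer TraceKey per distinct callsite and stores the callsite's payload in the ValueCache only on first sight. A self-contained sketch of that interning pattern, with a string key standing in for the Callsite:

#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

using TraceKey = std::uint64_t;

TraceKey intern(std::unordered_map<std::string, TraceKey>& cache,
                const std::string& callsite) {
  auto it = cache.find(callsite);
  if (it == cache.end()) {
    // First sighting: assign the next key. The real code also stores the
    // callsite's payload into the ValueCache at this point.
    it = cache.emplace(callsite, static_cast<TraceKey>(cache.size())).first;
  }
  return it->second;
}

int main() {
  std::unordered_map<std::string, TraceKey> cache;
  std::printf("%llu %llu %llu\n",
              static_cast<unsigned long long>(intern(cache, "a.py:3")),
              static_cast<unsigned long long>(intern(cache, "b.py:7")),
              static_cast<unsigned long long>(intern(cache, "a.py:3")));  // 0 1 0
}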
std::make_pair( - value_cache.load(callsite.value_), - value_cache.load(callsite.caller_)); + auto lookup(Callsite &callsite, ValueCache &value_cache) const { + return std::make_pair(value_cache.load(callsite.value_), + value_cache.load(callsite.caller_)); } ska::flat_hash_map, TraceKey, Hash> state_; @@ -571,50 +558,50 @@ struct TraceKeyCacheState { struct ThreadLocalResults; struct TraceContext { PyObject_HEAD; - ThreadLocalResults* thread_local_results_; + ThreadLocalResults *thread_local_results_; }; // CPython boilerplate to define `TraceContext` as a proper python object. static PyTypeObject TraceContextType = { PyVarObject_HEAD_INIT(nullptr, 0) "TraceContext", /* tp_name */ - sizeof(TraceContext), /* tp_basicsize */ - 0, /* tp_itemsize */ - nullptr, /* tp_dealloc */ + sizeof(TraceContext), /* tp_basicsize */ + 0, /* tp_itemsize */ + nullptr, /* tp_dealloc */ 0, - /* tp_vectorcall_offset */ // NOLINT: modernize-use-nullptr - nullptr, /* tp_getattr */ - nullptr, /* tp_setattr */ - nullptr, /* tp_reserved */ - nullptr, /* tp_repr */ - nullptr, /* tp_as_number */ - nullptr, /* tp_as_sequence */ - nullptr, /* tp_as_mapping */ - nullptr, /* tp_hash */ - nullptr, /* tp_call */ - nullptr, /* tp_str */ - nullptr, /* tp_getattro */ - nullptr, /* tp_setattro */ - nullptr, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - "Python tracer TLS", /* tp_doc */ - nullptr, /* tp_traverse */ - nullptr, /* tp_clear */ - nullptr, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - nullptr, /* tp_iter */ - nullptr, /* tp_iternext */ - nullptr, /* tp_methods */ - nullptr, /* tp_members */ - nullptr, /* tp_getset */ - nullptr, /* tp_base */ - nullptr, /* tp_dict */ - nullptr, /* tp_descr_get */ - nullptr, /* tp_descr_set */ - 0, /* tp_dictoffset */ - nullptr, /* tp_init */ - nullptr, /* tp_alloc */ - PyType_GenericNew, /* tp_new */ - nullptr /* tp_free */ + /* tp_vectorcall_offset */ // NOLINT: modernize-use-nullptr + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + nullptr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + "Python tracer TLS", /* tp_doc */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ + nullptr, /* tp_methods */ + nullptr, /* tp_members */ + nullptr, /* tp_getset */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ + 0, /* tp_dictoffset */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ + nullptr /* tp_free */ }; class gil_and_restore_thread { @@ -631,13 +618,11 @@ class gil_and_restore_thread { } } - PyThreadState* initial_thread_state() const { - return initial_thread_state_; - } + PyThreadState *initial_thread_state() const { return initial_thread_state_; } private: pybind11::gil_scoped_acquire gil_; - PyThreadState* initial_thread_state_; + PyThreadState *initial_thread_state_; }; // ============================================================================ @@ -645,26 +630,22 @@ class gil_and_restore_thread { // ============================================================================ class DIPUPythonTracer; struct ThreadLocalResults { - ThreadLocalResults( - 
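gil_and_restore_thread above is an RAII guard: it captures the current PyThreadState while holding the GIL and swaps it back on destruction if the traced body switched threads. The same shape, with a plain pointer standing in for the interpreter's thread state:

#include <cassert>

struct FakeThreadState { int id; };

// Stands in for the interpreter's notion of "current thread state".
FakeThreadState* g_current = nullptr;

class restore_thread_guard {
 public:
  restore_thread_guard() : initial_(g_current) {}
  ~restore_thread_guard() {
    if (g_current != initial_) {
      g_current = initial_;  // swap the original state back, as the dtor above does
    }
  }
  FakeThreadState* initial_thread_state() const { return initial_; }

 private:
  FakeThreadState* initial_;
};

int main() {
  FakeThreadState a{1};
  FakeThreadState b{2};
  g_current = &a;
  {
    restore_thread_guard guard;
    g_current = &b;  // the body switches threads...
  }                  // ...and the guard restores the original on scope exit
  assert(g_current == &a);
}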
PyThreadState* thread_state, - ValueCache* value_cache, - DIPUPythonTracer* active_tracer) + ThreadLocalResults(PyThreadState *thread_state, ValueCache *value_cache, + DIPUPythonTracer *active_tracer) : thread_state_{thread_state}, - ctx_{(TraceContext*)TraceContextType.tp_alloc(&TraceContextType, 0)}, + ctx_{(TraceContext *)TraceContextType.tp_alloc(&TraceContextType, 0)}, value_cache_{value_cache}, active_tracer_{active_tracer} { ctx_->thread_local_results_ = this; } ThreadLocalResults() = delete; - ThreadLocalResults(const ThreadLocalResults&) = delete; - ThreadLocalResults(ThreadLocalResults&&) = delete; - ThreadLocalResults& operator=(const ThreadLocalResults&) = delete; - ThreadLocalResults& operator=(const ThreadLocalResults&&) = delete; + ThreadLocalResults(const ThreadLocalResults &) = delete; + ThreadLocalResults(ThreadLocalResults &&) = delete; + ThreadLocalResults &operator=(const ThreadLocalResults &) = delete; + ThreadLocalResults &operator=(const ThreadLocalResults &&) = delete; - ~ThreadLocalResults() { - Py_DECREF((PyObject*)ctx_); - } + ~ThreadLocalResults() { Py_DECREF((PyObject *)ctx_); } template TraceKey intern(Ephemeral ephemeral, Args... args) { @@ -677,10 +658,10 @@ struct ThreadLocalResults { static constexpr size_t BLOCK_SIZE = 1024; - PyThreadState* thread_state_; - TraceContext* ctx_; - ValueCache* value_cache_; - DIPUPythonTracer* active_tracer_; + PyThreadState *thread_state_; + TraceContext *ctx_; + ValueCache *value_cache_; + DIPUPythonTracer *active_tracer_; CallTypeHelper::tuple_type trace_keys_; AppendOnlyList exit_times_; AppendOnlyList c_exit_times_; @@ -691,20 +672,16 @@ struct ThreadLocalResults { // ============================================================================ class DIPUPythonTracer final : public PythonTracerBase { public: - DIPUPythonTracer(DIPURecordQueue* queue); + DIPUPythonTracer(DIPURecordQueue *queue); ~DIPUPythonTracer() override; - static int pyProfileFn( - PyObject* obj, - PyFrameObject* frame, - int what, - PyObject* arg); + static int pyProfileFn(PyObject *obj, PyFrameObject *frame, int what, + PyObject *arg); void stop() override; std::vector> getEvents( std::function time_converter, - std::vector& enters, - time_t end_time_ns) override; + std::vector &enters, time_t end_time_ns) override; struct StartFrame { TraceKey trace_key_; @@ -712,36 +689,33 @@ class DIPUPythonTracer final : public PythonTracerBase { }; private: - void recordPyCall( - ThreadLocalResults& tls, - PyFrameObject* frame, - bool is_startup_frame); + void recordPyCall(ThreadLocalResults &tls, PyFrameObject *frame, + bool is_startup_frame); - void recordCCall( - ThreadLocalResults& tls, - PyFrameObject* frame, - PyObject* arg); + void recordCCall(ThreadLocalResults &tls, PyFrameObject *frame, + PyObject *arg); - const std::vector interpreterThreads() const; + const std::vector interpreterThreads() const; std::atomic active_lock_{false}; bool active_{false}; - DIPURecordQueue* queue_; - PyInterpreterState* interpreter_; - PyCodeObject* module_call_code_; - PyCodeObject* optimizer_hook_; + DIPURecordQueue *queue_; + PyInterpreterState *interpreter_; + PyCodeObject *module_call_code_; + PyCodeObject *optimizer_hook_; std::vector start_frames_; std::deque thread_local_results_; ValueCache value_cache_; }; -const std::vector DIPUPythonTracer::interpreterThreads() const { +const std::vector DIPUPythonTracer::interpreterThreads() + const { pybind11::gil_scoped_acquire gil; - std::vector out; + std::vector out; if (SOFT_ASSERT(interpreter_)) { - auto* 
thread_state = PyInterpreterState_ThreadHead(interpreter_); + auto *thread_state = PyInterpreterState_ThreadHead(interpreter_); while (thread_state != nullptr) { out.push_back(thread_state); thread_state = PyThreadState_Next(thread_state); @@ -750,7 +724,7 @@ const std::vector DIPUPythonTracer::interpreterThreads() const { return out; } -DIPUPythonTracer::DIPUPythonTracer(DIPURecordQueue* queue) +DIPUPythonTracer::DIPUPythonTracer(DIPURecordQueue *queue) : queue_(queue), interpreter_(nullptr), module_call_code_(getCode()), @@ -779,7 +753,7 @@ DIPUPythonTracer::DIPUPythonTracer(DIPURecordQueue* queue) PyThreadState_Swap(thread_state); thread_local_results_.emplace_back(thread_state, &value_cache_, this); - auto* ctx = thread_local_results_.back().ctx_; + auto *ctx = thread_local_results_.back().ctx_; // When we begin profiling there are already frames on the Python // interpreter stack. To ensure a complete trace, we must push calls @@ -789,7 +763,7 @@ DIPUPythonTracer::DIPUPythonTracer(DIPURecordQueue* queue) auto frame = PyEval_GetFrame(); Py_XINCREF(frame); - size_t depth = 0; // Make sure we can't infinite loop. + size_t depth = 0; // Make sure we can't infinite loop. while (frame != nullptr) { current_stack.emplace_back(frame); if (++depth == 128) { @@ -812,7 +786,7 @@ DIPUPythonTracer::DIPUPythonTracer(DIPURecordQueue* queue) // Note: // This profile will not compose with other CPython profilers, and // cannot be round tripped via `sys.settrace(sys.gettrace())` - PyEval_SetProfile(DIPUPythonTracer::pyProfileFn, (PyObject*)ctx); + PyEval_SetProfile(DIPUPythonTracer::pyProfileFn, (PyObject *)ctx); } }; @@ -839,10 +813,9 @@ DIPUPythonTracer::~DIPUPythonTracer() { } } -void DIPUPythonTracer::recordPyCall( - ThreadLocalResults& tls, - PyFrameObject* frame, - bool is_startup_frame) { +void DIPUPythonTracer::recordPyCall(ThreadLocalResults &tls, + PyFrameObject *frame, + bool is_startup_frame) { static constexpr auto E = EventType::PyCall; const auto key = [&]() -> TraceKey { auto code = THPCodeObjectPtr(PyFrame_GetCode(frame)); @@ -861,16 +834,16 @@ void DIPUPythonTracer::recordPyCall( Py_INCREF(self.get()); auto back = THPFrameObjectPtr(PyFrame_GetBack(frame)); TORCH_INTERNAL_ASSERT(back != nullptr); - return tls.intern( - frame, self.get(), back.get()); + return tls.intern(frame, self.get(), + back.get()); } else if (code.get() == optimizer_hook_) { auto locals = THPObjectPtr(PyFrame_GetLocals(frame)); auto self = THPObjectPtr(PyDict_GetItemString(locals, "self")); Py_INCREF(self.get()); auto back = THPFrameObjectPtr(PyFrame_GetBack(frame)); TORCH_INTERNAL_ASSERT(back != nullptr); - return tls.intern( - frame, self.get(), back.get()); + return tls.intern(frame, self.get(), + back.get()); } else { auto back = THPFrameObjectPtr(PyFrame_GetBack(frame)); auto f_back = (back.get() != nullptr) ? back.get() : frame; @@ -882,17 +855,15 @@ void DIPUPythonTracer::recordPyCall( : queue_->getSubqueue()->emplace_py_call(key, time); } -void DIPUPythonTracer::recordCCall( - ThreadLocalResults& tls, - PyFrameObject* frame, - PyObject* arg) { +void DIPUPythonTracer::recordCCall(ThreadLocalResults &tls, + PyFrameObject *frame, PyObject *arg) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(Py_TYPE(arg) == &PyCFunction_Type); - auto fn = reinterpret_cast(arg); + auto fn = reinterpret_cast(arg); // NB: For C calls a new frame is not created, so we use `frame` rather than // `frame->f_back`. 
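As the constructor's comment above says, frames already on the interpreter stack when profiling begins would otherwise produce returns without matching calls, so the tracer walks the frame back-links (capped at 128) and replays them outermost-first. The traversal pattern on a plain singly linked list:

#include <cstdio>
#include <vector>

struct Frame {
  const char* name;
  Frame* back;  // the caller, like PyFrameObject's f_back
};

int main() {
  Frame c{"c", nullptr};
  Frame b{"b", &c};
  Frame a{"a", &b};  // a was called by b, which was called by c

  std::vector<Frame*> current_stack;
  int depth = 0;  // make sure we can't loop forever on a corrupted chain
  for (Frame* f = &a; f != nullptr && depth < 128; f = f->back, ++depth) {
    current_stack.push_back(f);
  }
  // Replay outermost-first so later events nest under the synthetic calls.
  for (auto it = current_stack.rbegin(); it != current_stack.rend(); ++it) {
    std::printf("synthetic call: %s\n", (*it)->name);  // prints c, b, a
  }
}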
auto key = tls.intern( - arg, (void*)(fn->m_ml), frame); + arg, (void *)(fn->m_ml), frame); queue_->getSubqueue()->emplace_py_call(key, getApproximateTime()); } @@ -900,9 +871,7 @@ void DIPUPythonTracer::recordCCall( // == Post processing ========================================================= // ============================================================================ struct Exit { - bool operator>(const Exit& other) const { - return t_ > other.t_; - } + bool operator>(const Exit &other) const { return t_ > other.t_; } time_t t_; size_t python_tid_; @@ -910,15 +879,13 @@ struct Exit { class PostProcess { public: - PostProcess( - std::function time_converter, - std::deque& tls, - const ValueCache& value_cache, - time_t end_time_ns) + PostProcess(std::function time_converter, + std::deque &tls, + const ValueCache &value_cache, time_t end_time_ns) : end_time_{end_time_ns}, time_converter_{std::move(time_converter)} { for (size_t python_tid : c10::irange(tls.size())) { - CallTypeHelper::map( - tls[python_tid].trace_keys_, *this, value_cache, python_tid); + CallTypeHelper::map(tls[python_tid].trace_keys_, + *this, value_cache, python_tid); addExits(tls[python_tid].exit_times_, python_tid); addExits(tls[python_tid].c_exit_times_, python_tid); @@ -926,23 +893,20 @@ class PostProcess { } void set_start_frames( - const std::vector& start_frames, - std::vector& enters) { - for (const auto& frame : start_frames) { - enters.push_back( - {frame.trace_key_, - NoTID, // Allows us to detect unhandled start frames - {}, - time_converter_(frame.start_time)}); + const std::vector &start_frames, + std::vector &enters) { + for (const auto &frame : start_frames) { + enters.push_back({frame.trace_key_, + NoTID, // Allows us to detect unhandled start frames + {}, + time_converter_(frame.start_time)}); } } template - void operator()( - const TraceKeyCacheState& trace_cache, - const ValueCache& value_cache, - size_t python_tid) { - for (const auto& it : trace_cache.state_) { + void operator()(const TraceKeyCacheState &trace_cache, + const ValueCache &value_cache, size_t python_tid) { + for (const auto &it : trace_cache.state_) { const auto inserted = get_state::event_type>().fields_.insert( {it.second, value_cache.load(it.first, python_tid)}); TORCH_INTERNAL_ASSERT(inserted.second, "Duplicate key: ", it.second); @@ -950,18 +914,17 @@ class PostProcess { } template - void addExits(AppendOnlyList& exits, size_t python_tid) { + void addExits(AppendOnlyList &exits, size_t python_tid) { for (const auto i : exits) { get_state().exits_.push({time_converter_(i), python_tid}); } } std::vector> run( - std::vector& enters) { + std::vector &enters) { std::stable_sort( - enters.begin(), enters.end(), [](const auto a, const auto b) { - return a.enter_t_ < b.enter_t_; - }); + enters.begin(), enters.end(), + [](const auto a, const auto b) { return a.enter_t_ < b.enter_t_; }); std::vector> out; populate(enters, out); populate(enters, out); @@ -970,40 +933,36 @@ class PostProcess { private: template - void populate( - std::vector& enters, - std::vector>& out) { + void populate(std::vector &enters, + std::vector> &out) { using stack_t = std::vector>; const auto initial_size = out.size(); - auto pop = [](stack_t& stack, time_t t) { + auto pop = [](stack_t &stack, time_t t) { TORCH_INTERNAL_ASSERT(stack.size(), "Python replay stack is empty."); c10::get>(stack.back()->extra_fields_).end_time_ns_ = t; stack.pop_back(); }; ska::flat_hash_map stacks; - auto& state = get_state(); - for (const auto& enter : enters) { + auto &state = 
get_state(); + for (const auto &enter : enters) { auto fields_it = state.fields_.find(enter.key_); if (fields_it != state.fields_.end()) { while (!state.exits_.empty() && state.exits_.top().t_ < enter.enter_t_) { - auto& exit = state.exits_.top(); + auto &exit = state.exits_.top(); pop(stacks[exit.python_tid_], exit.t_); state.exits_.pop(); } - out.push_back(Result::create( - enter.enter_t_, - enter.system_tid_, - enter.kineto_info_, - fields_it->second)); + out.push_back(Result::create(enter.enter_t_, enter.system_tid_, + enter.kineto_info_, fields_it->second)); stacks[fields_it->second.python_tid_].push_back(out.back()); } } // Handle events which were still running when profiling ended. - for (auto& i : stacks) { + for (auto &i : stacks) { while (!i.second.empty()) { pop(i.second, end_time_); } @@ -1011,14 +970,13 @@ class PostProcess { // Assign system TIDs to start events based on the system TID of the next // observed event with the same Python TID. - ska::flat_hash_map> - tid_map; + ska::flat_hash_map> tid_map; auto it = out.rbegin(); for (C10_UNUSED auto _ : c10::irange(initial_size, out.size())) { const auto python_tid = c10::get>((*it)->extra_fields_).python_tid_; if ((*it)->start_tid_ == NoTID && SOFT_ASSERT(E == EventType::PyCall)) { - const auto& tid_info = + const auto &tid_info = tid_map.insert({python_tid, {NoTID, DeviceAndResource()}}) .first->second; (*it)->start_tid_ = tid_info.first; @@ -1036,7 +994,7 @@ class PostProcess { }; template - auto& get_state() { + auto &get_state() { return std::get < E == EventType::PyCall ? 0 : 1 > (state_); } @@ -1046,21 +1004,21 @@ class PostProcess { }; struct PythonIDVisitor { - void operator()(ExtraFields& py_call) { + void operator()(ExtraFields &py_call) { py_call.id_ = ++current_python_id_; if (py_call.module_.has_value()) { - auto& m = py_call.module_; - auto& module_ids = module_ids_[m->cls_]; + auto &m = py_call.module_; + auto &module_ids = module_ids_[m->cls_]; m->id_ = module_ids.insert({m->self_, module_ids.size()}).first->second; } } - void operator()(ExtraFields& py_call) { + void operator()(ExtraFields &py_call) { py_call.id_ = ++current_python_id_; } template - void operator()(T&) {} + void operator()(T &) {} size_t current_python_id_{0}; ska::flat_hash_map> @@ -1069,23 +1027,19 @@ struct PythonIDVisitor { std::vector> DIPUPythonTracer::getEvents( std::function time_converter, - std::vector& enters, - time_t end_time_ns) { + std::vector &enters, time_t end_time_ns) { value_cache_.trimPrefixes(); - PostProcess post_process( - std::move(time_converter), - thread_local_results_, - value_cache_, - end_time_ns); + PostProcess post_process(std::move(time_converter), thread_local_results_, + value_cache_, end_time_ns); post_process.set_start_frames(start_frames_, enters); auto out = post_process.run(enters); - std::stable_sort(out.begin(), out.end(), [](const auto& a, const auto& b) { + std::stable_sort(out.begin(), out.end(), [](const auto &a, const auto &b) { return a->start_time_ns_ < b->start_time_ns_; }); PythonIDVisitor id_visitor; - for (auto& i : out) { + for (auto &i : out) { c10::visit(id_visitor, i->extra_fields_); } @@ -1095,13 +1049,10 @@ std::vector> DIPUPythonTracer::getEvents( // ============================================================================ // == API ===================================================================== // ============================================================================ -int DIPUPythonTracer::pyProfileFn( - PyObject* obj, - PyFrameObject* frame, - int what, - PyObject* 
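PostProcess::populate above reconstructs call intervals by replaying time-sorted enter events against a min-heap of exit times: every exit earlier than the next enter closes the innermost open call, and anything still open at the end is closed at end_time. A single-thread sketch of that replay; the real code keeps one stack per Python thread id and relies on proper nesting:

#include <cstdio>
#include <functional>
#include <queue>
#include <vector>

struct Enter { long t; const char* name; };

int main() {
  // Enters arrive sorted by time; exits are replayed through a min-heap.
  std::vector<Enter> enters = {{0, "outer"}, {1, "inner"}};
  std::priority_queue<long, std::vector<long>, std::greater<long>> exits;
  exits.push(2);  // inner returns at t=2
  exits.push(5);  // outer returns at t=5

  std::vector<const char*> stack;
  for (const Enter& e : enters) {
    while (!exits.empty() && exits.top() < e.t) {  // close earlier calls first
      std::printf("%s ends at %ld\n", stack.back(), exits.top());
      stack.pop_back();
      exits.pop();
    }
    stack.push_back(e.name);
  }
  long end_time = 6;  // events still open when profiling stops end here
  while (!stack.empty()) {
    long t = exits.empty() ? end_time : exits.top();
    if (!exits.empty()) exits.pop();
    std::printf("%s ends at %ld\n", stack.back(), t);  // inner at 2, outer at 5
    stack.pop_back();
  }
}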
arg) { - auto& local_results = - *reinterpret_cast(obj)->thread_local_results_; +int DIPUPythonTracer::pyProfileFn(PyObject *obj, PyFrameObject *frame, int what, + PyObject *arg) { + auto &local_results = + *reinterpret_cast(obj)->thread_local_results_; switch (what) { case PyTrace_CALL: local_results.active_tracer_->recordPyCall(local_results, frame, false); @@ -1123,10 +1074,9 @@ int DIPUPythonTracer::pyProfileFn( } return 0; } -} // namespace +} // namespace -std::unique_ptr makeTracer( - DIPURecordQueue* queue) { +std::unique_ptr makeTracer(DIPURecordQueue *queue) { return std::make_unique(queue); } @@ -1135,5 +1085,5 @@ void init() { TORCH_CHECK(PyType_Ready(&TraceContextType) == 0); } -} // namespace profile -} // namespace dipu \ No newline at end of file +} // namespace profile +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler_python.h b/dipu/torch_dipu/csrc_dipu/profiler/profiler_python.h index f8f40fc70..bd11bb98d 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/profiler_python.h +++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler_python.h @@ -8,9 +8,10 @@ namespace dipu { namespace profile { class DIPURecordQueue; -std::unique_ptr makeTracer(DIPURecordQueue* queue); +std::unique_ptr +makeTracer(DIPURecordQueue *queue); void init(); -} // namespace profile -} // namespace dipu \ No newline at end of file +} // namespace profile +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUCopyInplace.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUCopyInplace.cpp index 82106eb97..e9bb372d5 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUCopyInplace.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUCopyInplace.cpp @@ -2,15 +2,17 @@ #include "DIPUCopyInplace.h" #include + #include +#include #include #include -#include namespace dipu { -at::Tensor& DIPUCopyInplace::run(at::Tensor& self, const at::Tensor& src, bool non_blocking) { +at::Tensor &DIPUCopyInplace::run(at::Tensor &self, const at::Tensor &src, + bool non_blocking) { TORCH_CHECK(self.defined(), "self is undefined"); TORCH_CHECK(src.defined(), "src is undefined"); @@ -24,24 +26,22 @@ at::Tensor& DIPUCopyInplace::run(at::Tensor& self, const at::Tensor& src, bool n } // Exit early if self and src are views of the same data - const bool is_same_data = ( - self.is_alias_of(src) && - self.storage_offset() == src.storage_offset() && - self.strides().equals(src.strides()) && - self.sizes().equals(src.sizes()) && - self.scalar_type() == src.scalar_type() - ); + const bool is_same_data = + (self.is_alias_of(src) && self.storage_offset() == src.storage_offset() && + self.strides().equals(src.strides()) && + self.sizes().equals(src.sizes()) && + self.scalar_type() == src.scalar_type()); if (is_same_data) { return self; } auto iter = at::TensorIteratorConfig() - .add_output(self) - .add_input(src) - .resize_outputs(false) - .check_all_same_dtype(false) - .check_all_same_device(false) - .build(); + .add_output(self) + .add_input(src) + .resize_outputs(false) + .check_all_same_dtype(false) + .check_all_same_device(false) + .build(); if (iter.numel() == 0) { return self; } @@ -49,7 +49,8 @@ at::Tensor& DIPUCopyInplace::run(at::Tensor& self, const at::Tensor& src, bool n c10::Device dst_device = iter.device(0); c10::Device src_device = iter.device(1); // 1. 
copy between devices - if (dst_device.type() == DIPU_DEVICE_TYPE && src_device.type() == DIPU_DEVICE_TYPE) { + if (dst_device.type() == DIPU_DEVICE_TYPE && + src_device.type() == DIPU_DEVICE_TYPE) { return copy_between_devices(iter, self, src, non_blocking); } @@ -63,7 +64,10 @@ at::Tensor& DIPUCopyInplace::run(at::Tensor& self, const at::Tensor& src, bool n return copy_uncontiguous(iter, self, src, non_blocking); } -at::Tensor& DIPUCopyInplace::copy_between_devices(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking) { +at::Tensor &DIPUCopyInplace::copy_between_devices(at::TensorIterator &iter, + at::Tensor &self, + const at::Tensor &src, + bool non_blocking) { int64_t numel = iter.numel(); c10::Device dst_device = iter.device(0); c10::Device src_device = iter.device(1); @@ -82,7 +86,8 @@ at::Tensor& DIPUCopyInplace::copy_between_devices(at::TensorIterator& iter, at:: size_t size = numel * iter.element_size(0); dipu::DIPUStream stream = dipu::getCurrentDIPUStream(); - dipu::devproxy::memCopyD2DAsync(stream.rawstream(), size, dst_device.index(), dst_ptr, src_device.index(), src_ptr); + dipu::devproxy::memCopyD2DAsync(stream.rawstream(), size, dst_device.index(), + dst_ptr, src_device.index(), src_ptr); if (!non_blocking) { dipu::devproxy::syncStream(stream.rawstream()); @@ -90,16 +95,21 @@ at::Tensor& DIPUCopyInplace::copy_between_devices(at::TensorIterator& iter, at:: return self; } -at::Tensor& DIPUCopyInplace::copy_contiguous(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking) { +at::Tensor &DIPUCopyInplace::copy_contiguous(at::TensorIterator &iter, + at::Tensor &self, + const at::Tensor &src, + bool non_blocking) { c10::Device dst_device = iter.device(0); c10::Device src_device = iter.device(1); int64_t nbytes = iter.numel() * iter.element_size(0); dipu::DIPUStream stream = dipu::getCurrentDIPUStream(); if (dst_device.type() == DIPU_DEVICE_TYPE && src_device.is_cpu()) { - dipu::devproxy::memCopyH2DAsync(stream.rawstream(), nbytes, iter.data_ptr(0), iter.data_ptr(1)); + dipu::devproxy::memCopyH2DAsync(stream.rawstream(), nbytes, + iter.data_ptr(0), iter.data_ptr(1)); } else if (dst_device.is_cpu() && src_device.type() == DIPU_DEVICE_TYPE) { - dipu::devproxy::memCopyD2HAsync(stream.rawstream(), nbytes, iter.data_ptr(0), iter.data_ptr(1)); + dipu::devproxy::memCopyD2HAsync(stream.rawstream(), nbytes, + iter.data_ptr(0), iter.data_ptr(1)); } else { TORCH_CHECK(false, "unsupported devices in copy_"); } @@ -110,16 +120,24 @@ at::Tensor& DIPUCopyInplace::copy_contiguous(at::TensorIterator& iter, at::Tenso return self; } -at::Tensor& DIPUCopyInplace::copy_uncontiguous(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking) { - auto& dst = iter.tensor(0); +at::Tensor &DIPUCopyInplace::copy_uncontiguous(at::TensorIterator &iter, + at::Tensor &self, + const at::Tensor &src, + bool non_blocking) { + auto &dst = iter.tensor(0); at::Tensor dst_contig; at::Tensor src_contig; if (iter.device_type(0) == DIPU_DEVICE_TYPE || non_blocking) { - dst_contig = dst.is_contiguous() ? dst : at::empty_like(dst, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + dst_contig = dst.is_contiguous() + ? dst + : at::empty_like(dst, LEGACY_CONTIGUOUS_MEMORY_FORMAT); src_contig = iter.tensor(1).to(iter.dtype(0)).expand_as(dst).contiguous(); } else { bool same_type = iter.dtype(0) == iter.dtype(1); - dst_contig = (dst.is_contiguous() && same_type) ? 
dst : at::empty_like(dst, iter.dtype(1), LEGACY_CONTIGUOUS_MEMORY_FORMAT); + dst_contig = (dst.is_contiguous() && same_type) + ? dst + : at::empty_like(dst, iter.dtype(1), + LEGACY_CONTIGUOUS_MEMORY_FORMAT); src_contig = iter.tensor(1).expand_as(dst).contiguous(); } // perform a same-dtype copy on contiguous tensors @@ -138,7 +156,7 @@ at::Tensor& DIPUCopyInplace::copy_uncontiguous(at::TensorIterator& iter, at::Ten static DIPUCopyInplace default_copy_inplace_op; static DIPUCopyInplace *dipu_copy_inplace_op = nullptr; -DIPUCopyInplace* getDipuCopyInplace() { +DIPUCopyInplace *getDipuCopyInplace() { TORCH_CHECK(dipu_copy_inplace_op, "dipu copy inplace not registered"); return dipu_copy_inplace_op; } diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUCopyInplace.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUCopyInplace.h index f8b574e0d..6c80245a6 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUCopyInplace.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUCopyInplace.h @@ -1,34 +1,43 @@ // Copyright (c) 2023, DeepLink. #pragma once -#include #include +#include #include namespace dipu { class DIPUCopyInplace { -public: + public: DIPUCopyInplace() = default; virtual ~DIPUCopyInplace() = default; - virtual at::Tensor& run(at::Tensor& self, const at::Tensor& src, bool non_blocking); + virtual at::Tensor &run(at::Tensor &self, const at::Tensor &src, + bool non_blocking); // copy between devices // 1. dtype & shape & stride all equal, use memCopyD2DAsync - // 2. use DIPUATenFunctions::copy_, proxy device tensor to cpu to handle different dtype/view problem - virtual at::Tensor& copy_between_devices(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking); + // 2. use DIPUATenFunctions::copy_, proxy device tensor to cpu to handle + // different dtype/view problem + virtual at::Tensor ©_between_devices(at::TensorIterator &iter, + at::Tensor &self, + const at::Tensor &src, + bool non_blocking); // copy between cpu and device, dtype & shape & stride all equal // 1. host to device, use memCopyH2DAsync // 2. device to host, use memCopyD2HAsync - virtual at::Tensor& copy_contiguous(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking); + virtual at::Tensor ©_contiguous(at::TensorIterator &iter, + at::Tensor &self, const at::Tensor &src, + bool non_blocking); // copy between cpu and device, different dtype or view - virtual at::Tensor& copy_uncontiguous(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking); + virtual at::Tensor ©_uncontiguous(at::TensorIterator &iter, + at::Tensor &self, const at::Tensor &src, + bool non_blocking); }; -DIPUCopyInplace* getDipuCopyInplace(); +DIPUCopyInplace *getDipuCopyInplace(); void setDipuCopyInplace(DIPUCopyInplace *op); } // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.cpp index 420c9c41e..649e1007c 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.cpp @@ -1,19 +1,19 @@ // Copyright (c) 2023, DeepLink. 
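The comments in DIPUCopyInplace.h above spell out the three-way dispatch that run() performs. Sketched as plain control flow; Tensor and the memCopy*Async functions are stand-ins for the DIPU proxies, and the contiguity test compresses the real dtype/shape/stride equality checks:

#include <cstdio>

struct Tensor {
  bool on_device;
  bool contiguous;
};

void memCopyD2DAsync() { std::puts("device -> device"); }
void memCopyH2DAsync() { std::puts("host -> device"); }
void memCopyD2HAsync() { std::puts("device -> host"); }
void copyViaContiguousStaging() { std::puts("stage through contiguous temporaries"); }

void copy_(const Tensor& dst, const Tensor& src) {
  if (dst.on_device && src.on_device) {
    memCopyD2DAsync();  // 1. copy between devices
  } else if (dst.contiguous && src.contiguous) {
    // 2. contiguous host<->device copy; direction follows the operands
    if (dst.on_device) {
      memCopyH2DAsync();
    } else {
      memCopyD2HAsync();
    }
  } else {
    copyViaContiguousStaging();  // 3. different dtype or non-contiguous view
  }
}

int main() {
  Tensor dev{true, true};
  Tensor host{false, true};
  Tensor strided_host{false, false};
  copy_(dev, host);          // host -> device
  copy_(host, dev);          // device -> host
  copy_(dev, strided_host);  // staged
  copy_(dev, dev);           // device -> device
}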
+#include "./DIPUDeviceInfo.h" + #include #include #include #include - -#include "./DIPUDeviceInfo.h" namespace dipu { // anonymous ns namespace { -using std::shared_ptr; -using dipu::devapis::DIPUDeviceProperties; using c10::DeviceIndex; +using dipu::devapis::DIPUDeviceProperties; +using std::shared_ptr; DeviceIndex num_gpus = -1; c10::once_flag init_flag; @@ -27,8 +27,10 @@ static void initDIPUContextVectors() { } static void initDeviceProperty(DeviceIndex device_index) { - DIPUDeviceProperties device_prop = dipu::devproxy::getDeviceProperties(device_index); - device_properties[device_index] = std::make_shared(device_prop); + DIPUDeviceProperties device_prop = + dipu::devproxy::getDeviceProperties(device_index); + device_properties[device_index] = + std::make_shared(device_prop); } static inline void checkDevice(int32_t device_index) { @@ -39,9 +41,10 @@ static inline void checkDevice(int32_t device_index) { AT_ASSERT(device_index >= 0 && device_index < num_gpus); } -} // end anonymous +} // namespace -shared_ptr getDevicePropertiesFromCache(int32_t device_index) { +shared_ptr getDevicePropertiesFromCache( + int32_t device_index) { checkDevice(device_index); c10::call_once(device_flags[device_index], initDeviceProperty, device_index); return device_properties[device_index]; diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.h index 2e6142538..380f62534 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.h @@ -5,7 +5,9 @@ namespace dipu { using dipu::devapis::DIPUDeviceProperties; using dipu::devapis::DIPUDeviceStatus; -DIPU_API std::shared_ptr getDevicePropertiesFromCache(int32_t device_index); -DIPU_API std::shared_ptr getDeviceStatus(int32_t device_index); +DIPU_API std::shared_ptr getDevicePropertiesFromCache( + int32_t device_index); +DIPU_API std::shared_ptr getDeviceStatus( + int32_t device_index); } // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEvent.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEvent.h index a117882b9..beff5679b 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEvent.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEvent.h @@ -1,20 +1,21 @@ // Copyright (c) 2023, DeepLink. #pragma once -#include "DIPUStream.h" -#include "DIPUGuard.h" -#include #include #include +#include + +#include "DIPUGuard.h" +#include "DIPUStream.h" namespace dipu { /* -* DIPUEvents are movable not copyable wrappers around DIPU's events. -* DIPUEvents are constructed lazily when first recorded. -*/ + * DIPUEvents are movable not copyable wrappers around DIPU's events. + * DIPUEvents are constructed lazily when first recorded. + */ class DIPU_API DIPUEvent { -public: + public: // Constructors // Default value for `flags` is specified below DIPUEvent() {} @@ -30,14 +31,15 @@ class DIPU_API DIPUEvent { DIPUGuard guard(device_index_); devproxy::destroyEvent(event_); } - } catch (...) { /* No throw */ } + } catch (...) 
{ /* No throw */ + } } - DIPUEvent(const DIPUEvent&) = delete; - DIPUEvent& operator=(const DIPUEvent&) = delete; + DIPUEvent(const DIPUEvent &) = delete; + DIPUEvent &operator=(const DIPUEvent &) = delete; - DIPUEvent(DIPUEvent&& other) { moveHelper(std::move(other)); } - DIPUEvent& operator=(DIPUEvent&& other) { + DIPUEvent(DIPUEvent &&other) { moveHelper(std::move(other)); } + DIPUEvent &operator=(DIPUEvent &&other) { moveHelper(std::move(other)); return *this; } @@ -55,7 +57,7 @@ class DIPU_API DIPUEvent { } bool isCreated() const { return event_ != nullptr; } - c10::DeviceIndex device_index() const {return device_index_;} + c10::DeviceIndex device_index() const { return device_index_; } deviceEvent_t rawevent() const { return event_; } bool query() const { @@ -63,7 +65,7 @@ class DIPU_API DIPUEvent { return true; } DIPUGuard guard(device_index_); - auto currStatus = devproxy::getEventStatus(event_); + auto currStatus = devproxy::getEventStatus(event_); if (currStatus == devapis::EventStatus::READY) { return true; } @@ -72,30 +74,32 @@ class DIPU_API DIPUEvent { void record() { record(getCurrentDIPUStream()); } - void recordOnce(const DIPUStream& stream) { + void recordOnce(const DIPUStream &stream) { if (!was_recorded_) record(stream); } - void record(const DIPUStream& stream) { + void record(const DIPUStream &stream) { if (!isCreated()) { createEvent(stream.device_index()); } - TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_, - " does not match recording stream's device ", stream.device_index(), "."); + TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", + device_index_, " does not match recording stream's device ", + stream.device_index(), "."); DIPUGuard guard(device_index_); devproxy::recordEvent(event_, stream); was_recorded_ = true; } - void wait(const DIPUStream& stream) { + void wait(const DIPUStream &stream) { if (isCreated()) { DIPUGuard guard(stream.device_index()); devproxy::streamWaitEvent(stream, event_); } } - float elapsed_time(const DIPUEvent& other) const { - TORCH_CHECK(isCreated() && other.isCreated(), + float elapsed_time(const DIPUEvent &other) const { + TORCH_CHECK( + isCreated() && other.isCreated(), "Both events must be recorded before calculating elapsed time."); float time_ms = 0; devproxy::eventElapsedTime(&time_ms, event_, other.event_); @@ -110,7 +114,7 @@ class DIPU_API DIPUEvent { // dipu do not support IpcEventHandle until now -private: + private: unsigned int flags_ = 0; bool was_recorded_ = false; c10::DeviceIndex device_index_ = -1; @@ -122,7 +126,7 @@ class DIPU_API DIPUEvent { devproxy::createEvent(&event_); } - void moveHelper(DIPUEvent&& other) { + void moveHelper(DIPUEvent &&other) { std::swap(flags_, other.flags_); std::swap(was_recorded_, other.was_recorded_); std::swap(device_index_, other.device_index_); @@ -130,5 +134,4 @@ class DIPU_API DIPUEvent { } }; -} // namespace c10_dipu - +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp index 887d659f4..9a835b9eb 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp @@ -1,111 +1,102 @@ #include "DIPUEventPool.h" + #include -#include #include #include +#include namespace dipu { -template +template class EventPool final { -protected: - std::deque event_pool_; - unsigned int allocate_num_ = 0; - - std::function allocator_; - std::function deleter_; - using mutex_t = 
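DIPUEvent above follows the CUDA event protocol: record one event before and one after the work, then compute elapsed time once both have been recorded. A chrono-based stand-in for that record()/elapsed_time() contract; real events are enqueued on the device stream rather than sampled on the host:

#include <chrono>
#include <cstdio>
#include <thread>

class TimerEvent {
 public:
  void record() {
    t_ = std::chrono::steady_clock::now();
    recorded_ = true;
  }
  float elapsed_time_ms(const TimerEvent& other) const {
    if (!recorded_ || !other.recorded_) {
      return -1.0F;  // the real code TORCH_CHECKs that both were recorded
    }
    return std::chrono::duration<float, std::milli>(other.t_ - t_).count();
  }

 private:
  std::chrono::steady_clock::time_point t_{};
  bool recorded_ = false;
};

int main() {
  TimerEvent start;
  TimerEvent stop;
  start.record();  // DIPUEvent::record(stream) enqueues on the stream instead
  std::this_thread::sleep_for(std::chrono::milliseconds(10));  // the "kernel"
  stop.record();
  std::printf("%.1f ms\n", start.elapsed_time_ms(stop));
}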
std::recursive_mutex; - mutex_t event_mutex_; - -public: - EventPool(const std::function& allocator, - const std::function& deleter) - : allocator_(allocator), deleter_(deleter) { + protected: + std::deque event_pool_; + unsigned int allocate_num_ = 0; + + std::function allocator_; + std::function deleter_; + using mutex_t = std::recursive_mutex; + mutex_t event_mutex_; + + public: + EventPool(const std::function &allocator, + const std::function &deleter) + : allocator_(allocator), deleter_(deleter) {} + + EventPool(const EventPool &) = delete; + EventPool(EventPool &&) = delete; + EventPool &operator=(const EventPool &) = delete; + EventPool &operator=(EventPool &&) = delete; + + ~EventPool() = default; + + void release() { + std::lock_guard _(event_mutex_); + for (auto &event : event_pool_) { + deleter_(event); + allocate_num_--; } - - EventPool(const EventPool&) = delete; - EventPool(EventPool&&) = delete; - EventPool& operator = (const EventPool&) = delete; - EventPool& operator = (EventPool&&) = delete; - - ~EventPool() = default; - - void release() { - std::lock_guard _(event_mutex_); - for (auto& event : event_pool_) { - deleter_(event); - allocate_num_--; - } - event_pool_.clear(); + event_pool_.clear(); + } + + void get(T &event) { + bool need_allocator = false; + { + std::lock_guard _(event_mutex_); + if (event_pool_.empty()) { + need_allocator = true; + } else { + event = event_pool_.back(); + event_pool_.pop_back(); + } } - - void get(T& event) { - bool need_allocator = false; - { - std::lock_guard _(event_mutex_); - if (event_pool_.empty()) { - need_allocator = true; - } else { - event = event_pool_.back(); - event_pool_.pop_back(); - } - } - if (need_allocator) { - allocator_(event); - } + if (need_allocator) { + allocator_(event); } + } - void restore(T& event) { - std::lock_guard _(event_mutex_); - event_pool_.emplace_back(event); - } + void restore(T &event) { + std::lock_guard _(event_mutex_); + event_pool_.emplace_back(event); + } }; - - -EventPool* getEventPool() { - const int index = devproxy::current_device(); - // GlobalEventPool for different cards , construct when really needed - #define dispatch_event_pool(device_id) \ - if (index == device_id) { \ - static EventPool gDIPUEventPool( \ - [](deviceEvent_t& event) { \ - devapis::createEvent(&event); \ - }, [](deviceEvent_t& event) { \ - devapis::destroyEvent(event); \ - }); \ - return &gDIPUEventPool; \ - } - - dispatch_event_pool(0); - dispatch_event_pool(1); - dispatch_event_pool(2); - dispatch_event_pool(3); - dispatch_event_pool(4); - dispatch_event_pool(5); - dispatch_event_pool(6); - dispatch_event_pool(7); - dispatch_event_pool(8); - dispatch_event_pool(9); - dispatch_event_pool(10); - dispatch_event_pool(11); - dispatch_event_pool(12); - dispatch_event_pool(13); - dispatch_event_pool(14); - dispatch_event_pool(15); - TORCH_CHECK(false, "support up to 16 cards"); +EventPool *getEventPool() { + const int index = devproxy::current_device(); +// GlobalEventPool for different cards , construct when really needed +#define dispatch_event_pool(device_id) \ + if (index == device_id) { \ + static EventPool gDIPUEventPool( \ + [](deviceEvent_t &event) { devapis::createEvent(&event); }, \ + [](deviceEvent_t &event) { devapis::destroyEvent(event); }); \ + return &gDIPUEventPool; \ + } + + dispatch_event_pool(0); + dispatch_event_pool(1); + dispatch_event_pool(2); + dispatch_event_pool(3); + dispatch_event_pool(4); + dispatch_event_pool(5); + dispatch_event_pool(6); + dispatch_event_pool(7); + dispatch_event_pool(8); + 
dispatch_event_pool(9); + dispatch_event_pool(10); + dispatch_event_pool(11); + dispatch_event_pool(12); + dispatch_event_pool(13); + dispatch_event_pool(14); + dispatch_event_pool(15); + TORCH_CHECK(false, "support up to 16 cards"); } -void getEventFromPool(deviceEvent_t& event) { - getEventPool()->get(event); -} +void getEventFromPool(deviceEvent_t &event) { getEventPool()->get(event); } -void restoreEventToPool(deviceEvent_t& event) { - getEventPool()->restore(event); +void restoreEventToPool(deviceEvent_t &event) { + getEventPool()->restore(event); } -void releaseAllEvent() { - getEventPool()->release(); -} +void releaseAllEvent() { getEventPool()->release(); } } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h index 73ea38b99..a5f328373 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h @@ -5,11 +5,10 @@ namespace dipu { -void getEventFromPool(deviceEvent_t& event); +void getEventFromPool(deviceEvent_t &event); -void restoreEventToPool(deviceEvent_t& event); +void restoreEventToPool(deviceEvent_t &event); void releaseAllEvent(); } // namespace dipu - diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.cpp index 2067d4daf..d9b66f960 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.cpp @@ -1,8 +1,9 @@ // Copyright (c) 2023, DeepLink. +#include "DIPUGeneratorImpl.h" + #include #include -#include "DIPUGeneratorImpl.h" #include namespace dipu { @@ -20,9 +21,9 @@ static std::deque dipu_gens_init_flag; static std::vector default_gens_dipu; /* -* Populates the global variables related to DIPU generators -* Warning: this function must only be called once! -*/ + * Populates the global variables related to DIPU generators + * Warning: this function must only be called once! + */ static void initDIPUGenerator() { num_dipu = devproxy::getDeviceCount(); dipu_gens_init_flag.resize(num_dipu); @@ -35,7 +36,7 @@ static void initDIPUGenerator() { * maintain a global running state of the pseudo random number generation, * when a user does not explicitly mention any generator. */ -at::Generator& getDefaultDIPUGenerator(at::DeviceIndex device_index) { +at::Generator &getDefaultDIPUGenerator(at::DeviceIndex device_index) { std::call_once(dipu_init_flag, initDIPUGenerator); at::DeviceIndex idx = device_index; @@ -71,8 +72,9 @@ at::Generator createDIPUGenerator(at::DeviceIndex device_index) { * DIPUGeneratorImpl class implementation */ DIPUGeneratorImpl::DIPUGeneratorImpl(at::DeviceIndex device_index) - : c10::GeneratorImpl{at::Device(dipu::DIPU_DEVICE_TYPE, device_index), - at::DispatchKeySet(dipu::DIPU_DISPATCH_KEY)}, state_need_reset_(true) {} + : c10::GeneratorImpl{at::Device(dipu::DIPU_DEVICE_TYPE, device_index), + at::DispatchKeySet(dipu::DIPU_DISPATCH_KEY)}, + state_need_reset_(true) {} /** * Sets the seed to be used by MTGP @@ -87,9 +89,7 @@ void DIPUGeneratorImpl::set_current_seed(uint64_t seed) { /** * Gets the current seed of DIPUGeneratorImpl. 
*/ -uint64_t DIPUGeneratorImpl::current_seed() const { - return seed_; -} +uint64_t DIPUGeneratorImpl::current_seed() const { return seed_; } /** * Gets a nondeterministic random number from /dev/urandom or time, @@ -124,11 +124,11 @@ std::shared_ptr DIPUGeneratorImpl::clone() const { * * See Note [Acquire lock when using random generators] */ -DIPUGeneratorImpl* DIPUGeneratorImpl::clone_impl() const { +DIPUGeneratorImpl *DIPUGeneratorImpl::clone_impl() const { auto gen = new DIPUGeneratorImpl(this->device().index()); gen->set_current_seed(this->seed_); auto state = this->state_; - const auto& state_clone = state.clone(); + const auto &state_clone = state.clone(); gen->set_state(*(state_clone.getIntrusivePtr().get())); gen->set_state_flag(this->state_need_reset_); return gen; @@ -138,7 +138,7 @@ DIPUGeneratorImpl* DIPUGeneratorImpl::clone_impl() const { * get state * * See Note [Acquire lock when using random generators] - */ + */ c10::intrusive_ptr DIPUGeneratorImpl::get_state() const { if (state_need_reset_) { update_state(); @@ -150,10 +150,8 @@ c10::intrusive_ptr DIPUGeneratorImpl::get_state() const { /** * set state flag * See Note [Acquire lock when using random generators] - */ -void DIPUGeneratorImpl::set_state_flag(bool flag) { - state_need_reset_ = flag; -} + */ +void DIPUGeneratorImpl::set_state_flag(bool flag) { state_need_reset_ = flag; } /** * get rng state @@ -161,7 +159,8 @@ void DIPUGeneratorImpl::set_state_flag(bool flag) { **/ at::Tensor get_rng_state(at::DeviceIndex idx) { auto gen = getDefaultDIPUGenerator(idx); - auto gen_impl = at::get_generator_or_default(gen, getDefaultDIPUGenerator()); + auto gen_impl = at::get_generator_or_default( + gen, getDefaultDIPUGenerator()); std::lock_guard lock(gen_impl->mutex_); auto state_ptr = gen_impl->get_state(); auto state = at::Tensor(std::move(state_ptr)); @@ -174,7 +173,8 @@ at::Tensor get_rng_state(at::DeviceIndex idx) { **/ void set_rng_state(at::DeviceIndex idx, at::Tensor state) { auto gen = getDefaultDIPUGenerator(idx); - auto gen_impl = at::get_generator_or_default(gen, getDefaultDIPUGenerator()); + auto gen_impl = at::get_generator_or_default( + gen, getDefaultDIPUGenerator()); std::lock_guard lock(gen_impl->mutex_); gen_impl->set_state(*(state.getIntrusivePtr().get())); } @@ -185,7 +185,8 @@ void set_rng_state(at::DeviceIndex idx, at::Tensor state) { **/ void manual_seed(at::DeviceIndex idx, uint64_t seed) { auto gen = getDefaultDIPUGenerator(idx); - auto gen_impl = at::get_generator_or_default(gen, getDefaultDIPUGenerator()); + auto gen_impl = at::get_generator_or_default( + gen, getDefaultDIPUGenerator()); std::lock_guard lock(gen_impl->mutex_); gen_impl->set_current_seed(seed); } @@ -196,7 +197,8 @@ void manual_seed(at::DeviceIndex idx, uint64_t seed) { **/ void seed(at::DeviceIndex idx) { auto gen = getDefaultDIPUGenerator(idx); - auto gen_impl = at::get_generator_or_default(gen, getDefaultDIPUGenerator()); + auto gen_impl = at::get_generator_or_default( + gen, getDefaultDIPUGenerator()); std::lock_guard lock(gen_impl->mutex_); gen_impl->seed(); } @@ -211,8 +213,6 @@ uint64_t initial_seed(at::DeviceIndex idx) { return seed; } -void releaseAllGenerator() { - default_gens_dipu.clear(); -} +void releaseAllGenerator() { default_gens_dipu.clear(); } } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h index be201ef3e..dc390843a 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h +++ 
b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h @@ -1,14 +1,14 @@ // Copyright (c) 2023, DeepLink. #pragma once +#include +#include #include #include -#include -#include namespace dipu { class DIPUGeneratorImpl : public c10::GeneratorImpl { -public: + public: // Constructors explicit DIPUGeneratorImpl(at::DeviceIndex device_index = -1); ~DIPUGeneratorImpl() = default; @@ -19,21 +19,21 @@ class DIPUGeneratorImpl : public c10::GeneratorImpl { uint64_t seed() override; static at::DeviceType device_type(); c10::intrusive_ptr get_state() const override; - virtual void set_state(const c10::TensorImpl& state) {}; - virtual void set_offset(uint64_t offset) {}; - virtual uint64_t get_offset() const {return 0;}; + virtual void set_state(const c10::TensorImpl &state){}; + virtual void set_offset(uint64_t offset){}; + virtual uint64_t get_offset() const { return 0; }; -protected: + protected: void set_state_flag(bool flag); virtual void update_state() const {} - DIPUGeneratorImpl* clone_impl() const override; + DIPUGeneratorImpl *clone_impl() const override; uint64_t seed_ = c10::default_rng_seed_val; mutable at::Tensor state_; mutable bool state_need_reset_; }; -at::Generator& getDefaultDIPUGenerator(at::DeviceIndex device_index = -1); +at::Generator &getDefaultDIPUGenerator(at::DeviceIndex device_index = -1); at::Generator createDIPUGenerator(at::DeviceIndex device_index = -1); void manual_seed(at::DeviceIndex idx, uint64_t seed); diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGuard.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGuard.h index b90dee40b..96b94cba7 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGuard.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGuard.h @@ -1,24 +1,24 @@ // Copyright (c) 2023, DeepLink. #pragma once +#include + #include #include -#include #include "./guardimpl/DIPUGuardImpl.h" - namespace dipu { using c10::DeviceGuard; -class DIPUGuard: public DeviceGuard { - public: - explicit DIPUGuard() = delete; +class DIPUGuard : public DeviceGuard { + public: + explicit DIPUGuard() = delete; - explicit DIPUGuard(c10::Device device) : DeviceGuard(device) {} + explicit DIPUGuard(c10::Device device) : DeviceGuard(device) {} - explicit DIPUGuard(c10::DeviceIndex device_index) + explicit DIPUGuard(c10::DeviceIndex device_index) : DeviceGuard(c10::Device(dipu::DIPU_DEVICE_TYPE, device_index)) {} }; @@ -27,4 +27,4 @@ using OptionalDIPUGuard = c10::OptionalDeviceGuard; using DIPUStreamGuard = c10::StreamGuard; using OptionalDIPUStreamGuard = c10::OptionalStreamGuard; -} // namespace dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.cpp index cc1efe59d..92563b23c 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.cpp @@ -1,17 +1,18 @@ // Copyright (c) 2023, DeepLink. 
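For context on the DIPUStream.cpp hunks that follow: makeC10StreamId() packs a StreamIdType tag and a pool index into one c10::StreamId, with the low kStreamsPerPoolBits (3) bits holding the index and the bits above holding the tag. The patch only rewraps this code, so the standalone sketch below is illustrative; the decode helpers typeOf() and indexOf() are hypothetical and not part of the patch.

#include <cassert>
#include <cstddef>
#include <cstdint>

enum class StreamIdType : uint8_t { DEFAULT = 0x0, POOL = 0x1 };
constexpr int kStreamsPerPoolBits = 3;              // 2^3 = 8 streams per pool
constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
using StreamId = int64_t;                           // stand-in for c10::StreamId

constexpr StreamId makeC10StreamId(StreamIdType t, size_t idx) {
  return (static_cast<uint32_t>(t) << kStreamsPerPoolBits) |
         static_cast<uint32_t>(idx);
}

// Hypothetical inverses, mirroring how obtainRawStream() interprets an id.
constexpr StreamIdType typeOf(StreamId id) {
  return static_cast<StreamIdType>(id >> kStreamsPerPoolBits);
}
constexpr size_t indexOf(StreamId id) { return id & (kStreamsPerPool - 1); }

int main() {
  static_assert(makeC10StreamId(StreamIdType::DEFAULT, 0) == 0, "default is 0");
  StreamId s = makeC10StreamId(StreamIdType::POOL, 5);  // (1 << 3) | 5 == 13
  assert(typeOf(s) == StreamIdType::POOL && indexOf(s) == 5);
  return 0;
}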
+#include "DIPUStream.h" + #include #include #include #include -#include +#include #include #include -#include +#include #include #include "DIPUGuard.h" -#include "DIPUStream.h" using dipu::devapis::deviceId_t; namespace dipu { @@ -22,7 +23,7 @@ enum class StreamIdType : uint8_t { POOL = 0x1, }; -std::ostream& operator<<(std::ostream& stream, StreamIdType s) { +std::ostream &operator<<(std::ostream &stream, StreamIdType s) { switch (s) { case StreamIdType::DEFAULT: stream << "DEFAULT"; @@ -36,7 +37,7 @@ std::ostream& operator<<(std::ostream& stream, StreamIdType s) { } return stream; } -// follow old pytorch cuda, seems new version use an opposite strategy. +// follow old pytorch cuda, seems new version use an opposite strategy. static constexpr int kStreamsPerPoolBits = 3; static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits; @@ -49,21 +50,21 @@ static std::once_flag global_init_flag; static thread_local std::unique_ptr current_streams = nullptr; static c10::StreamId makeC10StreamId(StreamIdType sType, size_t id) { - return ((uint32_t)static_cast(sType) << kStreamsPerPoolBits) | - static_cast(id); + return ((uint32_t) static_cast(sType) << kStreamsPerPoolBits) | + static_cast(id); } // manage per-device streams struct DIPUStreamDevice { -private: + private: // Default streams std::once_flag pool_flag; std::once_flag default_flag; - deviceId_t devidx_; - // seems pytorch 2.0 giveup default stream and enable cuda per_thread stream feature at compile time. - // it cannot be applied to othe device. + deviceId_t devidx_; + // seems pytorch 2.0 giveup default stream and enable cuda per_thread stream + // feature at compile time. it cannot be applied to othe device. deviceStream_t default_stream = nullptr; - + std::atomic next_pool_pos; std::array pool_streams; @@ -82,7 +83,7 @@ struct DIPUStreamDevice { void _doInitPool() { DIPUGuard device_guard{devidx_}; for (auto i = decltype(kStreamsPerPool){0}; i < kStreamsPerPool; ++i) { - auto& raw_device_stream = pool_streams[i]; + auto &raw_device_stream = pool_streams[i]; devproxy::createStream(&raw_device_stream); } } @@ -94,7 +95,7 @@ struct DIPUStreamDevice { devproxy::setDevice(cur_device); } -public: + public: DIPUStreamDevice(deviceId_t devidx) { devidx_ = devidx; next_pool_pos = 0; @@ -116,25 +117,20 @@ struct DIPUStreamDevice { switch (st) { case StreamIdType::DEFAULT: AT_ASSERTM( - sidx == 0, - "Unrecognized stream ", - stream_id, - " (I think this should be the default stream, but I got a non-zero index ", - sidx, - ").", - " Did you manufacture the StreamId yourself? Don't do that; use the", - " official API like c10::cuda::getStreamFromPool() to get a new stream."); + sidx == 0, "Unrecognized stream ", stream_id, + " (I think this should be the default stream, but I got a non-zero " + "index ", + sidx, ").", + " Did you manufacture the StreamId yourself? 
Don't do that; use " + "the", + " official API like c10::cuda::getStreamFromPool() to get a new " + "stream."); return default_stream; case StreamIdType::POOL: return pool_streams[sidx]; default: - AT_ASSERTM( - 0, - "Unrecognized stream ", - stream_id, - " (I didn't recognize the stream type, ", - st, - ")"); + AT_ASSERTM(0, "Unrecognized stream ", stream_id, + " (I didn't recognize the stream type, ", st, ")"); } } void initPool() { @@ -145,7 +141,8 @@ struct DIPUStreamDevice { } }; -static std::array, C10_COMPILE_TIME_MAX_DIPUS> streamDeviceList; +static std::array, C10_COMPILE_TIME_MAX_DIPUS> + streamDeviceList; static void initGlobalStreamState() { num_dipus = devproxy::getDeviceCount(); @@ -155,11 +152,11 @@ static void initGlobalStreamState() { num_dipus <= C10_COMPILE_TIME_MAX_DIPUS, "Number of DIPU devices on the machine is larger than the compiled " "max number of dipus expected (", - C10_COMPILE_TIME_MAX_DIPUS, - "). Increase that and recompile."); + C10_COMPILE_TIME_MAX_DIPUS, "). Increase that and recompile."); - for (int i=0; i< num_dipus; i++) { - streamDeviceList[i] = std::move(std::unique_ptr(new DIPUStreamDevice(i))); + for (int i = 0; i < num_dipus; i++) { + streamDeviceList[i] = + std::move(std::unique_ptr(new DIPUStreamDevice(i))); } } @@ -188,11 +185,12 @@ static c10::DeviceIndex initDIPUGlobal(c10::DeviceIndex devIdx) { return devIdx; } -} // end anonymous namespace +} // end anonymous namespace // api deviceStream_t DIPUStream::rawstream() const { - return streamDeviceList[this->device_index()]->obtainRawStream(this->unwrap().id()); + return streamDeviceList[this->device_index()]->obtainRawStream( + this->unwrap().id()); } DIPUStream getDIPUStreamFromPool(c10::DeviceIndex devIdx) { @@ -209,11 +207,12 @@ DIPUStream getDefaultDIPUStream(c10::DeviceIndex devIdx) { DIPUStream getCurrentDIPUStream(c10::DeviceIndex devIdx) { devIdx = initDIPUGlobal(devIdx); - return DIPUStream(devIdx, current_streams[devIdx]); + return DIPUStream(devIdx, current_streams[devIdx]); } // copy from pytorch, not verify -DIPUStream getStreamFromExternal(deviceStream_t ext_stream, c10::DeviceIndex device_index) { +DIPUStream getStreamFromExternal(deviceStream_t ext_stream, + c10::DeviceIndex device_index) { // The stream pointer will be the actual id return DIPUStream(device_index, reinterpret_cast(ext_stream)); } @@ -224,8 +223,8 @@ void setCurrentDIPUStream(DIPUStream stream) { current_streams[devIdx] = stream.unwrap().id(); } -std::ostream& operator<<(std::ostream& os, const DIPUStream& stream) { +std::ostream &operator<<(std::ostream &os, const DIPUStream &stream) { return os << stream.unwrap(); } -} // namespace dipu +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.h index 826b166ea..714f86622 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.h @@ -7,8 +7,8 @@ #include #include #include -#include #include +#include #include #include @@ -16,7 +16,7 @@ namespace dipu { class DIPU_API DIPUStream { -public: + public: enum Unchecked { UNCHECKED }; explicit DIPUStream(c10::Stream stream) : stream_(stream) { TORCH_CHECK(stream_.device_type() == dipu::DIPU_DEVICE_TYPE); @@ -25,34 +25,28 @@ class DIPU_API DIPUStream { explicit DIPUStream(Unchecked, c10::Stream stream) : stream_(stream) {} explicit DIPUStream(devapis::deviceId_t devidx, c10::StreamId stream_id) - : DIPUStream(Unchecked::UNCHECKED, c10::Stream(c10::Stream::UNSAFE, - 
c10::Device(dipu::DIPU_DEVICE_TYPE, devidx), stream_id)) { + : DIPUStream(Unchecked::UNCHECKED, + c10::Stream(c10::Stream::UNSAFE, + c10::Device(dipu::DIPU_DEVICE_TYPE, devidx), + stream_id)) {} - } + ~DIPUStream() {} - ~DIPUStream(){} - - bool operator==(const DIPUStream& other) const noexcept { + bool operator==(const DIPUStream &other) const noexcept { return unwrap() == other.unwrap(); } - bool operator!=(const DIPUStream& other) const noexcept { + bool operator!=(const DIPUStream &other) const noexcept { return unwrap() != other.unwrap(); } /// Implicit conversion to pytorch Stream. - operator c10::Stream() const { - return unwrap(); - } + operator c10::Stream() const { return unwrap(); } - operator deviceStream_t() const { - return rawstream(); - } + operator deviceStream_t() const { return rawstream(); } /// Get the device index that this stream is associated with. - c10::DeviceIndex device_index() const { - return stream_.device_index(); - } + c10::DeviceIndex device_index() const { return stream_.device_index(); } /// Get the full Device that this stream is associated with. The Device /// is guaranteed to be a device. @@ -60,9 +54,7 @@ class DIPU_API DIPUStream { return c10::Device(dipu::DIPU_DEVICE_TYPE, device_index()); } - c10::StreamId id() const { - return stream_.id(); - } + c10::StreamId id() const { return stream_.id(); } void synchronize() const { c10::DeviceGuard guard{stream_.device()}; @@ -78,21 +70,18 @@ class DIPU_API DIPUStream { deviceStream_t rawstream() const; /// Explicit conversion to Stream. - c10::Stream unwrap() const { - return stream_; - } + c10::Stream unwrap() const { return stream_; } - c10::StreamData3 pack3() const noexcept { - return stream_.pack3(); - } + c10::StreamData3 pack3() const noexcept { return stream_.pack3(); } - static DIPUStream unpack3(c10::StreamId stream_id, c10::DeviceIndex device_index, - c10::DeviceType device_type) { + static DIPUStream unpack3(c10::StreamId stream_id, + c10::DeviceIndex device_index, + c10::DeviceType device_type) { TORCH_CHECK(device_type == dipu::DIPU_DEVICE_TYPE); return DIPUStream(device_index, stream_id); } -private: + private: c10::Stream stream_; }; @@ -104,10 +93,11 @@ DIPU_API DIPUStream getCurrentDIPUStream(c10::DeviceIndex device_index = -1); DIPU_API void setCurrentDIPUStream(DIPUStream stream); -DIPU_API DIPUStream getStreamFromExternal(deviceStream_t ext_stream, c10::DeviceIndex device_index); +DIPU_API DIPUStream getStreamFromExternal(deviceStream_t ext_stream, + c10::DeviceIndex device_index); -std::ostream& operator<<(std::ostream& stream, const DIPUStream& s); -} // namespace dipu +std::ostream &operator<<(std::ostream &stream, const DIPUStream &s); +} // namespace dipu namespace std { template <> @@ -116,4 +106,4 @@ struct hash { return std::hash{}(s.unwrap()); } }; -} // namespace std +} // namespace std diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.cpp index 9c4e7fe56..2f599b5c0 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.cpp @@ -6,8 +6,8 @@ #include #include -#include #include +#include namespace dipu { @@ -21,15 +21,17 @@ MemChecker::~MemChecker() { if (!blocks_.empty()) { std::cout << "dipu memory checker: there maybe exist memory leak. " - << blocks_.size() << " blocks not released." << std::endl; + << blocks_.size() << " blocks not released." 
<< std::endl; for (const auto &kv : blocks_) { - std::cout << "key: " << kv.first << ", ptr: " << kv.second.first << ", trace: " << kv.second.second << std::endl; + std::cout << "key: " << kv.first << ", ptr: " << kv.second.first + << ", trace: " << kv.second.second << std::endl; } } - std::cout << "dipu memory checker: going to destruction. " << current_state() << std::endl; + std::cout << "dipu memory checker: going to destruction. " << current_state() + << std::endl; } -MemChecker& MemChecker::instance() { +MemChecker &MemChecker::instance() { static MemChecker checker; return checker; } @@ -40,13 +42,14 @@ bool MemChecker::enable() { } bool MemChecker::enable_backtrace() { - static bool enable_trace = (std::getenv("DIPU_MEM_CHECK_ENABLE_BACKTRACE") != nullptr); + static bool enable_trace = + (std::getenv("DIPU_MEM_CHECK_ENABLE_BACKTRACE") != nullptr); return enable_trace; } int32_t MemChecker::max_block_num() { static int32_t max_block = []() -> int32_t { - const char* str = std::getenv("DIPU_MEM_CHECK_MAX_BLOCK"); + const char *str = std::getenv("DIPU_MEM_CHECK_MAX_BLOCK"); if (str == nullptr) { return DEFAULT_MAX_BLOCK_NUM; } @@ -58,7 +61,7 @@ int32_t MemChecker::max_block_num() { int32_t MemChecker::log_interval() { static int32_t interval = []() -> int32_t { - const char* str = std::getenv("DIPU_MEM_CHECK_LOG_INTERVAL"); + const char *str = std::getenv("DIPU_MEM_CHECK_LOG_INTERVAL"); if (str == nullptr) { return DEFAULT_LOG_INTERVAL; } @@ -70,15 +73,15 @@ int32_t MemChecker::log_interval() { std::string MemChecker::current_state() const { std::stringstream stream; - stream << "current block num = " << blocks_.size() - << ", total_size = " << (total_size_ >> 20) << "MB" - << ", insert count = " << insert_cnt_ - << ", max block num = " << max_block_num() - << ", log interval = " << log_interval(); + stream << "current block num = " << blocks_.size() + << ", total_size = " << (total_size_ >> 20) << "MB" + << ", insert count = " << insert_cnt_ + << ", max block num = " << max_block_num() + << ", log interval = " << log_interval(); return stream.str(); } -void MemChecker::insert(const void* ptr, size_t size) { +void MemChecker::insert(const void *ptr, size_t size) { if (!enable() || ptr == nullptr) { return; } @@ -88,7 +91,8 @@ void MemChecker::insert(const void* ptr, size_t size) { std::string state; { std::lock_guard lck(mtx_); - blocks_[ptr] = std::make_pair(size, enable_backtrace() ? c10::get_backtrace() : ""); + blocks_[ptr] = + std::make_pair(size, enable_backtrace() ? c10::get_backtrace() : ""); total_size_ += static_cast(size); ++insert_cnt_; @@ -104,13 +108,14 @@ void MemChecker::insert(const void* ptr, size_t size) { } if (may_leak) { - std::cout << "dipu memory checker: there may be memory leak. " << state << std::endl; + std::cout << "dipu memory checker: there may be memory leak. 
" << state + << std::endl; } else if (print_log) { std::cout << "dipu memory checker: " << state << std::endl; } } -void MemChecker::erase(const void* ptr) { +void MemChecker::erase(const void *ptr) { if (!enable() || ptr == nullptr) { return; } @@ -128,7 +133,9 @@ void MemChecker::erase(const void* ptr) { } if (!found) { - std::cout << "dipu memory checker: not found point address going to free, ptr = " << ptr << std::endl; + std::cout + << "dipu memory checker: not found point address going to free, ptr = " + << ptr << std::endl; } } @@ -137,7 +144,7 @@ void MemChecker::check(const at::Tensor &input) { check(ptr); } -void MemChecker::check(const void* ptr) { +void MemChecker::check(const void *ptr) { if (!enable()) { return; } @@ -149,7 +156,9 @@ void MemChecker::check(const void* ptr) { } if (!found) { - std::cout << "dipu memory checker: not found point address when check, ptr = " << ptr << std::endl; + std::cout + << "dipu memory checker: not found point address when check, ptr = " + << ptr << std::endl; } } diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.h b/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.h index 0a1f99921..e4a6ca221 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.h @@ -1,9 +1,8 @@ // Copyright (c) 2023, DeepLink. #pragma once -#include - #include +#include #include #include #include @@ -13,25 +12,25 @@ namespace dipu { class MemChecker final { -public: - static MemChecker& instance(); + public: + static MemChecker &instance(); static bool enable(); static bool enable_backtrace(); static int32_t max_block_num(); static int32_t log_interval(); - void insert(const void* ptr, size_t size); - void erase(const void* ptr); - void check(const at::Tensor& input); - void check(const void* ptr); + void insert(const void *ptr, size_t size); + void erase(const void *ptr); + void check(const at::Tensor &input); + void check(const void *ptr); ~MemChecker(); -private: + private: std::string current_state() const; -private: + private: std::mutex mtx_; - std::unordered_map> blocks_; + std::unordered_map> blocks_; int64_t total_size_ = 0; int64_t insert_cnt_ = 0; }; diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUAsyncResourcePool.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUAsyncResourcePool.h index d950feb9f..67a6e4427 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUAsyncResourcePool.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUAsyncResourcePool.h @@ -1,61 +1,63 @@ // Copyright (c) 2023, DeepLink. 
#pragma once -#include #include +#include #include -#include -#include "DIPUSpinMutex.h" + #include "../DIPUEvent.h" +#include "DIPUSpinMutex.h" namespace dipu { -template +template class AsyncResourcePool { -public: - virtual void add(const T& t, std::deque& events) = 0; - virtual T get() = 0; - virtual bool ready() = 0; - virtual size_t size() = 0; + public: + virtual void add(const T &t, std::deque &events) = 0; + virtual T get() = 0; + virtual bool ready() = 0; + virtual size_t size() = 0; }; -template -class AsyncResourcePoolImpl: public AsyncResourcePool{ - using Res = std::tuple>; - std::deque list_; - using mutex_t = std::mutex; - mutex_t mutex_; - public: - void add(const T& t, std::deque& events) override { - std::lock_guard lk(mutex_); - list_.emplace_back(t, std::move(events)); +template +class AsyncResourcePoolImpl : public AsyncResourcePool { + using Res = std::tuple>; + std::deque list_; + using mutex_t = std::mutex; + mutex_t mutex_; + + public: + void add(const T &t, std::deque &events) override { + std::lock_guard lk(mutex_); + list_.emplace_back(t, std::move(events)); + } + + T get() override { + std::lock_guard lk(mutex_); + T t = std::get<0>(list_.front()); + list_.pop_front(); + return t; + } + + bool ready() override { + std::lock_guard lk(mutex_); + if (list_.empty()) { + return false; } - T get() override { - std::lock_guard lk(mutex_); - T t = std::get<0>(list_.front()); - list_.pop_front(); - return t; - } - - bool ready() override { - std::lock_guard lk(mutex_); - if (list_.empty()) { + for (auto iter = std::get<1>(list_.front()).begin(); + iter != std::get<1>(list_.front()).end(); iter++) { + if (iter->query() == false) { return false; } - - for (auto iter = std::get<1>(list_.front()).begin(); iter != std::get<1>(list_.front()).end(); iter++) { - if (iter->query() == false) { - return false; - } - } - return true; } + return true; + } - size_t size() override { - std::lock_guard lk(mutex_); - return list_.size(); - } + size_t size() override { + std::lock_guard lk(mutex_); + return list_.size(); + } }; -} // namespace dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp index 5793a05de..23f77d7bf 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp @@ -1,500 +1,511 @@ // Copyright (c) 2023, DeepLink. 
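Before the large BFCachingAllocator hunk below, a note on the AsyncResourcePool reformatted above: freed blocks are queued together with the DIPUEvents recorded on their streams, and ready() returns true only once every event on the front entry has completed, so memory is never handed back while a stream may still be using it. A simplified, self-contained analogue of that pattern, where Event is a hypothetical stand-in for dipu::DIPUEvent:

#include <deque>
#include <mutex>
#include <tuple>
#include <utility>

// Stand-in for dipu::DIPUEvent: query() reports completion without blocking.
struct Event {
  bool done = false;
  bool query() const { return done; }
};

template <class T>
class AsyncPool {
  std::deque<std::tuple<T, std::deque<Event>>> list_;
  mutable std::mutex mutex_;

 public:
  void add(const T& t, std::deque<Event>& events) {
    std::lock_guard<std::mutex> lk(mutex_);
    list_.emplace_back(t, std::move(events));
  }

  // Only the front entry is inspected, as in the original: the queue is
  // drained strictly in FIFO order.
  bool ready() const {
    std::lock_guard<std::mutex> lk(mutex_);
    if (list_.empty()) return false;
    for (const auto& e : std::get<1>(list_.front()))
      if (!e.query()) return false;
    return true;
  }

  T get() {
    std::lock_guard<std::mutex> lk(mutex_);
    T t = std::get<0>(list_.front());
    list_.pop_front();
    return t;
  }
};

int main() {
  AsyncPool<int> pool;
  std::deque<Event> events;
  events.push_back({true});  // event already completed
  pool.add(42, events);
  return pool.ready() ? (pool.get(), 0) : 1;
}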
-#include "DIPUCachingAllocator.h" -#include "DIPUSpinMutex.h" +#include +#include #include -#include #include #include -#include -#include +#include + +#include "DIPUCachingAllocator.h" +#include "DIPUSpinMutex.h" namespace dipu { -class BFCachingAllocatorImpl{ -public: - using allocate_fn_t = std::function; - using deallocate_fn_t = std::function; -private: - allocate_fn_t allocate_fn; - deallocate_fn_t deallocate_fn; - // Number of first level bins (exponentially) - static constexpr int kNumBigBins = 32; - // Number of second level bins (linearly) - static constexpr int kNumSubBins = 4; - static constexpr int kLogNumSubBins = 2; - // Allocation parameters - static constexpr size_t kMinAllocationSize = 512; - static constexpr size_t kMaxInternalFragmentation = 8u << 20u; // 8MB - static constexpr size_t kMinExtendSize = 8u << 20u; // 8MB - static constexpr size_t kMaxExtendSize = 1u << 30u; // 1GB - - - size_t cachedBytes = 0; - size_t allocatedBytes = 0; - - void* allocateOnDevice(size_t nbytes) { - void* ptr = nullptr; - try { - ptr = allocate_fn(nbytes); - cachedBytes += nbytes; - } catch(...) { +class BFCachingAllocatorImpl { + public: + using allocate_fn_t = std::function; + using deallocate_fn_t = std::function; + + private: + allocate_fn_t allocate_fn; + deallocate_fn_t deallocate_fn; + // Number of first level bins (exponentially) + static constexpr int kNumBigBins = 32; + // Number of second level bins (linearly) + static constexpr int kNumSubBins = 4; + static constexpr int kLogNumSubBins = 2; + // Allocation parameters + static constexpr size_t kMinAllocationSize = 512; + static constexpr size_t kMaxInternalFragmentation = 8u << 20u; // 8MB + static constexpr size_t kMinExtendSize = 8u << 20u; // 8MB + static constexpr size_t kMaxExtendSize = 1u << 30u; // 1GB + + size_t cachedBytes = 0; + size_t allocatedBytes = 0; + + void *allocateOnDevice(size_t nbytes) { + void *ptr = nullptr; + try { + ptr = allocate_fn(nbytes); + cachedBytes += nbytes; + } catch (...) 
{ + } + + DIPU_DEBUG_ALLOCATOR(4, "BFCachingAllocatorImpl: allocateOnDevice " + << nbytes << " nbytes, ptr:" << ptr); + return ptr; + } - } + void releaseOnDevice(void *ptr, size_t nbytes) { + DIPU_DEBUG_ALLOCATOR(4, "BFCachingAllocatorImpl: releaseOnDevice " + << nbytes << " nbytes, ptr:" << ptr); + deallocate_fn(ptr); + cachedBytes -= nbytes; + } - DIPU_DEBUG_ALLOCATOR(4, "BFCachingAllocatorImpl: allocateOnDevice " << nbytes << " nbytes, ptr:" << ptr); - return ptr; + // Chunks and bins obtained by a single stream + struct StreamSet { + size_t id; + // Compress whether bins have chunks + // into 128 bits (`kNumBigBins` * `kNumSubBins`) + __uint128_t bits = 0; + // Virtual chunks which are the heads of the bins + int binHeads_[kNumBigBins * kNumSubBins]{0}; + // The extending size next time + size_t currExtendSize_ = kMinExtendSize; + + explicit StreamSet(size_t id) : id(id) {} + + // Find an available bin greater than or equal to `least` + int find(int least) const { + // For the case that `least` >= 128, + // the code below can also handle, we don't have to judge + // Use `mask` to set the bits (< `least`) to 0 + __uint128_t mask = 1; + (mask <<= least) -= 1; + __uint128_t map = (bits | mask) ^ mask; + // Find the index of the first "1" + // `__builtin_ctzll` only support `uint64_t`, + // so we have to divide + uint64_t low_bits = map, high_bits = map >> 64u; + if (low_bits) { + return __builtin_ctzll(low_bits); + } + if (high_bits) { + return 64 + __builtin_ctzll(high_bits); + } + return -1; } - void releaseOnDevice (void* ptr, size_t nbytes) { - DIPU_DEBUG_ALLOCATOR(4, "BFCachingAllocatorImpl: releaseOnDevice " << nbytes << " nbytes, ptr:" << ptr); - deallocate_fn(ptr); - cachedBytes -= nbytes; + // Set `idx` bit into 1 + void set(unsigned idx) { + __uint128_t mask = 1; + mask <<= idx; + bits |= mask; } - // Chunks and bins obtained by a single stream - struct StreamSet { - size_t id; - // Compress whether bins have chunks - // into 128 bits (`kNumBigBins` * `kNumSubBins`) - __uint128_t bits = 0; - // Virtual chunks which are the heads of the bins - int binHeads_[kNumBigBins * kNumSubBins] {0}; - // The extending size next time - size_t currExtendSize_ = kMinExtendSize; - - explicit StreamSet(size_t id): id(id) {} - - // Find an available bin greater than or equal to `least` - int find(int least) const { - // For the case that `least` >= 128, - // the code below can also handle, we don't have to judge - // Use `mask` to set the bits (< `least`) to 0 - __uint128_t mask = 1; - (mask <<= least) -= 1; - __uint128_t map = (bits | mask) ^ mask; - // Find the index of the first "1" - // `__builtin_ctzll` only support `uint64_t`, - // so we have to divide - uint64_t low_bits = map, high_bits = map >> 64u; - if (low_bits) { - return __builtin_ctzll(low_bits); - } - if (high_bits) { - return 64 + __builtin_ctzll(high_bits); - } - return -1; - } - - // Set `idx` bit into 1 - void set(unsigned idx) { - __uint128_t mask = 1; - mask <<= idx; - bits |= mask; - } + // Set `idx` bit into 0 + void remove(unsigned idx) { + __uint128_t mask = 1; + mask <<= idx; + bits &= ~mask; + } + }; - // Set `idx` bit into 0 - void remove(unsigned idx) { - __uint128_t mask = 1; - mask <<= idx; - bits &= ~mask; - } - }; + struct Chunk { + bool allocated = false; + int binId = -1; + int prevChunkInMem = 0, nextChunkInMem = 0; + int prevChunkInList = 0, nextChunkInList = 0; - struct Chunk { - bool allocated = false; - int binId = -1; - int prevChunkInMem = 0, nextChunkInMem = 0; - int prevChunkInList = 0, 
nextChunkInList = 0; + void *ptr; + size_t size; + // The stream id when created + size_t stream; - void* ptr; - size_t size; - // The stream id when created - size_t stream; + Chunk(void *ptr, size_t size, size_t stream) + : ptr(ptr), size(size), stream(stream) {} - Chunk(void* ptr, size_t size, size_t stream): - ptr(ptr), size(size), stream(stream) {} + bool isMonoBlock() const { return !prevChunkInMem && !nextChunkInMem; } + }; - bool isMonoBlock() const { - return !prevChunkInMem && !nextChunkInMem; - } - }; + std::vector chunks_; + // Use id recycling for better performance + std::stack recycleIds_; - std::vector chunks_; - // Use id recycling for better performance - std::stack recycleIds_; + typedef std::unique_ptr StreamSetHandle; + std::vector streamSets_; - typedef std::unique_ptr StreamSetHandle; - std::vector streamSets_; + using mutex_t = SpinMutex; + mutable mutex_t mut_; - using mutex_t = SpinMutex; - mutable mutex_t mut_; + static size_t roundBytes(size_t nbytes) { + return ((nbytes - 1) | (kMinAllocationSize - 1)) + 1; + } - static size_t roundBytes(size_t nbytes) { - return ((nbytes - 1) | (kMinAllocationSize - 1)) + 1; + int newChunk(void *ptr, size_t size, size_t stream) { + int id; + if (!recycleIds_.empty()) { + id = recycleIds_.top(); + recycleIds_.pop(); + chunks_[id] = Chunk(ptr, size, stream); + } else { + id = chunks_.size(); + chunks_.emplace_back(Chunk(ptr, size, stream)); } - - int newChunk(void* ptr, size_t size, size_t stream) { - int id; - if (!recycleIds_.empty()) { - id = recycleIds_.top(); - recycleIds_.pop(); - chunks_[id] = Chunk(ptr, size, stream); - } else { - id = chunks_.size(); - chunks_.emplace_back(Chunk(ptr, size, stream)); - } - if (!ptr) { - chunks_[id].allocated = true; - } - return id; + if (!ptr) { + chunks_[id].allocated = true; } + return id; + } - static int binIdForSize(size_t nbytes) { - // Big bin range: - // [2^`bigBinIdx`, 2^(`bigBinIdx`+1)), length: 2^`bigBinIdx` - // Split big bin into `kNumSubBins` sub bins - size_t nBlocks = nbytes / kMinAllocationSize; - int bigBinIdx = 63 - __builtin_clzll(nBlocks); - // If `nbytes` is so large, we just put it into the last - if (bigBinIdx > kNumBigBins - 1) - return kNumBigBins * kNumSubBins - 1; - // Get the index of sub bin - int subBinIdx = nBlocks ^ (1ull << bigBinIdx); - subBinIdx >>= std::max(bigBinIdx - kLogNumSubBins, 0); - return bigBinIdx * kNumSubBins + subBinIdx; - } + static int binIdForSize(size_t nbytes) { + // Big bin range: + // [2^`bigBinIdx`, 2^(`bigBinIdx`+1)), length: 2^`bigBinIdx` + // Split big bin into `kNumSubBins` sub bins + size_t nBlocks = nbytes / kMinAllocationSize; + int bigBinIdx = 63 - __builtin_clzll(nBlocks); + // If `nbytes` is so large, we just put it into the last + if (bigBinIdx > kNumBigBins - 1) return kNumBigBins * kNumSubBins - 1; + // Get the index of sub bin + int subBinIdx = nBlocks ^ (1ull << bigBinIdx); + subBinIdx >>= std::max(bigBinIdx - kLogNumSubBins, 0); + return bigBinIdx * kNumSubBins + subBinIdx; + } + void linkChunkInList(int a, int b, int c) { + chunks_[a].nextChunkInList = b; + chunks_[b].prevChunkInList = a; + chunks_[b].nextChunkInList = c; + chunks_[c].prevChunkInList = b; + } - void linkChunkInList(int a, int b, int c) { - chunks_[a].nextChunkInList = b; - chunks_[b].prevChunkInList = a; - chunks_[b].nextChunkInList = c; - chunks_[c].prevChunkInList = b; - } + void linkChunkInMem(int a, int b, int c) { + chunks_[a].nextChunkInMem = b; + chunks_[b].prevChunkInMem = a; + chunks_[b].nextChunkInMem = c; + chunks_[c].prevChunkInMem = 
b; + } - void linkChunkInMem(int a, int b, int c) { - chunks_[a].nextChunkInMem = b; - chunks_[b].prevChunkInMem = a; - chunks_[b].nextChunkInMem = c; - chunks_[c].prevChunkInMem = b; - } + void removeChunkInList(int a, int c) { + // Remove b + chunks_[a].nextChunkInList = c; + chunks_[c].prevChunkInList = a; + } - void removeChunkInList(int a, int c) { - // Remove b - chunks_[a].nextChunkInList = c; - chunks_[c].prevChunkInList = a; - } + void removeChunkInMem(int a, int c) { + // Remove b + chunks_[a].nextChunkInMem = c; + chunks_[c].prevChunkInMem = a; + } - void removeChunkInMem(int a, int c) { - // Remove b - chunks_[a].nextChunkInMem = c; - chunks_[c].prevChunkInMem = a; - } + void insertChunkIntoBin(int id) { + int binId = (chunks_[id].binId = binIdForSize(chunks_[id].size)); + auto &set = streamSets_[chunks_[id].stream]; + set->set(binId); + linkChunkInList(set->binHeads_[binId], id, + chunks_[set->binHeads_[binId]].nextChunkInList); + } - void insertChunkIntoBin(int id) { - int binId = (chunks_[id].binId = binIdForSize(chunks_[id].size)); - auto &set = streamSets_[chunks_[id].stream]; - set->set(binId); - linkChunkInList(set->binHeads_[binId], id, - chunks_[set->binHeads_[binId]].nextChunkInList); + void removeChunkFromBin(int id) { + int binId = chunks_[id].binId; + auto &set = streamSets_[chunks_[id].stream]; + removeChunkInList(chunks_[id].prevChunkInList, chunks_[id].nextChunkInList); + if (!chunks_[set->binHeads_[binId]].nextChunkInList) { + set->remove(binId); } + } - void removeChunkFromBin(int id) { - int binId = chunks_[id].binId; - auto &set = streamSets_[chunks_[id].stream]; - removeChunkInList(chunks_[id].prevChunkInList, - chunks_[id].nextChunkInList); - if (!chunks_[set->binHeads_[binId]].nextChunkInList) { - set->remove(binId); - } + int findChunk(size_t nbytes, StreamSetHandle &set) { + // Check whether the first chunk in `least` bin satisfies + int least = binIdForSize(nbytes); + int id = chunks_[set->binHeads_[least]].nextChunkInList; + if (id) { + id = chunks_[id].size >= nbytes ? id : 0; } - int findChunk(size_t nbytes, StreamSetHandle &set) { - // Check whether the first chunk in `least` bin satisfies - int least = binIdForSize(nbytes); - int id = chunks_[set->binHeads_[least]].nextChunkInList; - if (id) { - id = chunks_[id].size >= nbytes ? id : 0; - } - - // If not, check the next available bin - if (!id) { - int binId = set->find(least + 1); - id = (binId == -1) ? - 0 : chunks_[set->binHeads_[binId]].nextChunkInList; - } + // If not, check the next available bin + if (!id) { + int binId = set->find(least + 1); + id = (binId == -1) ? 
0 : chunks_[set->binHeads_[binId]].nextChunkInList; + } - if (id) { - removeChunkFromBin(id); - } - return id; + if (id) { + removeChunkFromBin(id); } + return id; + } - void shrink(StreamSetHandle &set) { - for (int binHead : set->binHeads_) { - int k = chunks_[binHead].nextChunkInList; - while (k) { - if (chunks_[k].isMonoBlock()) { - releaseOnDevice(chunks_[k].ptr, chunks_[k].size); - removeChunkFromBin(k); - recycleIds_.push(k); - } - k = chunks_[k].nextChunkInList; - } + void shrink(StreamSetHandle &set) { + for (int binHead : set->binHeads_) { + int k = chunks_[binHead].nextChunkInList; + while (k) { + if (chunks_[k].isMonoBlock()) { + releaseOnDevice(chunks_[k].ptr, chunks_[k].size); + removeChunkFromBin(k); + recycleIds_.push(k); } + k = chunks_[k].nextChunkInList; + } } + } - int split(int id, size_t nbytes) { - void* ptr = static_cast(chunks_[id].ptr) + nbytes; - size_t const size = chunks_[id].size - nbytes; - - chunks_[id].size = nbytes; - - int newId = newChunk(ptr, size, chunks_[id].stream); - linkChunkInMem(id, newId, chunks_[id].nextChunkInMem); - insertChunkIntoBin(newId); + int split(int id, size_t nbytes) { + void *ptr = static_cast(chunks_[id].ptr) + nbytes; + size_t const size = chunks_[id].size - nbytes; - return id; - } + chunks_[id].size = nbytes; - int merge(int c1, int c2) { - chunks_[c1].size += chunks_[c2].size; - removeChunkInMem(c1, chunks_[c2].nextChunkInMem); - return c1; - } + int newId = newChunk(ptr, size, chunks_[id].stream); + linkChunkInMem(id, newId, chunks_[id].nextChunkInMem); + insertChunkIntoBin(newId); - int coalesce(int id) { - int next = chunks_[id].nextChunkInMem; - if (next && !chunks_[next].allocated) { - removeChunkFromBin(next); - id = merge(id, next); - recycleIds_.push(next); - } + return id; + } - int prev = chunks_[id].prevChunkInMem; - if (prev && !chunks_[prev].allocated) { - removeChunkFromBin(prev); - int oldId = id; - id = merge(prev, id); - recycleIds_.push(oldId); - } + int merge(int c1, int c2) { + chunks_[c1].size += chunks_[c2].size; + removeChunkInMem(c1, chunks_[c2].nextChunkInMem); + return c1; + } - return id; + int coalesce(int id) { + int next = chunks_[id].nextChunkInMem; + if (next && !chunks_[next].allocated) { + removeChunkFromBin(next); + id = merge(id, next); + recycleIds_.push(next); } - int extend(size_t nbytes, StreamSetHandle &set) { - emptyCacheWithoutLock(); - auto& extSize = set->currExtendSize_; - bool increased = false; - while (extSize < nbytes && extSize < kMaxExtendSize) { - extSize *= 2; - increased = true; - } + int prev = chunks_[id].prevChunkInMem; + if (prev && !chunks_[prev].allocated) { + removeChunkFromBin(prev); + int oldId = id; + id = merge(prev, id); + recycleIds_.push(oldId); + } - size_t currBytes = std::max(nbytes, extSize); - void* ptr = allocateOnDevice(currBytes); - if (ptr) { - if (!increased && extSize < kMaxExtendSize) { - extSize *= 2; - } - } else { - if (currBytes > nbytes) { - currBytes = nbytes; - ptr = allocateOnDevice(currBytes); - } - } - if (!ptr) { - return 0; - } + return id; + } - int id = newChunk(ptr, currBytes, set->id); - return id; + int extend(size_t nbytes, StreamSetHandle &set) { + emptyCacheWithoutLock(); + auto &extSize = set->currExtendSize_; + bool increased = false; + while (extSize < nbytes && extSize < kMaxExtendSize) { + extSize *= 2; + increased = true; } - StreamSetHandle& checkStream(size_t stream) { - if (stream >= streamSets_.size()) { - streamSets_.resize(stream + 1); - } - if (streamSets_[stream] == nullptr) { - streamSets_[stream] = 
std::make_unique(stream); - for (int &binHead : streamSets_[stream]->binHeads_) { - binHead = newChunk(nullptr, 0, 0); - } - } - return streamSets_[stream]; + size_t currBytes = std::max(nbytes, extSize); + void *ptr = allocateOnDevice(currBytes); + if (ptr) { + if (!increased && extSize < kMaxExtendSize) { + extSize *= 2; + } + } else { + if (currBytes > nbytes) { + currBytes = nbytes; + ptr = allocateOnDevice(currBytes); + } } - - void emptyCacheWithoutLock() { - for (auto &set : streamSets_) { - if (set != nullptr) { - shrink(set); - } - } + if (!ptr) { + return 0; } -public: - BFCachingAllocatorImpl() { - // Avoid zero index later - newChunk(nullptr, 0, 0); - } + int id = newChunk(ptr, currBytes, set->id); + return id; + } - ~BFCachingAllocatorImpl() { - emptyCache(); + StreamSetHandle &checkStream(size_t stream) { + if (stream >= streamSets_.size()) { + streamSets_.resize(stream + 1); } - - void emptyCache() { - std::lock_guard lk(mut_); - emptyCacheWithoutLock(); + if (streamSets_[stream] == nullptr) { + streamSets_[stream] = std::make_unique(stream); + for (int &binHead : streamSets_[stream]->binHeads_) { + binHead = newChunk(nullptr, 0, 0); + } } + return streamSets_[stream]; + } - std::tuple allocateRaw(size_t size) { - if (!size) { - return std::make_tuple(nullptr, 0, 0); - } + void emptyCacheWithoutLock() { + for (auto &set : streamSets_) { + if (set != nullptr) { + shrink(set); + } + } + } - size_t nbytes = roundBytes(size); + public: + BFCachingAllocatorImpl() { + // Avoid zero index later + newChunk(nullptr, 0, 0); + } - allocatedBytes += nbytes; + ~BFCachingAllocatorImpl() { emptyCache(); } - std::lock_guard lk(mut_); - auto &set = checkStream(0); - int id = findChunk(nbytes, set); - if (!id) { - id = extend(nbytes, set); - } + void emptyCache() { + std::lock_guard lk(mut_); + emptyCacheWithoutLock(); + } - if (id) { - if (chunks_[id].size >= nbytes * 2 || - chunks_[id].size >= nbytes + kMaxInternalFragmentation) { - id = split(id, nbytes); - } - chunks_[id].allocated = true; - return std::make_tuple(chunks_[id].ptr, id, nbytes); - } - return std::make_tuple(nullptr, 0, 0);; + std::tuple allocateRaw(size_t size) { + if (!size) { + return std::make_tuple(nullptr, 0, 0); } - void releaseRaw(void* ptr, int id) { - if (!ptr) { - return; - } + size_t nbytes = roundBytes(size); + + allocatedBytes += nbytes; - std::lock_guard lk(mut_); - chunks_[id].allocated = false; - allocatedBytes -= chunks_[id].size; - id = coalesce(id); - insertChunkIntoBin(id); + std::lock_guard lk(mut_); + auto &set = checkStream(0); + int id = findChunk(nbytes, set); + if (!id) { + id = extend(nbytes, set); } - void set_mem_allocate_fn(allocate_fn_t allocate_fn, deallocate_fn_t deallocate_fn) { - DIPU_DEBUG_ALLOCATOR(4, "BFCachingAllocator: set_mem_allocate_fn "); - this->allocate_fn = allocate_fn; - this->deallocate_fn = deallocate_fn; + if (id) { + if (chunks_[id].size >= nbytes * 2 || + chunks_[id].size >= nbytes + kMaxInternalFragmentation) { + id = split(id, nbytes); + } + chunks_[id].allocated = true; + return std::make_tuple(chunks_[id].ptr, id, nbytes); } + return std::make_tuple(nullptr, 0, 0); + ; + } - size_t memory_reserved() { - return cachedBytes; + void releaseRaw(void *ptr, int id) { + if (!ptr) { + return; } + + std::lock_guard lk(mut_); + chunks_[id].allocated = false; + allocatedBytes -= chunks_[id].size; + id = coalesce(id); + insertChunkIntoBin(id); + } + + void set_mem_allocate_fn(allocate_fn_t allocate_fn, + deallocate_fn_t deallocate_fn) { + DIPU_DEBUG_ALLOCATOR(4, 
"BFCachingAllocator: set_mem_allocate_fn "); + this->allocate_fn = allocate_fn; + this->deallocate_fn = deallocate_fn; + } + + size_t memory_reserved() { return cachedBytes; } }; -static void deleteBFContext(void* ptr); +static void deleteBFContext(void *ptr); -class BFCachingAllocator: public CacheAllocator { - mutable std::unique_ptr impl; - using mutex_t = std::mutex; - mutable mutex_t resource_pool_mutex_; -private: - void restore() const{ +class BFCachingAllocator : public CacheAllocator { + mutable std::unique_ptr impl; + using mutex_t = std::mutex; + mutable mutex_t resource_pool_mutex_; + + private: + void restore() const { std::lock_guard lk(resource_pool_mutex_); while (async_mem_pool()->ready()) { - const auto block = async_mem_pool()->get(); - void* ptr = std::get<0>(block); - int id = std::get<1>(block); - DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: " << __FUNCTION__ << " ,ptr:" << ptr << " ,id:" << id << " ,allocator:" << this << ", device:" << device()); - impl->releaseRaw(ptr, id); + const auto block = async_mem_pool()->get(); + void *ptr = std::get<0>(block); + int id = std::get<1>(block); + DIPU_DEBUG_ALLOCATOR( + 8, "BFCachingAllocator: " << __FUNCTION__ << " ,ptr:" << ptr + << " ,id:" << id << " ,allocator:" << this + << ", device:" << device()); + impl->releaseRaw(ptr, id); } set_memory_reserved(impl->memory_reserved()); } - void empty_resource_pool() const { std::lock_guard lk(resource_pool_mutex_); while (async_mem_pool()->size() > 0) { - if (!async_mem_pool()->ready()) { - std::this_thread::yield(); - continue; - } - const auto block = async_mem_pool()->get(); - void* ptr = std::get<0>(block); - int id = std::get<1>(block); - DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: " << __FUNCTION__ << " ,ptr:" << ptr << " ,id:" << id << " ,allocator:" << this << ", device:" << device()); - impl->releaseRaw(ptr, id); + if (!async_mem_pool()->ready()) { + std::this_thread::yield(); + continue; + } + const auto block = async_mem_pool()->get(); + void *ptr = std::get<0>(block); + int id = std::get<1>(block); + DIPU_DEBUG_ALLOCATOR( + 8, "BFCachingAllocator: " << __FUNCTION__ << " ,ptr:" << ptr + << " ,id:" << id << " ,allocator:" << this + << ", device:" << device()); + impl->releaseRaw(ptr, id); } } - void check_impl() const{ + void check_impl() const { if (impl) { - return; + return; } impl.reset(new BFCachingAllocatorImpl()); - std::function alloc_fn = std::bind(&BFCachingAllocator::allocate_raw, (BFCachingAllocator*)this, std::placeholders::_1); - std::function dealloc_fn = std::bind(&BFCachingAllocator::free_raw, (BFCachingAllocator*)this, std::placeholders::_1); + std::function alloc_fn = + std::bind(&BFCachingAllocator::allocate_raw, (BFCachingAllocator *)this, + std::placeholders::_1); + std::function dealloc_fn = + std::bind(&BFCachingAllocator::free_raw, (BFCachingAllocator *)this, + std::placeholders::_1); impl->set_mem_allocate_fn(alloc_fn, dealloc_fn); } - void* makeContext(void* ptr, size_t size, size_t nbytes, int id) const{ - auto ctx = new Context(ptr, size, nbytes, id, this); - return ctx; + void *makeContext(void *ptr, size_t size, size_t nbytes, int id) const { + auto ctx = new Context(ptr, size, nbytes, id, this); + return ctx; } -public: - struct Context: public DataPtrContextBase { + public: + struct Context : public DataPtrContextBase { int id_ = 0; size_t nbytes_ = 0; - Context(void* ptr, size_t size, size_t nbytes, int id, const BFCachingAllocator* allocator):DataPtrContextBase(allocator, ptr, size), id_(id), nbytes_(nbytes){ - - } + Context(void *ptr, 
size_t size, size_t nbytes, int id, + const BFCachingAllocator *allocator) + : DataPtrContextBase(allocator, ptr, size), id_(id), nbytes_(nbytes) {} ~Context() { - auto allocator_ = static_cast(allocator()); - DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: add to async_mem_pool:" << ptr() << ", " << size() << " nbytes, id:"<< id_ <<", allocator:" << allocator_ << ", device:" << allocator_->device()); + auto allocator_ = static_cast(allocator()); + DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: add to async_mem_pool:" + << ptr() << ", " << size() << " nbytes, id:" + << id_ << ", allocator:" << allocator_ + << ", device:" << allocator_->device()); if (allocator_->impl) { if (ptr()) { - std::deque events; - for (auto iter = streams().begin(); iter != streams().end(); iter++) { - events.emplace_back(); - DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: record to stream:" << iter->rawstream() ); - events.back().record(*iter); - } - allocator_->async_mem_pool()->add(std::make_tuple(ptr(), id_), events); - allocator_->set_memory_allocated(allocator_->memory_allocated() - nbytes_); + std::deque events; + for (auto iter = streams().begin(); iter != streams().end(); iter++) { + events.emplace_back(); + DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: record to stream:" + << iter->rawstream()); + events.back().record(*iter); + } + allocator_->async_mem_pool()->add(std::make_tuple(ptr(), id_), + events); + allocator_->set_memory_allocated(allocator_->memory_allocated() - + nbytes_); } allocator_->restore(); } else { - DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator:~Context: destory tensor when allocator has been destoryed"); + DIPU_DEBUG_ALLOCATOR(8, + "BFCachingAllocator:~Context: destory tensor " + "when allocator has been destoryed"); } } }; friend class Context; - c10::DataPtr allocate(size_t size) const override { restore(); - std::tuple block = impl->allocateRaw(size); - void* ptr = std::get<0>(block); + std::tuple block = impl->allocateRaw(size); + void *ptr = std::get<0>(block); if (ptr == nullptr && size > 0) { - empty_resource_pool(); + empty_resource_pool(); + block = impl->allocateRaw(size); + ptr = std::get<0>(block); + if (ptr == nullptr && size > 0) { + empty_cache(); block = impl->allocateRaw(size); ptr = std::get<0>(block); - if (ptr == nullptr && size > 0) { - empty_cache(); - block = impl->allocateRaw(size); - ptr = std::get<0>(block); - TORCH_CHECK(ptr != nullptr, "no memory available") - } + TORCH_CHECK(ptr != nullptr, "no memory available") + } } int id = std::get<1>(block); @@ -503,14 +514,22 @@ class BFCachingAllocator: public CacheAllocator { set_memory_allocated(memory_allocated() + nbytes); set_memory_reserved(impl->memory_reserved()); - c10::DataPtr data_ptr(ptr, makeContext(ptr, size, nbytes, id), deleteBFContext, device()); - DIPU_DEBUG_ALLOCATOR(4, "BFCachingAllocator: malloc " << nbytes << ",requires " << size << " nbytes, ptr:" << ptr << ",device:" << device()); - c10::reportMemoryUsageToProfiler(ptr, static_cast(nbytes), memory_allocated(), memory_reserved(), c10::Device(c10::DeviceType::CUDA, device().index())); + c10::DataPtr data_ptr(ptr, makeContext(ptr, size, nbytes, id), + deleteBFContext, device()); + DIPU_DEBUG_ALLOCATOR(4, "BFCachingAllocator: malloc " + << nbytes << ",requires " << size + << " nbytes, ptr:" << ptr + << ",device:" << device()); + c10::reportMemoryUsageToProfiler( + ptr, static_cast(nbytes), memory_allocated(), + memory_reserved(), + c10::Device(c10::DeviceType::CUDA, device().index())); return data_ptr; } void empty_cache() const override { - 
DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: empty_cache, allocator:" << this << ", device:" << device()); + DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: empty_cache, allocator:" + << this << ", device:" << device()); empty_resource_pool(); impl->emptyCache(); set_memory_reserved(impl->memory_reserved()); @@ -518,33 +537,31 @@ class BFCachingAllocator: public CacheAllocator { void release_all_memory() const override { if (!impl) { - return; + return; } - DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: release_all_memory, allocator:" << this << ", device:" << device()); + DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: release_all_memory, allocator:" + << this << ", device:" << device()); empty_cache(); } - BFCachingAllocator() { - check_impl(); - } + BFCachingAllocator() { check_impl(); } ~BFCachingAllocator() { DIPU_DEBUG_ALLOCATOR(8, "~BFCachingAllocator allocator:" << this); release_all_memory(); } - }; -static void deleteBFContext(void* ptr) { - auto ctx = static_cast(ptr); - c10::reportMemoryUsageToProfiler(ctx->ptr(), -static_cast(ctx->nbytes_), ctx->allocator()->memory_allocated(), - ctx->allocator()->memory_reserved(), c10::Device(c10::DeviceType::CUDA, ctx->allocator()->device().index())); +static void deleteBFContext(void *ptr) { + auto ctx = static_cast(ptr); + c10::reportMemoryUsageToProfiler( + ctx->ptr(), -static_cast(ctx->nbytes_), + ctx->allocator()->memory_allocated(), ctx->allocator()->memory_reserved(), + c10::Device(c10::DeviceType::CUDA, ctx->allocator()->device().index())); delete ctx; } DIPU_REGISTER_ALLOCATOR(BF, dipu::DIPU_DEVICE_TYPE, BFCachingAllocator, 0); DIPU_REGISTER_ALLOCATOR(BF, at::DeviceType::CPU, BFCachingAllocator, 0); - - } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBSCachingAllocator.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBSCachingAllocator.cpp index 35364e52f..1eac90c0e 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBSCachingAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBSCachingAllocator.cpp @@ -1,42 +1,41 @@ // Copyright (c) 2023, DeepLink. 
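The BFCachingAllocatorImpl reformatted above is a best-fit allocator whose free chunks live in 32 "big bins", one per power of two of the 512-byte block count, each split linearly into 4 sub-bins chosen by the two bits after the leading one; a 128-bit occupancy bitmap then lets find() locate the first non-empty bin with a couple of count-trailing-zeros operations. The patch does not change this logic, so here is a standalone sketch with a worked example; it copies the constants and the two pure helpers from the hunk above (GCC/Clang builtins, as in the source):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdio>

constexpr int kNumBigBins = 32;
constexpr int kNumSubBins = 4;
constexpr int kLogNumSubBins = 2;
constexpr size_t kMinAllocationSize = 512;

// Round a request up to the next multiple of the 512-byte block size.
size_t roundBytes(size_t nbytes) {
  return ((nbytes - 1) | (kMinAllocationSize - 1)) + 1;
}

// Map a (rounded) size to its bin: big bin = floor(log2(block count)),
// sub-bin = the next kLogNumSubBins bits after the leading one.
int binIdForSize(size_t nbytes) {
  size_t nBlocks = nbytes / kMinAllocationSize;
  int bigBinIdx = 63 - __builtin_clzll(nBlocks);
  if (bigBinIdx > kNumBigBins - 1) return kNumBigBins * kNumSubBins - 1;
  int subBinIdx = nBlocks ^ (1ull << bigBinIdx);
  subBinIdx >>= std::max(bigBinIdx - kLogNumSubBins, 0);
  return bigBinIdx * kNumSubBins + subBinIdx;
}

int main() {
  // A 5000-byte request rounds up to 10 blocks of 512 bytes (5120 bytes).
  assert(roundBytes(5000) == 5120);
  // 10 blocks = 0b1010: big bin 3 (8..15 blocks); the two bits after the
  // leading one are 0b01, so sub-bin 1 and bin id 3 * 4 + 1 = 13.
  assert(binIdForSize(5120) == 13);
  std::printf("bin id for 5120 bytes: %d\n", binIdForSize(5120));
  return 0;
}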
-#include "DIPUCachingAllocator.h" -#include +#include +#include #include +#include #include +#include #include -#include -#include -#include + +#include "DIPUCachingAllocator.h" namespace dipu { -static void deleteBSContext(void*); +static void deleteBSContext(void *); -class BSCachingAllocator: public CacheAllocator { +class BSCachingAllocator : public CacheAllocator { struct Impl { - std::unordered_map> idel_blocks_; - std::set allocated_; + std::unordered_map> idel_blocks_; + std::set allocated_; size_t total_alocated_bytes_ = 0; size_t total_idel_bytes_ = 0; }; mutable std::unique_ptr impl; using mutex_t = std::recursive_mutex; mutable mutex_t mutex_; -public: - BSCachingAllocator() { - impl.reset(new Impl()); - } - ~BSCachingAllocator() { - release_all_memory(); - } + public: + BSCachingAllocator() { impl.reset(new Impl()); } - // Better adaptability to memory blocks of various sizes, but internal fragmentation will be larger - size_t getAllocateSizeMoreAdaptable(size_t nbytes) const{ - static const int kMinAllocationSizeExp = [](){ + ~BSCachingAllocator() { release_all_memory(); } + + // Better adaptability to memory blocks of various sizes, but internal + // fragmentation will be larger + size_t getAllocateSizeMoreAdaptable(size_t nbytes) const { + static const int kMinAllocationSizeExp = []() { size_t size = 511; - const char* env = std::getenv("DIPU_BS_ALLOCATOR_MIN_ALLOCATE_SIZE"); + const char *env = std::getenv("DIPU_BS_ALLOCATOR_MIN_ALLOCATE_SIZE"); if (env != nullptr) { size = std::atoi(env); } @@ -48,11 +47,12 @@ class BSCachingAllocator: public CacheAllocator { return allocateSize; } - // The internal fragments are smaller, but are less adaptable to scenes with frequent and drastic changes in size. - size_t getAllocateSizeLessFragmentation(size_t nbytes) const{ - static const size_t kMinAllocationSize = [](){ + // The internal fragments are smaller, but are less adaptable to scenes with + // frequent and drastic changes in size. + size_t getAllocateSizeLessFragmentation(size_t nbytes) const { + static const size_t kMinAllocationSize = []() { size_t size = 512; - const char* env = std::getenv("DIPU_BS_ALLOCATOR_MIN_ALLOCATE_SIZE"); + const char *env = std::getenv("DIPU_BS_ALLOCATOR_MIN_ALLOCATE_SIZE"); if (env != nullptr) { size = std::atoi(env); } @@ -62,18 +62,23 @@ class BSCachingAllocator: public CacheAllocator { return allocateSize; } - size_t getAllocateSize(size_t nbytes) const{ - static bool less_fragmentation = std::getenv("DIPU_BS_MORE_ADAPTABLE") == nullptr; - return less_fragmentation ? getAllocateSizeLessFragmentation(nbytes) : getAllocateSizeMoreAdaptable(nbytes); + size_t getAllocateSize(size_t nbytes) const { + static bool less_fragmentation = + std::getenv("DIPU_BS_MORE_ADAPTABLE") == nullptr; + return less_fragmentation ? 
getAllocateSizeLessFragmentation(nbytes) + : getAllocateSizeMoreAdaptable(nbytes); } - c10::DataPtr allocate(size_t size) const override{ - DIPU_DEBUG_ALLOCATOR(8, "BSCachingAllocator::allocate " << size << ",allocator:" << this <<", memory-usage" << memory_allocated() << "/" << memory_reserved()); + c10::DataPtr allocate(size_t size) const override { + DIPU_DEBUG_ALLOCATOR(8, "BSCachingAllocator::allocate " + << size << ",allocator:" << this + << ", memory-usage" << memory_allocated() << "/" + << memory_reserved()); std::lock_guard lk(mutex_); flush_mem_pool(); size_t nbytes = getAllocateSize(size); - void* ptr = nullptr; - auto& idel_blocks = impl->idel_blocks_[nbytes]; + void *ptr = nullptr; + auto &idel_blocks = impl->idel_blocks_[nbytes]; if (idel_blocks.size() <= 0) { empty_resource_pool(); } @@ -82,7 +87,10 @@ class BSCachingAllocator: public CacheAllocator { ptr = idel_blocks.front(); idel_blocks.pop_front(); impl->total_idel_bytes_ -= nbytes; - DIPU_DEBUG_ALLOCATOR(4, "BSCachingAllocator::reuse " << nbytes << ", requires:" << size << " bytes, ptr:" << ptr << ",allocator:" << this); + DIPU_DEBUG_ALLOCATOR(4, "BSCachingAllocator::reuse " + << nbytes << ", requires:" << size + << " bytes, ptr:" << ptr + << ",allocator:" << this); break; } else { try { @@ -93,11 +101,13 @@ class BSCachingAllocator: public CacheAllocator { set_memory_reserved(memory_reserved() + nbytes); impl->allocated_.insert(ptr); - impl->total_alocated_bytes_+= nbytes; - DIPU_DEBUG_ALLOCATOR(4, "BSCachingAllocator::allocate " << nbytes << ", requires:" << size << " bytes, ptr:" << ptr << ",allocator:" << this); + impl->total_alocated_bytes_ += nbytes; + DIPU_DEBUG_ALLOCATOR(4, "BSCachingAllocator::allocate " + << nbytes << ", requires:" << size + << " bytes, ptr:" << ptr + << ",allocator:" << this); break; - } - catch(...) { + } catch (...) 
{ if (i == 0) { empty_cache(); } else { @@ -107,22 +117,29 @@ class BSCachingAllocator: public CacheAllocator { } } set_memory_allocated(memory_allocated() + nbytes); - c10::DataPtr data_ptr(ptr, makeContext(ptr, size, nbytes), deleteBSContext, device()); - c10::reportMemoryUsageToProfiler(ptr, static_cast(nbytes), memory_allocated(), memory_reserved(), c10::Device(c10::DeviceType::CUDA, device().index())); + c10::DataPtr data_ptr(ptr, makeContext(ptr, size, nbytes), deleteBSContext, + device()); + c10::reportMemoryUsageToProfiler( + ptr, static_cast(nbytes), memory_allocated(), + memory_reserved(), + c10::Device(c10::DeviceType::CUDA, device().index())); return data_ptr; } - void restore(size_t size, void* ptr) const{ + void restore(size_t size, void *ptr) const { size_t nbytes = getAllocateSize(size); std::lock_guard lk(mutex_); - DIPU_DEBUG_ALLOCATOR(8, "BSCachingAllocator::restore " << nbytes << " bytes, ptr:" << ptr << ",allocator:" << this); + DIPU_DEBUG_ALLOCATOR(8, "BSCachingAllocator::restore " + << nbytes << " bytes, ptr:" << ptr + << ",allocator:" << this); impl->idel_blocks_[nbytes].push_back(ptr); impl->total_idel_bytes_ += nbytes; } void empty_resource_pool() const { - DIPU_DEBUG_ALLOCATOR(8, "BSCachingAllocator::empty_resource_pool ,allocator:" << this); - while(async_mem_pool()->size() > 0) { + DIPU_DEBUG_ALLOCATOR( + 8, "BSCachingAllocator::empty_resource_pool ,allocator:" << this); + while (async_mem_pool()->size() > 0) { if (async_mem_pool()->ready()) { flush_mem_pool(); } else { @@ -132,14 +149,16 @@ class BSCachingAllocator: public CacheAllocator { } void empty_cache() const override { - DIPU_DEBUG_ALLOCATOR(8, "BSCachingAllocator::empty_cache ,allocator:" << this); + DIPU_DEBUG_ALLOCATOR(8, + "BSCachingAllocator::empty_cache ,allocator:" << this); empty_resource_pool(); std::lock_guard lk(mutex_); - for(auto iter = impl->idel_blocks_.begin(); iter != impl->idel_blocks_.end(); ++iter) { - auto& idel_blocks = iter->second; + for (auto iter = impl->idel_blocks_.begin(); + iter != impl->idel_blocks_.end(); ++iter) { + auto &idel_blocks = iter->second; const size_t size = iter->first; while (!idel_blocks.empty()) { - void* ptr = idel_blocks.front(); + void *ptr = idel_blocks.front(); idel_blocks.pop_front(); impl->total_alocated_bytes_ -= size; set_memory_reserved(memory_reserved() - size); @@ -150,26 +169,30 @@ class BSCachingAllocator: public CacheAllocator { } void release_all_memory() const { - DIPU_DEBUG_ALLOCATOR(8, "BSCachingAllocator::release_all_memory allocator:" << this); + DIPU_DEBUG_ALLOCATOR( + 8, "BSCachingAllocator::release_all_memory allocator:" << this); empty_cache(); } void flush_mem_pool() const { - DIPU_DEBUG_ALLOCATOR(8, "BSCachingAllocator::flush_mem_pool allocator:" << this); + DIPU_DEBUG_ALLOCATOR( + 8, "BSCachingAllocator::flush_mem_pool allocator:" << this); while (async_mem_pool()->ready()) { - auto mem = async_mem_pool()->get(); - restore(std::get<1>(mem), std::get<0>(mem)); + auto mem = async_mem_pool()->get(); + restore(std::get<1>(mem), std::get<0>(mem)); } } - struct Context: public DataPtrContextBase { - Context(void* ptr, size_t size, size_t real_size, const BSCachingAllocator* allocator):DataPtrContextBase(allocator, ptr, size), real_size_(real_size) { - - } + struct Context : public DataPtrContextBase { + Context(void *ptr, size_t size, size_t real_size, + const BSCachingAllocator *allocator) + : DataPtrContextBase(allocator, ptr, size), real_size_(real_size) {} ~Context() { - auto allocator_ = static_cast(allocator()); - 
DIPU_DEBUG_ALLOCATOR(8, __FUNCTION__ << " allocator:" << allocator_ << ", ptr:" << ptr() << ", size_:" << size()); + auto allocator_ = static_cast(allocator()); + DIPU_DEBUG_ALLOCATOR(8, __FUNCTION__ << " allocator:" << allocator_ + << ", ptr:" << ptr() + << ", size_:" << size()); if (allocator_->impl) { std::deque events; for (auto iter = streams().begin(); iter != streams().end(); iter++) { @@ -177,31 +200,32 @@ class BSCachingAllocator: public CacheAllocator { events.back().record(*iter); } - allocator_->async_mem_pool()->add(std::make_tuple(ptr(), size()), events); - allocator_->set_memory_allocated(allocator_->memory_allocated() - real_size_); + allocator_->async_mem_pool()->add(std::make_tuple(ptr(), size()), + events); + allocator_->set_memory_allocated(allocator_->memory_allocated() - + real_size_); allocator_->flush_mem_pool(); } } size_t real_size_ = 0; }; - - void* makeContext(void* ptr, size_t size, size_t real_size) const{ + void *makeContext(void *ptr, size_t size, size_t real_size) const { auto ctx = new Context(ptr, size, real_size, this); return ctx; } - }; -static void deleteBSContext(void* ptr) { - auto ctx = static_cast(ptr); - c10::reportMemoryUsageToProfiler(ctx->ptr(), -static_cast(ctx->real_size_), ctx->allocator()->memory_allocated(), - ctx->allocator()->memory_reserved(), c10::Device(c10::DeviceType::CUDA, ctx->allocator()->device().index())); +static void deleteBSContext(void *ptr) { + auto ctx = static_cast(ptr); + c10::reportMemoryUsageToProfiler( + ctx->ptr(), -static_cast(ctx->real_size_), + ctx->allocator()->memory_allocated(), ctx->allocator()->memory_reserved(), + c10::Device(c10::DeviceType::CUDA, ctx->allocator()->device().index())); delete ctx; } - DIPU_REGISTER_ALLOCATOR(BS, dipu::DIPU_DEVICE_TYPE, BSCachingAllocator, 0); DIPU_REGISTER_ALLOCATOR(BS, at::DeviceType::CPU, BSCachingAllocator, 0); -} // namespace dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.cpp index c2c9dedb9..9516e34dc 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.cpp @@ -1,9 +1,10 @@ // Copyright (c) 2023, DeepLink. #include "DIPUCachingAllocator.h" + #include -#include #include +#include namespace dipu { @@ -11,179 +12,200 @@ std::mutex DIPURawDeviceAllocator::mutex_; namespace { -//using RegisteredAllocator = std::map>>; -//using RegisteredAllocator = std::map, uint8_t>>>; +// using RegisteredAllocator = std::map>>; using RegisteredAllocator = +// std::map, uint8_t>>>; -using RegisteredAllocator = std::map, uint8_t>>>; +using RegisteredAllocator = std::map< + c10::DeviceType, + std::map, uint8_t>>>; static std::unique_ptr gDIPURegisterdAllocatorPtr; static std::mutex dipu_register_allocator_mutex; -static std::set used_allocator; +static std::set used_allocator; } // namespace -constexpr const char* dipu_default_memcaching_algorithm = "BF"; +constexpr const char *dipu_default_memcaching_algorithm = "BF"; std::string dipu_device_memcaching_algorithm = []() { - const char* env = std::getenv("DIPU_DEVICE_MEMCACHING_ALGORITHM"); + const char *env = std::getenv("DIPU_DEVICE_MEMCACHING_ALGORITHM"); return env ? 
env : dipu_default_memcaching_algorithm; }(); std::string dipu_host_memcaching_algorithm = []() { - const char* env = std::getenv("DIPU_HOST_MEMCACHING_ALGORITHM"); + const char *env = std::getenv("DIPU_HOST_MEMCACHING_ALGORITHM"); return env ? env : dipu_default_memcaching_algorithm; }(); -void setAllocator(const std::string name, c10::DeviceType device_type, std::function allocator_geter, uint8_t priority) { +void setAllocator(const std::string name, c10::DeviceType device_type, + std::function allocator_geter, + uint8_t priority) { std::lock_guard lock(dipu_register_allocator_mutex); if (!gDIPURegisterdAllocatorPtr) { gDIPURegisterdAllocatorPtr = std::make_unique(); } - auto& gDIPURegisterdAllocator = *gDIPURegisterdAllocatorPtr; + auto &gDIPURegisterdAllocator = *gDIPURegisterdAllocatorPtr; if (gDIPURegisterdAllocator[device_type].count(name) <= 0) { - gDIPURegisterdAllocator[device_type][name] = std::make_tuple(allocator_geter, priority); + gDIPURegisterdAllocator[device_type][name] = + std::make_tuple(allocator_geter, priority); } else { if (std::get<1>(gDIPURegisterdAllocator[device_type][name]) < priority) { - gDIPURegisterdAllocator[device_type][name] = std::make_tuple(allocator_geter, priority); + gDIPURegisterdAllocator[device_type][name] = + std::make_tuple(allocator_geter, priority); } else { - TORCH_CHECK(false, "A higher priority allocator is already registered for the same device:", device_type, name, priority); + TORCH_CHECK(false, + "A higher priority allocator is already registered for the " + "same device:", + device_type, name, priority); } } } -c10::Allocator* getAllocator(const c10::Device& device) { +c10::Allocator *getAllocator(const c10::Device &device) { c10::DeviceType device_type = device.type(); - c10::Allocator* result = nullptr; - auto& gDIPURegisterdAllocator = *gDIPURegisterdAllocatorPtr; - const std::string algorithm = (device_type == dipu::DIPU_DEVICE_TYPE ? dipu_device_memcaching_algorithm : dipu_host_memcaching_algorithm); + c10::Allocator *result = nullptr; + auto &gDIPURegisterdAllocator = *gDIPURegisterdAllocatorPtr; + const std::string algorithm = + (device_type == dipu::DIPU_DEVICE_TYPE ? dipu_device_memcaching_algorithm + : dipu_host_memcaching_algorithm); if (gDIPURegisterdAllocator[device_type].count(algorithm) > 0) { - auto allocator_geter = std::get<0>(gDIPURegisterdAllocator[device_type][algorithm]); + auto allocator_geter = + std::get<0>(gDIPURegisterdAllocator[device_type][algorithm]); int device_index = 0; if (device_type == dipu::DIPU_DEVICE_TYPE) { - device_index = device.has_index() ? device.index() : devproxy::current_device(); + device_index = + device.has_index() ? 
device.index() : devproxy::current_device(); } auto allocator = allocator_geter(device_index); - if(device_type == dipu::DIPU_DEVICE_TYPE) { + if (device_type == dipu::DIPU_DEVICE_TYPE) { used_allocator.insert(allocator); } return allocator; } - TORCH_CHECK(false, "No allocator found for the device using the given algorithm:", device_type, dipu_device_memcaching_algorithm); + TORCH_CHECK(false, + "No allocator found for the device using the given algorithm:", + device_type, dipu_device_memcaching_algorithm); return nullptr; } -c10::Allocator* getAllocator(c10::DeviceType device_type) { +c10::Allocator *getAllocator(c10::DeviceType device_type) { return getAllocator(c10::Device(device_type)); } void emptyCachedMem() { auto empty_allocator_cache = [](auto allocator) { - auto cached_allocator = dynamic_cast(allocator); - DIPU_DEBUG_ALLOCATOR(8, __FUNCTION__ << " allocator:" << allocator << ", cached_allocator:" << cached_allocator); + auto cached_allocator = dynamic_cast(allocator); + DIPU_DEBUG_ALLOCATOR(8, __FUNCTION__ + << " allocator:" << allocator + << ", cached_allocator:" << cached_allocator); if (cached_allocator != nullptr) { cached_allocator->empty_cache(); } }; - for (auto& allocator : used_allocator) { + for (auto &allocator : used_allocator) { empty_allocator_cache(allocator); } } void releaseAllDeviceMem() { auto release_allocator_memory = [](auto allocator) { - auto cached_allocator = dynamic_cast(allocator); - DIPU_DEBUG_ALLOCATOR(8, "release_allocator_memory: allocator:" << allocator << ", cached_allocator:" << cached_allocator); + auto cached_allocator = dynamic_cast(allocator); + DIPU_DEBUG_ALLOCATOR(8, "release_allocator_memory: allocator:" + << allocator + << ", cached_allocator:" << cached_allocator); if (cached_allocator != nullptr) { cached_allocator->release_all_memory(); } }; - for (auto& allocator : used_allocator) { + for (auto &allocator : used_allocator) { release_allocator_memory(allocator); } } -size_t memoryReserved(const c10::Device& device) { - c10::Allocator* allocator = getAllocator(device); - auto cached_allocator = dynamic_cast(allocator); +size_t memoryReserved(const c10::Device &device) { + c10::Allocator *allocator = getAllocator(device); + auto cached_allocator = dynamic_cast(allocator); if (cached_allocator != nullptr) { - return cached_allocator->memory_reserved(); + return cached_allocator->memory_reserved(); } return 0; } -size_t memoryAllocated(const c10::Device& device) { - c10::Allocator* allocator = getAllocator(device); - auto cached_allocator = dynamic_cast(allocator); +size_t memoryAllocated(const c10::Device &device) { + c10::Allocator *allocator = getAllocator(device); + auto cached_allocator = dynamic_cast(allocator); if (cached_allocator != nullptr) { - return cached_allocator->memory_allocated(); + return cached_allocator->memory_allocated(); } return 0; } -size_t maxMemoryReserved(const c10::Device& device) { - c10::Allocator* allocator = getAllocator(device); - auto cached_allocator = dynamic_cast(allocator); +size_t maxMemoryReserved(const c10::Device &device) { + c10::Allocator *allocator = getAllocator(device); + auto cached_allocator = dynamic_cast(allocator); if (cached_allocator != nullptr) { - return cached_allocator->max_memory_reserved(); + return cached_allocator->max_memory_reserved(); } return 0; } -size_t maxMemoryAllocated(const c10::Device& device) { - c10::Allocator* allocator = getAllocator(device); - auto cached_allocator = dynamic_cast(allocator); +size_t maxMemoryAllocated(const c10::Device &device) { + 
c10::Allocator *allocator = getAllocator(device); + auto cached_allocator = dynamic_cast(allocator); if (cached_allocator != nullptr) { - return cached_allocator->max_memory_allocated(); + return cached_allocator->max_memory_allocated(); } return 0; } -void recordStream(const c10::DataPtr& ptr, DIPUStream stream) { - void* ctx = ptr.get_context(); - if(ctx == nullptr) { +void recordStream(const c10::DataPtr &ptr, DIPUStream stream) { + void *ctx = ptr.get_context(); + if (ctx == nullptr) { return; } - auto base_cxt = static_cast(ctx); + auto base_cxt = static_cast(ctx); if (base_cxt) { base_cxt->streams().insert(stream); } } -void recordStream(const at::Tensor& tensor, DIPUStream stream) { - dipu::recordStream(tensor.storage().data_ptr(), stream); +void recordStream(const at::Tensor &tensor, DIPUStream stream) { + dipu::recordStream(tensor.storage().data_ptr(), stream); } namespace { - class DIPUDeviceCachingProxy: public c10::Allocator { - c10::DeviceType device_type_; - public: - DIPUDeviceCachingProxy(c10::DeviceType device_type):device_type_(device_type) { +class DIPUDeviceCachingProxy : public c10::Allocator { + c10::DeviceType device_type_; - } - - ~DIPUDeviceCachingProxy() { + public: + DIPUDeviceCachingProxy(c10::DeviceType device_type) + : device_type_(device_type) {} - } + ~DIPUDeviceCachingProxy() {} - c10::DataPtr allocate(size_t size) const { - return getAllocator(device_type_)->allocate(size); - } + c10::DataPtr allocate(size_t size) const { + return getAllocator(device_type_)->allocate(size); + } - c10::DeleterFnPtr raw_deleter() const override { - return getAllocator(device_type_)->raw_deleter(); - } - }; - static DIPUDeviceCachingProxy dipu_default_device_allocator(dipu::DIPU_DEVICE_TYPE); + c10::DeleterFnPtr raw_deleter() const override { + return getAllocator(device_type_)->raw_deleter(); + } }; +static DIPUDeviceCachingProxy dipu_default_device_allocator( + dipu::DIPU_DEVICE_TYPE); +}; // namespace void initCachedAllocator() { // Make the c10::GetAllocator interface available - c10::SetAllocator(dipu::DIPU_DEVICE_TYPE, &dipu_default_device_allocator, 255); + c10::SetAllocator(dipu::DIPU_DEVICE_TYPE, &dipu_default_device_allocator, + 255); c10::SetAllocator(c10::DeviceType::CUDA, &dipu_default_device_allocator, 255); } - } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h index b13331584..8768615f4 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h @@ -1,131 +1,117 @@ // Copyright (c) 2023, DeepLink. #pragma once +#include +#include +#include + #include #include -#include "DIPURawAllocator.h" -#include "DIPUAsyncResourcePool.h" -#include "../DIPUEvent.h" -#include -#include -#include +#include "../DIPUEvent.h" +#include "DIPUAsyncResourcePool.h" +#include "DIPURawAllocator.h" namespace dipu { -using AsyncMemPool = AsyncResourcePool>; +using AsyncMemPool = AsyncResourcePool>; - -class MemStats -{ -private: +class MemStats { + private: mutable size_t reserved_in_bytes_ = 0; mutable size_t allocated_in_bytes_ = 0; mutable size_t max_reserved_in_bytes_ = 0; mutable size_t max_allocated_in_bytes_ = 0; -protected: + + protected: void set_memory_reserved(size_t reserved_in_bytes) const { reserved_in_bytes_ = reserved_in_bytes; - max_reserved_in_bytes_ = max_reserved_in_bytes_ > reserved_in_bytes ? 
max_reserved_in_bytes_ : reserved_in_bytes; + max_reserved_in_bytes_ = max_reserved_in_bytes_ > reserved_in_bytes + ? max_reserved_in_bytes_ + : reserved_in_bytes; } void set_memory_allocated(size_t allocated_in_bytes) const { allocated_in_bytes_ = allocated_in_bytes; - max_allocated_in_bytes_ = max_allocated_in_bytes_ > allocated_in_bytes ? max_allocated_in_bytes_ : allocated_in_bytes; + max_allocated_in_bytes_ = max_allocated_in_bytes_ > allocated_in_bytes + ? max_allocated_in_bytes_ + : allocated_in_bytes; } -public: - MemStats() { - - } + public: + MemStats() {} ~MemStats() { if (allocated_in_bytes_ != 0) { - DIPU_DEBUG_ALLOCATOR(8, "~MemStats: allocated_in_bytes_:" << allocated_in_bytes_); + DIPU_DEBUG_ALLOCATOR( + 8, "~MemStats: allocated_in_bytes_:" << allocated_in_bytes_); } if (reserved_in_bytes_ != 0) { - DIPU_DEBUG_ALLOCATOR(2, "~MemStats: reserved_in_bytes_:" << reserved_in_bytes_); + DIPU_DEBUG_ALLOCATOR( + 2, "~MemStats: reserved_in_bytes_:" << reserved_in_bytes_); } } - size_t memory_allocated() const { - return allocated_in_bytes_; - } + size_t memory_allocated() const { return allocated_in_bytes_; } - size_t memory_reserved() const { - return reserved_in_bytes_; - } + size_t memory_reserved() const { return reserved_in_bytes_; } - size_t max_memory_allocated() { - return max_allocated_in_bytes_; - } + size_t max_memory_allocated() { return max_allocated_in_bytes_; } - size_t max_memory_reserved() { - return max_reserved_in_bytes_; - } + size_t max_memory_reserved() { return max_reserved_in_bytes_; } }; - -class DIPU_API CacheAllocator: public c10::Allocator, public MemStats { - c10::Allocator* raw_allocator_ = nullptr; - AsyncMemPool* async_mem_pool_ = nullptr; +class DIPU_API CacheAllocator : public c10::Allocator, public MemStats { + c10::Allocator *raw_allocator_ = nullptr; + AsyncMemPool *async_mem_pool_ = nullptr; mutable c10::Device device_ = c10::DeviceType::CPU; - protected: - c10::Allocator* raw_allocator() const { - return raw_allocator_; - } - - AsyncMemPool* async_mem_pool() const { - return async_mem_pool_; - } + protected: + c10::Allocator *raw_allocator() const { return raw_allocator_; } - void* allocate_raw(size_t n) { - return raw_allocator()->raw_allocate(n); - } + AsyncMemPool *async_mem_pool() const { return async_mem_pool_; } - void free_raw(void* ptr) { - return raw_allocator()->raw_deallocate(ptr); - } + void *allocate_raw(size_t n) { return raw_allocator()->raw_allocate(n); } - public: - CacheAllocator() { + void free_raw(void *ptr) { return raw_allocator()->raw_deallocate(ptr); } - } + public: + CacheAllocator() {} - void set_raw_allocator(c10::Allocator* raw_allocator) { - raw_allocator_ = raw_allocator; - device_ = raw_allocator_->allocate(0).device(); - } + void set_raw_allocator(c10::Allocator *raw_allocator) { + raw_allocator_ = raw_allocator; + device_ = raw_allocator_->allocate(0).device(); + } - void set_async_mem_pool(AsyncMemPool* async_mem_pool) { - async_mem_pool_ = async_mem_pool; - } + void set_async_mem_pool(AsyncMemPool *async_mem_pool) { + async_mem_pool_ = async_mem_pool; + } - virtual ~CacheAllocator() { + virtual ~CacheAllocator(){ - }; + }; - virtual void empty_cache() const = 0; + virtual void empty_cache() const = 0; - virtual void release_all_memory() const = 0; + virtual void release_all_memory() const = 0; - c10::Device& device() const { - return device_; - } + c10::Device &device() const { return device_; } class DataPtrContextBase { - private: + private: std::set streams_; - mutable const CacheAllocator* allocator_ = 
nullptr; - void* ptr_ = nullptr; + mutable const CacheAllocator *allocator_ = nullptr; + void *ptr_ = nullptr; size_t size_ = 0; - public: - DataPtrContextBase(const CacheAllocator* allocator, void* ptr, size_t size): allocator_(allocator), ptr_(ptr), size_(size) { + + public: + DataPtrContextBase(const CacheAllocator *allocator, void *ptr, size_t size) + : allocator_(allocator), ptr_(ptr), size_(size) { if (allocator_->device().type() == dipu::DIPU_DEVICE_TYPE) { auto current_stream = getCurrentDIPUStream(); // If current stream is the default stream, we don't need to synchronize - // But before releasing the memory we must synchronize the default stream + // But before releasing the memory we must synchronize the default + // stream if (getDefaultDIPUStream() != current_stream) { streams_.insert(current_stream); } @@ -133,35 +119,31 @@ class DIPU_API CacheAllocator: public c10::Allocator, public MemStats { MemChecker::instance().insert(ptr, size); } - ~DataPtrContextBase() { - MemChecker::instance().erase(ptr_); - } + ~DataPtrContextBase() { MemChecker::instance().erase(ptr_); } - std::set& streams() { - return streams_; - } + std::set &streams() { return streams_; } - const CacheAllocator* allocator() { - return allocator_; - } + const CacheAllocator *allocator() { return allocator_; } - void* ptr() {return ptr_;} + void *ptr() { return ptr_; } - size_t size() {return size_;} + size_t size() { return size_; } }; }; -void setAllocator(const std::string name, c10::DeviceType device_type, std::function allocator_get_fn, uint8_t priority = 0); +void setAllocator(const std::string name, c10::DeviceType device_type, + std::function allocator_get_fn, + uint8_t priority = 0); -c10::Allocator* getAllocator(c10::DeviceType device_type); +c10::Allocator *getAllocator(c10::DeviceType device_type); -size_t memoryReserved(const c10::Device& device); +size_t memoryReserved(const c10::Device &device); -size_t memoryAllocated(const c10::Device& device); +size_t memoryAllocated(const c10::Device &device); -size_t maxMemoryReserved(const c10::Device& device); +size_t maxMemoryReserved(const c10::Device &device); -size_t maxMemoryAllocated(const c10::Device& device); +size_t maxMemoryAllocated(const c10::Device &device); void emptyCachedMem(); @@ -169,51 +151,56 @@ void initCachedAllocator(); void releaseAllDeviceMem(); -void recordStream(const c10::DataPtr& ptr, DIPUStream stream); +void recordStream(const c10::DataPtr &ptr, DIPUStream stream); -void recordStream(const at::Tensor& tensor, DIPUStream stream); +void recordStream(const at::Tensor &tensor, DIPUStream stream); namespace { // For internal implementation only struct AllocatorRegisterer { - explicit AllocatorRegisterer(const std::string name, c10::DeviceType device_type, std::function allocator_get_fn, uint8_t priority = 0) { + explicit AllocatorRegisterer( + const std::string name, c10::DeviceType device_type, + std::function allocator_get_fn, + uint8_t priority = 0) { setAllocator(name, device_type, allocator_get_fn, priority); } }; -template +template struct RawAllocator; -template<> +template <> struct RawAllocator { using type = DIPURawDeviceAllocator; }; -template<> +template <> struct RawAllocator { using type = DIPURawHostAllocator; }; -template -c10::Allocator* get_allocator_impl(c10::Allocator* raw_allocator) { - // Construct when really needed - // async_mem_pool is used when cache_allocator being destructed so it should be destructed after cache_allocator - static AsyncMemPoolImpl async_mem_pool; - static AllocatorImpl 
cache_allocator; - static int n = [&](){ - cache_allocator.set_raw_allocator(raw_allocator); - cache_allocator.set_async_mem_pool(&async_mem_pool); - return 0; - }(); - return &cache_allocator; +template +c10::Allocator *get_allocator_impl(c10::Allocator *raw_allocator) { + // Construct when really needed + // async_mem_pool is used when cache_allocator being destructed so it should + // be destructed after cache_allocator + static AsyncMemPoolImpl async_mem_pool; + static AllocatorImpl cache_allocator; + static int n = [&]() { + cache_allocator.set_raw_allocator(raw_allocator); + cache_allocator.set_async_mem_pool(&async_mem_pool); + return 0; + }(); + return &cache_allocator; } -template -c10::Allocator* get_allocator(int device_id, c10::Allocator* raw_allocator) { - #define allocator_dispatch_device_id(id) \ - if (device_id == id){ \ - return get_allocator_impl(raw_allocator); \ - } \ +template +c10::Allocator *get_allocator(int device_id, c10::Allocator *raw_allocator) { +#define allocator_dispatch_device_id(id) \ + if (device_id == id) { \ + return get_allocator_impl( \ + raw_allocator); \ + } allocator_dispatch_device_id(0); allocator_dispatch_device_id(1); @@ -234,14 +221,17 @@ c10::Allocator* get_allocator(int device_id, c10::Allocator* raw_allocator) { TORCH_CHECK(false, "support up to 16 cards"); } -#define DIPU_REGISTER_ALLOCATOR(name, device_type, CachingAllocator, priority) \ - namespace name##device_type{ \ - static RawAllocator::type raw_allocator; \ - using AsyncMemPool = AsyncResourcePoolImpl, device_type, priority>; \ - static std::function allocator_get_fn = std::bind(get_allocator, std::placeholders::_1, &raw_allocator); \ - static AllocatorRegisterer g_allocator(#name, device_type, allocator_get_fn, priority); \ +#define DIPU_REGISTER_ALLOCATOR(name, device_type, CachingAllocator, priority) \ + namespace name##device_type { \ + static RawAllocator::type raw_allocator; \ + using AsyncMemPool = AsyncResourcePoolImpl, \ + device_type, priority>; \ + static std::function allocator_get_fn = \ + std::bind(get_allocator, \ + std::placeholders::_1, &raw_allocator); \ + static AllocatorRegisterer g_allocator(#name, device_type, \ + allocator_get_fn, priority); \ } } // namespace } // namespace dipu - diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp index 5dff66651..caf3efb15 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp @@ -1,7 +1,6 @@ // Copyright (c) 2023, DeepLink. #include "DIPURawAllocator.h" -#include "../DIPUStream.h" #include #include @@ -10,25 +9,27 @@ #include #include +#include "../DIPUStream.h" + namespace dipu { static void DIPURawDeviceAllocatorDeleter(void *ptr) { - if (ptr) { - auto device = devproxy::current_device(); - DIPU_DEBUG_ALLOCATOR(2, "devproxy::freeDevice: free " << ptr); - // When only one stream is involved, in order to improve performance and memory usage, - // we actually do not use events for synchronization. - // The memory used by the same stream is allocated to the same stream for use without synchronization, - // this is no problem, but in direct release without synchronization is problematic, so adding synchronization here is necessary. 
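The comment above states the raw allocator's central invariant: without per-stream events, a device block may only be freed once any stream that might still touch it has drained, so the deleter synchronizes the default stream before releasing memory. A minimal compilable sketch of that synchronize-before-free pattern follows; the stub namespace and both functions are invented stand-ins for devproxy / DIPUStream, not DIPU APIs.

// Sketch only: stand-ins replace devproxy / DIPUStream so this compiles alone.
#include <cstdio>

namespace stub {
void syncDefaultStream() { std::puts("synchronize default stream"); }
void freeDevice(void* p) { std::printf("free device ptr %p\n", p); }
}  // namespace stub

// Without per-stream events, a block may still be in use by queued kernels,
// so the deleter drains the default stream before handing memory back.
static void rawDeviceDeleter(void* ptr) {
  if (ptr != nullptr) {
    stub::syncDefaultStream();  // wait for in-flight work first
    stub::freeDevice(ptr);
  }
}

int main() {
  int dummy = 0;
  rawDeviceDeleter(&dummy);  // demo only; a real deleter receives device memory
}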
- getDefaultDIPUStream().synchronize(); - devproxy::freeDevice(ptr); - ptr = nullptr; - } + if (ptr) { + auto device = devproxy::current_device(); + DIPU_DEBUG_ALLOCATOR(2, "devproxy::freeDevice: free " << ptr); + // When only one stream is involved, in order to improve performance and + // memory usage, we actually do not use events for synchronization. The + // memory used by the same stream is allocated to the same stream for use + // without synchronization, this is no problem, but in direct release + // without synchronization is problematic, so adding synchronization here is + // necessary. + getDefaultDIPUStream().synchronize(); + devproxy::freeDevice(ptr); + ptr = nullptr; + } } -DIPURawDeviceAllocator::DIPURawDeviceAllocator() { - -} +DIPURawDeviceAllocator::DIPURawDeviceAllocator() {} c10::DataPtr DIPURawDeviceAllocator::allocate(size_t size) const { auto idx = devproxy::current_device(); @@ -39,26 +40,30 @@ c10::DeleterFnPtr DIPURawDeviceAllocator::raw_deleter() const { return &DIPURawDeviceAllocatorDeleter; } -c10::DataPtr DIPURawDeviceAllocator::allocate(size_t nbytes, c10::DeviceIndex device_index) const { - std::lock_guard lock(mutex_); - void *data = nullptr; - if (nbytes > 0) { - devproxy::mallocDevice(&data, nbytes); - DIPU_DEBUG_ALLOCATOR(1, "devproxy::mallocDevice: malloc " << nbytes << " nbytes, ptr:" << data); - } - return {data, data, &DIPURawDeviceAllocatorDeleter, c10::Device(dipu::DIPU_DEVICE_TYPE, device_index)}; +c10::DataPtr DIPURawDeviceAllocator::allocate( + size_t nbytes, c10::DeviceIndex device_index) const { + std::lock_guard lock(mutex_); + void *data = nullptr; + if (nbytes > 0) { + devproxy::mallocDevice(&data, nbytes); + DIPU_DEBUG_ALLOCATOR(1, "devproxy::mallocDevice: malloc " + << nbytes << " nbytes, ptr:" << data); + } + return {data, data, &DIPURawDeviceAllocatorDeleter, + c10::Device(dipu::DIPU_DEVICE_TYPE, device_index)}; } class DIPURawHostAllocatorImpl final { -public: - std::pair allocate(size_t size) { + public: + std::pair allocate(size_t size) { if (size == 0) { return {nullptr, nullptr}; } - void* data = nullptr; + void *data = nullptr; devproxy::mallocHost(&data, size); - DIPU_DEBUG_ALLOCATOR(1, "devproxy::mallocHost: malloc " << size << " nbytes, ptr:" << data); + DIPU_DEBUG_ALLOCATOR( + 1, "devproxy::mallocHost: malloc " << size << " nbytes, ptr:" << data); { std::lock_guard lck(mtx_); blocks_[data] = size; @@ -66,7 +71,7 @@ class DIPURawHostAllocatorImpl final { return {data, data}; } - void free(void* ctx) { + void free(void *ctx) { if (ctx == nullptr) { return; } @@ -85,11 +90,11 @@ class DIPURawHostAllocatorImpl final { { std::lock_guard lck(mtx_); for (auto iter = blocks_.crbegin(); iter != blocks_.crend(); iter++) { - const void* ptr = iter->first; + const void *ptr = iter->first; const size_t size = iter->second; - const char* cptr = static_cast(ptr); - const char* cp = static_cast(p); - const char* max_ptr = cptr + size; + const char *cptr = static_cast(ptr); + const char *cp = static_cast(p); + const char *max_ptr = cptr + size; if (cp >= cptr && cp < max_ptr) { is_pinned = true; break; @@ -103,38 +108,35 @@ class DIPURawHostAllocatorImpl final { return is_pinned; } -private: + private: static std::mutex mtx_; - static std::map blocks_; + static std::map blocks_; }; -std::map DIPURawHostAllocatorImpl::blocks_; +std::map DIPURawHostAllocatorImpl::blocks_; std::mutex DIPURawHostAllocatorImpl::mtx_; namespace { static DIPURawHostAllocatorImpl dipu_host_allocator; -static void DIPURawHostAllocatorDeleter(void* ctx) { +static void 
DIPURawHostAllocatorDeleter(void *ctx) { dipu_host_allocator.free(ctx); } -} +} // namespace c10::DeleterFnPtr DIPURawHostAllocator::raw_deleter() const { - return &DIPURawHostAllocatorDeleter; + return &DIPURawHostAllocatorDeleter; } - c10::DataPtr DIPURawHostAllocator::allocate(size_t size) const { - auto ptr_and_ctx = dipu_host_allocator.allocate(size); - return { - ptr_and_ctx.first, - ptr_and_ctx.second, - &DIPURawHostAllocatorDeleter, - at::DeviceType::CPU}; - } +c10::DataPtr DIPURawHostAllocator::allocate(size_t size) const { + auto ptr_and_ctx = dipu_host_allocator.allocate(size); + return {ptr_and_ctx.first, ptr_and_ctx.second, &DIPURawHostAllocatorDeleter, + at::DeviceType::CPU}; +} -bool isPinnedPtr(const void* ptr) { +bool isPinnedPtr(const void *ptr) { return dipu_host_allocator.isPinnedPtr(ptr); } diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.h index 7d280cb36..13dad09e2 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.h @@ -1,47 +1,49 @@ // Copyright (c) 2023, DeepLink. #pragma once +#include +#include + #include #include #include -#include #include -#include -#include +#include namespace dipu { -#define DIPU_DEBUG_ALLOCATOR(mask, x) \ - { \ - static int value = []() { auto env = std::getenv("DIPU_DEBUG_ALLOCATOR"); return env ? std::atoi(env) : 0; }(); \ - if ((mask & value) == mask) \ - { \ - std::cout << "[" << std::this_thread::get_id() << "]" << x << std::endl; \ - } \ +#define DIPU_DEBUG_ALLOCATOR(mask, x) \ + { \ + static int value = []() { \ + auto env = std::getenv("DIPU_DEBUG_ALLOCATOR"); \ + return env ? std::atoi(env) : 0; \ + }(); \ + if ((mask & value) == mask) { \ + std::cout << "[" << std::this_thread::get_id() << "]" << x << std::endl; \ + } \ } class DIPU_API DIPURawDeviceAllocator : public c10::Allocator { -public: - DIPURawDeviceAllocator(); - - virtual c10::DataPtr allocate(size_t size) const; + public: + DIPURawDeviceAllocator(); - c10::DeleterFnPtr raw_deleter() const override; + virtual c10::DataPtr allocate(size_t size) const; - private: - static std::mutex mutex_; - c10::DataPtr allocate(size_t nbytes, c10::DeviceIndex device_index) const; - }; + c10::DeleterFnPtr raw_deleter() const override; + private: + static std::mutex mutex_; + c10::DataPtr allocate(size_t nbytes, c10::DeviceIndex device_index) const; +}; class DIPURawHostAllocator : public c10::Allocator { -public: + public: c10::DataPtr allocate(size_t size) const; c10::DeleterFnPtr raw_deleter() const override; }; -DIPU_API bool isPinnedPtr(const void* ptr); +DIPU_API bool isPinnedPtr(const void *ptr); } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawCachingAllocator.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawCachingAllocator.cpp index 69462b478..c5b5d4254 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawCachingAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawCachingAllocator.cpp @@ -4,39 +4,38 @@ namespace dipu { -static void deleteRawCachingAllocatorContext(void*); +static void deleteRawCachingAllocatorContext(void *); -class RawCachingAllocator: public CacheAllocator { -public: - RawCachingAllocator() { +class RawCachingAllocator : public CacheAllocator { + public: + RawCachingAllocator() {} - } - - ~RawCachingAllocator() { + ~RawCachingAllocator() {} - } - - class Context: public 
DataPtrContextBase { - public: - Context(const CacheAllocator* allocator, void* ptr, size_t size, size_t real_size):DataPtrContextBase(allocator, ptr, size), real_size_(real_size){} - ~Context() { - std::deque events; - for (auto iter = streams().begin(); iter != streams().end(); iter++) { - events.emplace_back(); - events.back().record(*iter); - } - auto allocator_ = static_cast(allocator()); - allocator_->async_mem_pool()->add(std::make_tuple(ptr(), size()), events); - allocator_->set_memory_allocated(allocator_->memory_allocated() - real_size_); - allocator_->empty_cache(); + class Context : public DataPtrContextBase { + public: + Context(const CacheAllocator *allocator, void *ptr, size_t size, + size_t real_size) + : DataPtrContextBase(allocator, ptr, size), real_size_(real_size) {} + ~Context() { + std::deque events; + for (auto iter = streams().begin(); iter != streams().end(); iter++) { + events.emplace_back(); + events.back().record(*iter); } - size_t real_size_ = 0; + auto allocator_ = static_cast(allocator()); + allocator_->async_mem_pool()->add(std::make_tuple(ptr(), size()), events); + allocator_->set_memory_allocated(allocator_->memory_allocated() - + real_size_); + allocator_->empty_cache(); + } + size_t real_size_ = 0; }; - size_t getAllocateSize(size_t nbytes) const{ - static const size_t kMinAllocationSize = [](){ + size_t getAllocateSize(size_t nbytes) const { + static const size_t kMinAllocationSize = []() { size_t size = 512; - const char* env = std::getenv("DIPU_RAW_ALLOCATOR_MIN_ALLOCATE_SIZE"); + const char *env = std::getenv("DIPU_RAW_ALLOCATOR_MIN_ALLOCATE_SIZE"); if (env != nullptr) { size = std::atoi(env); } @@ -49,19 +48,22 @@ class RawCachingAllocator: public CacheAllocator { c10::DataPtr allocate(size_t size) const override { size_t nbytes = getAllocateSize(size); empty_cache(); - DIPU_DEBUG_ALLOCATOR(4, "RawCachingAllocator: malloc " << nbytes << " nbytes" << ", requires:" << size << " bytes"); + DIPU_DEBUG_ALLOCATOR(4, "RawCachingAllocator: malloc " + << nbytes << " nbytes" + << ", requires:" << size << " bytes"); auto ptr = raw_allocator()->raw_allocate(nbytes); set_memory_reserved(memory_reserved() + nbytes); set_memory_allocated(memory_allocated() + nbytes); - return c10::DataPtr(ptr, new Context(this, ptr, size, nbytes), deleteRawCachingAllocatorContext, device()); + return c10::DataPtr(ptr, new Context(this, ptr, size, nbytes), + deleteRawCachingAllocatorContext, device()); } void empty_cache() const override { DIPU_DEBUG_ALLOCATOR(8, "RawCachingAllocator: empty_cache"); - while(async_mem_pool()->size() > 0) { - if(async_mem_pool()->ready()) { + while (async_mem_pool()->size() > 0) { + if (async_mem_pool()->ready()) { auto mem = async_mem_pool()->get(); - void* ptr = std::get<0>(mem); + void *ptr = std::get<0>(mem); size_t size = std::get<1>(mem); size_t nbytes = getAllocateSize(size); raw_allocator()->raw_deallocate(ptr); @@ -78,8 +80,8 @@ class RawCachingAllocator: public CacheAllocator { } }; -static void deleteRawCachingAllocatorContext(void* ptr) { - auto ctx = static_cast(ptr); +static void deleteRawCachingAllocatorContext(void *ptr) { + auto ctx = static_cast(ptr); delete ctx; } diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUSpinMutex.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUSpinMutex.h index 74c977948..c28c65c0a 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUSpinMutex.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUSpinMutex.h @@ -2,38 +2,35 @@ #pragma once #include +#include 
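DIPU_DEBUG_ALLOCATOR, defined in DIPURawAllocator.h above, reads the DIPU_DEBUG_ALLOCATOR environment variable once and prints a message only when every bit of that message's mask is set in the value, so masks compose: DIPU_DEBUG_ALLOCATOR=12 enables the mask-4 and mask-8 messages while leaving mask-1 and mask-2 silent. A self-contained copy of the gating logic (renamed DEMO_DEBUG so the sketch stands alone) illustrates this:

#include <cstdlib>
#include <iostream>
#include <thread>

// Same gating as DIPU_DEBUG_ALLOCATOR: env value parsed once at first use,
// message printed only when all bits of `mask` are present in the value.
#define DEMO_DEBUG(mask, x)                                                  \
  {                                                                          \
    static int value = []() {                                                \
      auto env = std::getenv("DIPU_DEBUG_ALLOCATOR");                        \
      return env ? std::atoi(env) : 0;                                       \
    }();                                                                     \
    if (((mask)&value) == (mask)) {                                          \
      std::cout << "[" << std::this_thread::get_id() << "]" << x             \
                << std::endl;                                                \
    }                                                                        \
  }

int main() {
  DEMO_DEBUG(4, "allocate/reuse trace");     // shown with env value 4, 12, ...
  DEMO_DEBUG(8, "cache maintenance trace");  // shown with env value 8, 12, ...
}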
 namespace dipu {

 /// Simple spin-lock to help build thread-safe functions.
 class SpinMutex {
-private:
-  std::atomic<bool> excl_ { false };
+ private:
+  std::atomic<bool> excl_{false};

-public:
-  constexpr SpinMutex() noexcept = default;
+ public:
+  constexpr SpinMutex() noexcept = default;

-  SpinMutex(const SpinMutex&) = delete;
+  SpinMutex(const SpinMutex &) = delete;

-  void delay() const noexcept {
-    std::this_thread::yield();
-  }
+  void delay() const noexcept { std::this_thread::yield(); }

-  void lock() {
-    for (bool exp = false;
-      !excl_.compare_exchange_weak(exp, true, std::memory_order_acq_rel);
-      exp = false) delay();
-  }
+  void lock() {
+    for (bool exp = false;
+         !excl_.compare_exchange_weak(exp, true, std::memory_order_acq_rel);
+         exp = false)
+      delay();
+  }

-  bool try_lock() {
-    bool exp = false;
-    return
-      excl_.compare_exchange_weak(exp, true, std::memory_order_acq_rel);
-  }
+  bool try_lock() {
+    bool exp = false;
+    return excl_.compare_exchange_weak(exp, true, std::memory_order_acq_rel);
+  }

-  void unlock() {
-    excl_.store(false, std::memory_order_release);
-  }
+  void unlock() { excl_.store(false, std::memory_order_release); }
 };

-} // namespace dipu
+}  // namespace dipu
diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.cpp
index d3bc06df2..50a66ea78 100644
--- a/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.cpp
+++ b/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.cpp
@@ -4,10 +4,10 @@ namespace dipu {
 // use c10::DeviceGuard/OptionalDeviceGuard device_guard(device_of(tensor))
-// or c10::StreamGuard/OptionalStreamGuard(c10::Stream) will use DIPUGuardImpl automatically.
+// or c10::StreamGuard/OptionalStreamGuard(c10::Stream) will use DIPUGuardImpl
+// automatically.
 constexpr at::DeviceType DIPUGuardImpl::static_type;
 C10_REGISTER_GUARD_IMPL(DIPU_DEVICE_TYPE_MACRO, DIPUGuardImpl);
-
-} // namespace torch_dipu
+}  // namespace dipu
diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.h b/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.h
index d962a751e..97abb6014 100644
--- a/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.h
+++ b/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.h
@@ -6,12 +6,13 @@
 #include
 #include
-#include
 #include
+#include

 namespace dipu {
-  // seems DIPUCachingAllocator.h and this class has Cycle reference? need refactor?
+// seems DIPUCachingAllocator.h and this class have a cyclic reference? needs
+// a refactor?
+void recordStream(const c10::DataPtr &ptr, DIPUStream stream); struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { static constexpr at::DeviceType static_type = dipu::DIPU_DEVICE_TYPE; @@ -19,9 +20,7 @@ struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { explicit DIPUGuardImpl(at::DeviceType t) { AT_ASSERT(t == dipu::DIPU_DEVICE_TYPE); } - at::DeviceType type() const override { - return dipu::DIPU_DEVICE_TYPE; - } + at::DeviceType type() const override { return dipu::DIPU_DEVICE_TYPE; } c10::Device exchangeDevice(c10::Device device) const override { AT_ASSERT(device.type() == dipu::DIPU_DEVICE_TYPE); @@ -37,13 +36,13 @@ struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { } void setDevice(c10::Device device) const override { - if (devproxy::current_device() < 0) return; + if (devproxy::current_device() < 0) return; AT_ASSERT(device.type() == dipu::DIPU_DEVICE_TYPE); devproxy::setDevice(device.index()); } void uncheckedSetDevice(c10::Device device) const noexcept override { - if (devproxy::current_device() < 0 ) return; + if (devproxy::current_device() < 0) return; devproxy::setDevice(device.index()); } @@ -55,8 +54,7 @@ struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { auto oldStream = getCurrentDIPUStream(s.device().index()); DIPUStream stream(s); setCurrentDIPUStream(stream); - return c10::Stream(c10::Stream::UNSAFE, - s.device(), + return c10::Stream(c10::Stream::UNSAFE, s.device(), static_cast(oldStream.id())); } @@ -64,7 +62,8 @@ struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { return devproxy::getDeviceCount(); } - c10::Stream getStreamFromGlobalPool(c10::Device d, bool isHighPriority = false) const override { + c10::Stream getStreamFromGlobalPool( + c10::Device d, bool isHighPriority = false) const override { return getDIPUStreamFromPool(d.index()); } @@ -72,17 +71,13 @@ struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { return getDefaultDIPUStream(device.index()); } - void record( - void** event, - const c10::Stream& s, - const c10::DeviceIndex device_index, - const c10::EventFlag flag) const override { + void record(void **event, const c10::Stream &s, + const c10::DeviceIndex device_index, + const c10::EventFlag flag) const override { TORCH_CHECK(device_index == -1 || device_index == s.device_index(), - "Event device index ", - device_index, - " does not match recording stream's device index ", - s.device_index(), - "."); + "Event device index ", device_index, + " does not match recording stream's device index ", + s.device_index(), "."); deviceEvent_t dipu_event = static_cast(*event); DIPUStream stream(s); @@ -103,9 +98,7 @@ struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { setDevice(orig_device); } - void block( - void* event, - const c10::Stream& s) const override { + void block(void *event, const c10::Stream &s) const override { if (!event) return; deviceEvent_t dipu_event = static_cast(event); const auto orig_device = this->getDevice(); @@ -115,10 +108,9 @@ struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { setDevice(orig_device); } - void destroyEvent(void* event, const c10::DeviceIndex device_index) + void destroyEvent(void *event, const c10::DeviceIndex device_index) const noexcept override { - if (!event) - return; + if (!event) return; auto dipu_event = static_cast(event); const c10::Device orig_device = this->getDevice(); devproxy::setDevice(device_index); @@ -126,10 +118,11 @@ struct DIPUGuardImpl : public 
c10::impl::DeviceGuardImplInterface { devproxy::destroyEvent(dipu_event); setDevice(orig_device); } - // call from ivalue_inl.h synchronizeWithCurrentStreams with 'current stream' = default stream. - // it's useless in ddp, because output tensor is record with comm stream in colletive(), - // but may be useful in other communication mode. - void recordDataPtrOnStream(const c10::DataPtr& dataptr, const c10::Stream& s) const override { + // call from ivalue_inl.h synchronizeWithCurrentStreams with 'current stream' + // = default stream. it's useless in ddp, because output tensor is record with + // comm stream in colletive(), but may be useful in other communication mode. + void recordDataPtrOnStream(const c10::DataPtr &dataptr, + const c10::Stream &s) const override { DIPUStream stream(s); if (stream != getDefaultDIPUStream()) { dipu::recordStream(dataptr, stream); diff --git a/dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h b/dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h index 84223cace..364844e3a 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h @@ -1,68 +1,54 @@ // Copyright (c) 2023, DeepLink. #pragma once -// todo:: dev api will remove pytorch dependency +// todo:: dev api will remove pytorch dependency #include + #include // todo: move out deice dir to diopi namespace dipu { -#define DIPU_API __attribute__ ((visibility ("default"))) +#define DIPU_API __attribute__((visibility("default"))) -#define DIPU_WEAK __attribute__((weak)) +#define DIPU_WEAK __attribute__((weak)) // "default", "hidden", "protected" or "internal -#define DIPU_HIDDEN __attribute__ ((visibility ("hidden"))) +#define DIPU_HIDDEN __attribute__((visibility("hidden"))) typedef int32_t enum_t; #define DIPU_STRING(x) #x #define DIPU_CODELOC __FILE__ " (" DIPU_STRING(__LINE__) ")" +#define DIPU_LOGE(fmt, ...) \ + printf("[ERROR]%s,%s:%u:" #fmt "\n", __FUNCTION__, __FILE__, __LINE__, \ + ##__VA_ARGS__) -#define DIPU_LOGE(fmt, ...) \ - printf( \ - "[ERROR]%s,%s:%u:" #fmt "\n", \ - __FUNCTION__, \ - __FILE__, \ - __LINE__, \ - ##__VA_ARGS__) - -#define DIPU_LOGW(fmt, ...) \ - printf( \ - "[WARN]%s,%s:%u:" #fmt "\n", \ - __FUNCTION__, \ - __FILE__, \ - __LINE__, \ - ##__VA_ARGS__) - +#define DIPU_LOGW(fmt, ...) 
\ + printf("[WARN]%s,%s:%u:" #fmt "\n", __FUNCTION__, __FILE__, __LINE__, \ + ##__VA_ARGS__) namespace devapis { enum class VendorDeviceType : enum_t { - MLU, //camb - NPU, //ascend - CUDA, //cuda - GCU, //gcu - SUPA, //Biren - DROPLET, //droplet + MLU, // camb + NPU, // ascend + CUDA, // cuda + GCU, // gcu + SUPA, // Biren + DROPLET, // droplet }; -enum class EventStatus: enum_t { - PENDING, - RUNNING, - DEFERRED, - READY -}; +enum class EventStatus : enum_t { PENDING, RUNNING, DEFERRED, READY }; -enum class OpStatus: enum_t { +enum class OpStatus : enum_t { SUCCESS, ERR_UNKNOWN, ERR_NOMEM, }; -enum class MemCPKind: enum_t { +enum class MemCPKind : enum_t { D2H, H2D, D2D, @@ -91,6 +77,5 @@ struct DIPUDeviceProperties { using deviceId_t = c10::DeviceIndex; - -} // end namespace devapis -} // end namespace dipu +} // end namespace devapis +} // end namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/device/deviceapis.h b/dipu/torch_dipu/csrc_dipu/runtime/device/deviceapis.h index 5ae2e871f..a4e7576ee 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/device/deviceapis.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/device/deviceapis.h @@ -4,6 +4,7 @@ #include #include + #include "./basedef.h" namespace dipu { @@ -32,11 +33,11 @@ DIPU_API void checkLastError(); DIPU_API int getDeviceCount(); -DIPU_API void getDriverVersion(int* version); +DIPU_API void getDriverVersion(int *version); -DIPU_API void getRuntimeVersion(int* version); +DIPU_API void getRuntimeVersion(int *version); -DIPU_API void createStream(deviceStream_t* stream, bool prior=false); +DIPU_API void createStream(deviceStream_t *stream, bool prior = false); DIPU_API void destroyStream(deviceStream_t stream); DIPU_API void destroyStream(deviceStream_t stream, deviceId_t devId); @@ -56,7 +57,7 @@ DIPU_API bool isStreamEmpty(deviceStream_t stream); // device event related // ===================== -DIPU_API void createEvent(deviceEvent_t* event); +DIPU_API void createEvent(deviceEvent_t *event); DIPU_API void destroyEvent(deviceEvent_t event); @@ -64,45 +65,54 @@ DIPU_API void waitEvent(deviceEvent_t event); DIPU_API void recordEvent(deviceEvent_t event, deviceStream_t stream); -DIPU_API void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end); +DIPU_API void eventElapsedTime(float *time, deviceEvent_t start, + deviceEvent_t end); DIPU_API EventStatus getEventStatus(deviceEvent_t event); // ===================== // mem related // ===================== -DIPU_API void mallocHost(void** p, size_t nbytes); +DIPU_API void mallocHost(void **p, size_t nbytes); -DIPU_API void freeHost(void* p); +DIPU_API void freeHost(void *p); -DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion= true); +DIPU_API OpStatus mallocDevice(void **p, size_t nbytes, + bool throwExcepion = true); -DIPU_API void freeDevice(void* p); +DIPU_API void freeDevice(void *p); DIPU_API bool isPinnedPtr(const void *p); // (asynchronous) set val -DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size); +DIPU_API void memSetAsync(const deviceStream_t stream, void *ptr, int val, + size_t size); // (synchronous) copy from device to a device -DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src); +DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src); // (synchronous) copy from host to a device -DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const 
void* src); +DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void *dst, + /*Host srcDev,*/ const void *src); // (synchronous) copy from a device to host -DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src); +DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void *dst, + /*deviceId_t srcDevId,*/ const void *src); // (asynchronous) copy from device to a device DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, - deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src); + deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src); // (asynchronous) copy from host to a device DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, - /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src); + /*deviceId_t dstDevId,*/ void *dst, + /*Host srcDev,*/ const void *src); // (asynchronous) copy from a device to host DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, - /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src); + /*Host dstDev,*/ void *dst, + /*deviceId_t srcDevId,*/ const void *src); } // end namespace devapis } // end namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/device/diclapis.h b/dipu/torch_dipu/csrc_dipu/runtime/device/diclapis.h index ad47dc597..ef0df8479 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/device/diclapis.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/device/diclapis.h @@ -1,57 +1,72 @@ #pragma once -#include #include +#include #include + #include "./deviceapis.h" namespace dipu { // need add return status. namespace devapis { - // todo: define new diopi reduceop. - using ReduceOp = c10d::ReduceOp; - - extern const int DICL_UNIQUE_ID_BYTES_SIZE; +// todo: define new diopi reduceop. 
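The declarations that follow form the vendor-facing collective API. As a rough sketch (not DIPU code) of the shape a backend implementation takes behind diclAllReduce: DemoResult, FakeComm, and FakeStream below are invented stand-ins for diclResult_t, diclComm_t, and deviceStream_t, and a real backend would map them onto its own communicator and stream handles (e.g. ncclComm_t and cudaStream_t on CUDA) and enqueue the collective asynchronously.

#include <cstddef>
#include <cstdio>

enum class DemoResult { SUCCESS, ERR_UNKNOWN };  // stand-in for diclResult_t
struct FakeComm {};    // stand-in for diclComm_t
struct FakeStream {};  // stand-in for deviceStream_t

// Shape of a backend implementation: enqueue the collective on `stream` and
// return immediately; completion is observed via the stream, not the call.
DemoResult demoAllReduce(const void* sendBuf, void* recvBuf, size_t count,
                         FakeComm* /*comm*/, FakeStream /*stream*/) {
  (void)sendBuf;
  (void)recvBuf;
  std::printf("enqueue all-reduce of %zu elements\n", count);
  return DemoResult::SUCCESS;
}

int main() {
  float buf[4] = {1, 2, 3, 4};
  FakeComm comm;
  demoAllReduce(buf, buf, 4, &comm, FakeStream{});  // in-place reduce
}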
+using ReduceOp = c10d::ReduceOp; - // todo:: dipu only export devproxy but not devapis (which move o diopi) - DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm); +extern const int DICL_UNIQUE_ID_BYTES_SIZE; - DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId); +// todo:: dipu only export devproxy but not devapis (which move o diopi) +DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm); - DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId, int rank, int localDeviceId = -1); +DIPU_API diclResult_t diclGetUniqueId(commUniqueId *uniqueId); - // DIPU_API void diclCommInitAll(diclComm_t* comms, int ndev, const int* devlist); +DIPU_API diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, + commUniqueId uniqueId, int rank, + int localDeviceId = -1); - DIPU_API diclResult_t diclCommDestroy(diclComm_t comm); +// DIPU_API void diclCommInitAll(diclComm_t* comms, int ndev, const int* +// devlist); - // DIPU_API diclResult_t diclCommFinalize(diclComm_t comm); +DIPU_API diclResult_t diclCommDestroy(diclComm_t comm); - // DIPU_API diclResult_t diclCommAbort(diclComm_t comm); +// DIPU_API diclResult_t diclCommFinalize(diclComm_t comm); - DIPU_API diclResult_t diclAllReduce(const void *sendBuf, void *recvBuf, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream); +// DIPU_API diclResult_t diclCommAbort(diclComm_t comm); - DIPU_API diclResult_t diclBroadcast(const void *sendBuf, void* recvBuf, size_t count, at::ScalarType datatype, - int root, diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclAllReduce(const void *sendBuf, void *recvBuf, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, diclComm_t comm, + deviceStream_t stream); - DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, size_t sendCount, at::ScalarType datatype, - diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclBroadcast(const void *sendBuf, void *recvBuf, + size_t count, at::ScalarType datatype, + int root, diclComm_t comm, + deviceStream_t stream); - DIPU_API diclResult_t diclReduce(const void* sendbuf, void* recvBuf, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, + size_t sendCount, at::ScalarType datatype, + diclComm_t comm, deviceStream_t stream); - DIPU_API diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, size_t recvCount, at::ScalarType datatype, - const ReduceOp& op, diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclReduce(const void *sendbuf, void *recvBuf, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, int root, + diclComm_t comm, deviceStream_t stream); - DIPU_API diclResult_t diclSend(void* recvBuf, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, + size_t recvCount, + at::ScalarType datatype, + const ReduceOp &op, diclComm_t comm, + deviceStream_t stream); - DIPU_API diclResult_t diclRecv(void* recvBuf, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclSend(void *recvBuf, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclRecv(void *recvBuf, size_t count, + at::ScalarType datatype, int 
peer, + diclComm_t comm, deviceStream_t stream); -} // namespace devapis +} // namespace devapis -} // namespace dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp index 9c9c29fd0..0cd765205 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp @@ -1,5 +1,6 @@ // Copyright (c) 2023, DeepLink. #include "./deviceproxy.h" + #include "../core/DIPUEventPool.h" namespace dipu { @@ -17,9 +18,7 @@ void finalizeVendor() { } } -deviceId_t current_device() { - return devapis::current_device(); -} +deviceId_t current_device() { return devapis::current_device(); } DIPUDeviceProperties getDeviceProperties(int32_t device_index) { return devapis::getDeviceProperties(device_index); @@ -33,36 +32,26 @@ DIPUDeviceStatus getDeviceStatus(int32_t device_index) { } // set current device given device according to id -void setDevice(deviceId_t devId) { - return devapis::setDevice(devId); -} +void setDevice(deviceId_t devId) { return devapis::setDevice(devId); } -void resetDevice(deviceId_t devId) { - return devapis::resetDevice(devId); -} +void resetDevice(deviceId_t devId) { return devapis::resetDevice(devId); } -void syncDevice() { - return devapis::syncDevice(); -} +void syncDevice() { return devapis::syncDevice(); } // check last launch succ or not, throw if fail -void checkLastError() { - return devapis::checkLastError(); -} +void checkLastError() { return devapis::checkLastError(); } -int getDeviceCount() { - return devapis::getDeviceCount(); -} +int getDeviceCount() { return devapis::getDeviceCount(); } -void getDriverVersion(int* version) { +void getDriverVersion(int *version) { return devapis::getDriverVersion(version); } -void getRuntimeVersion(int* version) { +void getRuntimeVersion(int *version) { return devapis::getRuntimeVersion(version); } -void createStream(deviceStream_t* stream, bool prior) { +void createStream(deviceStream_t *stream, bool prior) { return devapis::createStream(stream, prior); } @@ -74,13 +63,9 @@ void destroyStream(deviceStream_t stream, deviceId_t devId) { return devapis::destroyStream(stream, devId); } -void releaseStream() { - return devapis::releaseStream(); -} +void releaseStream() { return devapis::releaseStream(); } -void syncStream(deviceStream_t stream) { - return devapis::syncStream(stream); -} +void syncStream(deviceStream_t stream) { return devapis::syncStream(stream); } bool streamNotNull(deviceStream_t stream) { return devapis::streamNotNull(stream); @@ -99,17 +84,11 @@ bool isStreamEmpty(deviceStream_t stream) { // device event related // ===================== -void createEvent(deviceEvent_t* event) { - return getEventFromPool(*event); -} +void createEvent(deviceEvent_t *event) { return getEventFromPool(*event); } -void destroyEvent(deviceEvent_t event) { - return restoreEventToPool(event); -} +void destroyEvent(deviceEvent_t event) { return restoreEventToPool(event); } -void waitEvent(deviceEvent_t event) { - return devapis::waitEvent(event); -} +void waitEvent(deviceEvent_t event) { return devapis::waitEvent(event); } void recordEvent(deviceEvent_t event, deviceStream_t stream) { return devapis::recordEvent(event, stream); @@ -126,61 +105,61 @@ EventStatus getEventStatus(deviceEvent_t event) { // ===================== // mem related // ===================== -void mallocHost(void** p, size_t nbytes) { +void 
mallocHost(void **p, size_t nbytes) { return devapis::mallocHost(p, nbytes); } -void freeHost(void* p) { - return devapis::freeHost(p); -} +void freeHost(void *p) { return devapis::freeHost(p); } -OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion) { +OpStatus mallocDevice(void **p, size_t nbytes, bool throwExcepion) { return devapis::mallocDevice(p, nbytes, throwExcepion); } -void freeDevice(void* p) { - return devapis::freeDevice(p); -} +void freeDevice(void *p) { return devapis::freeDevice(p); } -bool isPinnedPtr(const void *p) { - return devapis::isPinnedPtr(p); -} +bool isPinnedPtr(const void *p) { return devapis::isPinnedPtr(p); } // (asynchronous) set val -void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) { +void memSetAsync(const deviceStream_t stream, void *ptr, int val, size_t size) { return devapis::memSetAsync(stream, ptr, val, size); } // (synchronous) copy from device to a device -void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { +void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src) { return devapis::memCopyD2D(nbytes, dstDevId, dst, srcDevId, src); } // (synchronous) copy from host to a device -void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src) { +void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void *dst, + /*Host srcDev,*/ const void *src) { return devapis::memCopyH2D(nbytes, dst, src); } // (synchronous) copy from a device to host -void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src) { +void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void *dst, + /*deviceId_t srcDevId,*/ const void *src) { return devapis::memCopyD2H(nbytes, dst, src); } // (asynchronous) copy from device to a device void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, - deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { + deviceId_t dstDevId, void *dst, deviceId_t srcDevId, + const void *src) { return devapis::memCopyD2DAsync(stream, nbytes, dstDevId, dst, srcDevId, src); } // (asynchronous) copy from host to a device void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, - /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src) { + /*deviceId_t dstDevId,*/ void *dst, + /*Host srcDev,*/ const void *src) { return devapis::memCopyH2DAsync(stream, nbytes, dst, src); } // (asynchronous) copy from a device to host void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, - /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src) { + /*Host dstDev,*/ void *dst, + /*deviceId_t srcDevId,*/ const void *src) { return devapis::memCopyD2HAsync(stream, nbytes, dst, src); } diff --git a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.h b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.h index 2a69300b9..ac20802c0 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.h @@ -34,11 +34,11 @@ DIPU_API void checkLastError(); DIPU_API int getDeviceCount(); -DIPU_API void getDriverVersion(int* version); +DIPU_API void getDriverVersion(int *version); -DIPU_API void getRuntimeVersion(int* version); +DIPU_API void getRuntimeVersion(int *version); -DIPU_API void createStream(deviceStream_t* stream, bool prior=false); +DIPU_API void createStream(deviceStream_t *stream, bool prior = false); DIPU_API void 
destroyStream(deviceStream_t stream); DIPU_API void destroyStream(deviceStream_t stream, deviceId_t devId); @@ -58,7 +58,7 @@ DIPU_API bool isStreamEmpty(deviceStream_t stream); // device event related // ===================== -DIPU_API void createEvent(deviceEvent_t* event); +DIPU_API void createEvent(deviceEvent_t *event); DIPU_API void destroyEvent(deviceEvent_t event); @@ -66,46 +66,55 @@ DIPU_API void waitEvent(deviceEvent_t event); DIPU_API void recordEvent(deviceEvent_t event, deviceStream_t stream); -DIPU_API void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end); +DIPU_API void eventElapsedTime(float *time, deviceEvent_t start, + deviceEvent_t end); DIPU_API EventStatus getEventStatus(deviceEvent_t event); // ===================== // mem related // ===================== -DIPU_API void mallocHost(void** p, size_t nbytes); +DIPU_API void mallocHost(void **p, size_t nbytes); -DIPU_API void freeHost(void* p); +DIPU_API void freeHost(void *p); -DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion= true); +DIPU_API OpStatus mallocDevice(void **p, size_t nbytes, + bool throwExcepion = true); -DIPU_API void freeDevice(void* p); +DIPU_API void freeDevice(void *p); DIPU_API bool isPinnedPtr(const void *p); // (asynchronous) set val -DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size); +DIPU_API void memSetAsync(const deviceStream_t stream, void *ptr, int val, + size_t size); // (synchronous) copy from device to a device -DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src); +DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src); // (synchronous) copy from host to a device -DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src); +DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void *dst, + /*Host srcDev,*/ const void *src); // (synchronous) copy from a device to host -DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src); +DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void *dst, + /*deviceId_t srcDevId,*/ const void *src); // (asynchronous) copy from device to a device DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, - deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src); + deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src); // (asynchronous) copy from host to a device DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, - /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src); + /*deviceId_t dstDevId,*/ void *dst, + /*Host srcDev,*/ const void *src); // (asynchronous) copy from a device to host DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, - /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src); + /*Host dstDev,*/ void *dst, + /*deviceId_t srcDevId,*/ const void *src); } // end namespace devproxy } // end namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/diclproxy.cpp b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/diclproxy.cpp index bbe8dd9e0..c21d217b3 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/diclproxy.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/diclproxy.cpp @@ -7,64 +7,75 @@ namespace dipu { // need enhance return status. 
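For illustration (not part of this patch): the createEvent/destroyEvent proxies in deviceproxy.cpp above route through the event pool (getEventFromPool/restoreEventToPool) rather than the raw vendor API, so borrowing events for timing is cheap. A minimal sketch of a caller timing work on a stream; the function name timeKernel, the include path, and the assumption that waitEvent blocks the host are ours, not confirmed by this patch:

#include "csrc_dipu/runtime/devproxy/deviceproxy.h"  // assumed include path

float timeKernel(dipu::deviceStream_t stream) {
  dipu::deviceEvent_t begin = nullptr;
  dipu::deviceEvent_t end = nullptr;
  dipu::devproxy::createEvent(&begin);  // borrowed from the event pool
  dipu::devproxy::createEvent(&end);
  dipu::devproxy::recordEvent(begin, stream);
  // ... enqueue the kernel(s) to be measured on `stream` here ...
  dipu::devproxy::recordEvent(end, stream);
  dipu::devproxy::waitEvent(end);  // assumed to block until `end` completes
  float ms = 0.0F;
  dipu::devproxy::eventElapsedTime(&ms, begin, end);
  dipu::devproxy::destroyEvent(begin);  // returned to the pool, not freed
  dipu::devproxy::destroyEvent(end);
  return ms;
}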
namespace devproxy { - diclResult_t diclGetCommAsyncError(diclComm_t comm) { - return devapis::diclGetCommAsyncError(comm); - } - - diclResult_t diclGetUniqueId(commUniqueId* uniqueId) { - return devapis::diclGetUniqueId(uniqueId); - } - - diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId, int rank, int localDeviceId) { - return devapis::diclCommInitRank(comm, nranks, uniqueId, rank, localDeviceId); - } - - // void diclCommInitAll(diclComm_t* comms, int ndev, const int* devlist); - - diclResult_t diclCommDestroy(diclComm_t comm) { - return devapis::diclCommDestroy(comm); - } - - // diclResult_t diclCommFinalize(diclComm_t comm); - - // diclResult_t diclCommAbort(diclComm_t comm); - - diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { - return devapis::diclAllReduce(sendbuff, recvbuff, count, datatype, reduceOp, comm, stream); - } - - diclResult_t diclBroadcast(const void *sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - int root, diclComm_t comm, deviceStream_t stream) { - return devapis::diclBroadcast(sendbuff, recvbuff, count, datatype, root, comm, stream); - } - - diclResult_t diclAllGather(const void *sendbuff, void *recvbuff, size_t sendCount, at::ScalarType datatype, - diclComm_t comm, deviceStream_t stream) { - return devapis::diclAllGather(sendbuff, recvbuff, sendCount, datatype, comm, stream); - } - - diclResult_t diclReduce(const void* sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream) { - return devapis::diclReduce(sendbuff, recvbuff, count, datatype, reduceOp, root, comm, stream); - } - - diclResult_t diclReduceScatter(void *sendbuff, void *recvbuff, size_t recvCount, at::ScalarType datatype, - const ReduceOp& op, diclComm_t comm, deviceStream_t stream) { - return devapis::diclReduceScatter(sendbuff, recvbuff, recvCount, datatype, op, comm, stream); - } - - diclResult_t diclSend(void* sendbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream) { - return devapis::diclSend(sendbuff, count, datatype, peer, comm, stream); - } - - diclResult_t diclRecv(void* recvbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream) { - return devapis::diclRecv(recvbuff, count, datatype, peer, comm, stream); - } - - -} // namespace devproxy - -} // namespace dipu \ No newline at end of file +diclResult_t diclGetCommAsyncError(diclComm_t comm) { + return devapis::diclGetCommAsyncError(comm); +} + +diclResult_t diclGetUniqueId(commUniqueId *uniqueId) { + return devapis::diclGetUniqueId(uniqueId); +} + +diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, + commUniqueId uniqueId, int rank, + int localDeviceId) { + return devapis::diclCommInitRank(comm, nranks, uniqueId, rank, localDeviceId); +} + +// void diclCommInitAll(diclComm_t* comms, int ndev, const int* devlist); + +diclResult_t diclCommDestroy(diclComm_t comm) { + return devapis::diclCommDestroy(comm); +} + +// diclResult_t diclCommFinalize(diclComm_t comm); + +// diclResult_t diclCommAbort(diclComm_t comm); + +diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count, + at::ScalarType datatype, const ReduceOp &reduceOp, + diclComm_t comm, deviceStream_t stream) { + return devapis::diclAllReduce(sendbuff, recvbuff, count, datatype, reduceOp, + comm, stream); +} + 
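Each proxy here is a thin forwarder, so a vendor only has to implement the matching devapis symbol. For illustration, a sketch of how a CUDA vendor might map diclAllReduce onto NCCL; the toNcclDataType/toNcclReduceOp helpers and the diclResult_t value names are assumed, and on such a backend deviceStream_t is taken to be cudaStream_t:

#include <nccl.h>

namespace dipu {
namespace devapis {

diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count,
                           at::ScalarType datatype, const ReduceOp &reduceOp,
                           diclComm_t comm, deviceStream_t stream) {
  // Translate framework enums to NCCL enums, then enqueue on the stream.
  ncclResult_t ret = ncclAllReduce(sendbuff, recvbuff, count,
                                   toNcclDataType(datatype),  // assumed helper
                                   toNcclReduceOp(reduceOp),  // assumed helper
                                   static_cast<ncclComm_t>(comm), stream);
  return ret == ncclSuccess ? diclResult_t::DICL_SUCCESS   // value names
                            : diclResult_t::DICL_ERR_UNDEF;  // assumed
}

}  // namespace devapis
}  // namespace dipu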
+diclResult_t diclBroadcast(const void *sendbuff, void *recvbuff, size_t count, + at::ScalarType datatype, int root, diclComm_t comm, + deviceStream_t stream) { + return devapis::diclBroadcast(sendbuff, recvbuff, count, datatype, root, comm, + stream); +} + +diclResult_t diclAllGather(const void *sendbuff, void *recvbuff, + size_t sendCount, at::ScalarType datatype, + diclComm_t comm, deviceStream_t stream) { + return devapis::diclAllGather(sendbuff, recvbuff, sendCount, datatype, comm, + stream); +} + +diclResult_t diclReduce(const void *sendbuff, void *recvbuff, size_t count, + at::ScalarType datatype, const ReduceOp &reduceOp, + int root, diclComm_t comm, deviceStream_t stream) { + return devapis::diclReduce(sendbuff, recvbuff, count, datatype, reduceOp, + root, comm, stream); +} + +diclResult_t diclReduceScatter(void *sendbuff, void *recvbuff, size_t recvCount, + at::ScalarType datatype, const ReduceOp &op, + diclComm_t comm, deviceStream_t stream) { + return devapis::diclReduceScatter(sendbuff, recvbuff, recvCount, datatype, op, + comm, stream); +} + +diclResult_t diclSend(void *sendbuff, size_t count, at::ScalarType datatype, + int peer, diclComm_t comm, deviceStream_t stream) { + return devapis::diclSend(sendbuff, count, datatype, peer, comm, stream); +} + +diclResult_t diclRecv(void *recvbuff, size_t count, at::ScalarType datatype, + int peer, diclComm_t comm, deviceStream_t stream) { + return devapis::diclRecv(recvbuff, count, datatype, peer, comm, stream); +} + +} // namespace devproxy + +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/diclproxy.h b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/diclproxy.h index 01a63fdce..b5457ed90 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/diclproxy.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/diclproxy.h @@ -10,42 +10,56 @@ namespace dipu { // need enhance return status. 
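To make the call sequence behind these declarations concrete, a hedged usage sketch of a single-device-per-rank all-reduce follows. The include path and function name are ours; the out-of-band exchange of rank 0's uniqueId (e.g. through a store) is elided; deviceBuf is assumed to point at device memory:

#include "csrc_dipu/runtime/devproxy/diclproxy.h"  // assumed include path

void exampleAllReduce(int rank, int worldSize, void *deviceBuf, size_t count,
                      dipu::deviceStream_t stream) {
  dipu::commUniqueId uniqueId;
  if (rank == 0) {
    dipu::devproxy::diclGetUniqueId(&uniqueId);
  }
  // ... rank 0's uniqueId must reach every rank out of band before this ...
  dipu::diclComm_t comm = nullptr;
  dipu::devproxy::diclCommInitRank(&comm, worldSize, uniqueId, rank);
  dipu::devproxy::diclAllReduce(deviceBuf, deviceBuf, count,
                                at::ScalarType::Float, c10d::ReduceOp::SUM,
                                comm, stream);
  dipu::devproxy::syncStream(stream);  // wait for the collective to finish
  dipu::devproxy::diclCommDestroy(comm);
}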
namespace devproxy { - DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm); +DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm); - DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId); +DIPU_API diclResult_t diclGetUniqueId(commUniqueId *uniqueId); - DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId, int rank, int localDeviceId = -1); +DIPU_API diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, + commUniqueId uniqueId, int rank, + int localDeviceId = -1); - // DIPU_API void diclCommInitAll(diclComm_t* comms, int ndev, const int* devlist); +// DIPU_API void diclCommInitAll(diclComm_t* comms, int ndev, const int* +// devlist); - DIPU_API diclResult_t diclCommDestroy(diclComm_t comm); +DIPU_API diclResult_t diclCommDestroy(diclComm_t comm); - // DIPU_API diclResult_t diclCommFinalize(diclComm_t comm); +// DIPU_API diclResult_t diclCommFinalize(diclComm_t comm); - // DIPU_API diclResult_t diclCommAbort(diclComm_t comm); +// DIPU_API diclResult_t diclCommAbort(diclComm_t comm); - DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, diclComm_t comm, + deviceStream_t stream); - DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - int root, diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + int root, diclComm_t comm, + deviceStream_t stream); - DIPU_API diclResult_t diclAllGather(const void *sendbuff, void *recvbuff, size_t sendCount, at::ScalarType datatype, - diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclAllGather(const void *sendbuff, void *recvbuff, + size_t sendCount, at::ScalarType datatype, + diclComm_t comm, deviceStream_t stream); - DIPU_API diclResult_t diclReduce(const void* sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclReduce(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, int root, + diclComm_t comm, deviceStream_t stream); - DIPU_API diclResult_t diclReduceScatter(void *sendbuff, void *recvbuff, size_t recvCount, at::ScalarType datatype, - const ReduceOp& op, diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclReduceScatter(void *sendbuff, void *recvbuff, + size_t recvCount, + at::ScalarType datatype, + const ReduceOp &op, diclComm_t comm, + deviceStream_t stream); - DIPU_API diclResult_t diclSend(void* sendbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclSend(void *sendbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream); - DIPU_API diclResult_t diclRecv(void* recvbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream); +DIPU_API diclResult_t diclRecv(void *recvbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream); +} // namespace devproxy -} // namespace devproxy - -} // namespace dipu \ No newline at end of file +} // namespace dipu \ No newline at 
end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/distributed/DICLUtils.hpp b/dipu/torch_dipu/csrc_dipu/runtime/distributed/DICLUtils.hpp index 1d6b32ee0..443810564 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/distributed/DICLUtils.hpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/distributed/DICLUtils.hpp @@ -12,15 +12,15 @@ namespace dipu { // wrapper of vendor raw communicator class DICLComm { -private: + private: void initRawComm(int numRanks, int rank, commUniqueId uniqueid) { - devproxy::diclCommInitRank(&rawComm_, numRanks, uniqueid, rank, static_cast(device_.index())); + devproxy::diclCommInitRank(&rawComm_, numRanks, uniqueid, rank, + static_cast(device_.index())); } -public: - explicit DICLComm(DIPUStream& bindStream): diclStream_(bindStream), - device_(bindStream.device()) { - } + public: + explicit DICLComm(DIPUStream &bindStream) + : diclStream_(bindStream), device_(bindStream.device()) {} ~DICLComm() noexcept { // Add lock in this destructor, as aborted_ needs to be read after memory @@ -31,24 +31,24 @@ class DICLComm { rawComm_ = nullptr; } } - static std::shared_ptr create(int numRanks, int rank, commUniqueId uniqueid, DIPUStream& stream) { + static std::shared_ptr create(int numRanks, int rank, + commUniqueId uniqueid, + DIPUStream &stream) { auto comm = std::make_shared(stream); comm->initRawComm(numRanks, rank, uniqueid); return comm; } // Must not be copyable - DICLComm(const DICLComm&) = delete; - DICLComm& operator=(const DICLComm&) = delete; + DICLComm(const DICLComm &) = delete; + DICLComm &operator=(const DICLComm &) = delete; // Move constructable - DICLComm(DICLComm&& other) = delete; + DICLComm(DICLComm &&other) = delete; // Move assignable - DICLComm& operator=(DICLComm&& other) = delete; + DICLComm &operator=(DICLComm &&other) = delete; - diclComm_t rawComm() const{ - return rawComm_; - } + diclComm_t rawComm() const { return rawComm_; } void preSyncStream() { auto currStream = dipu::getCurrentDIPUStream(device_.index()); @@ -61,18 +61,17 @@ class DICLComm { // The DIPU events used to sync current stream DIPUEvent preEvent_; - // by default, copy should work in comm stream, if in other stream, use preCopyEvent_ - // to guarantee comm finish. + // by default, copy should work in comm stream, if in other stream, use + // preCopyEvent_ to guarantee comm finish. DIPUEvent preCopyEvent_; // The cached list of DIPU devices to operate on at::Device device_; - -protected: + protected: bool aborted_ = false; diclComm_t rawComm_ = nullptr; mutable std::mutex mutex_; }; -} // namespace dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.cpp b/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.cpp index b25e48f49..77439edb0 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.cpp @@ -1,15 +1,15 @@ // Copyright (c) 2023, DeepLink. #include "ProcessGroupDICL.h" +#include + #include #include -#include - +#include "csrc_dipu/profiler/profiler.h" #include #include #include -#include "csrc_dipu/profiler/profiler.h" namespace dipu { @@ -17,11 +17,12 @@ using std::pair; namespace { -// Get the list of devices from list of tensors, collective comm always use all ranks, -// so no rank prefix required in key. 
-static inline std::string getDevieceIds(const std::vector& devices) { +// Get the list of devices from list of tensors, collective comm always use all +// ranks, so no rank prefix required in key. +static inline std::string getDevieceIds( + const std::vector &devices) { std::string deviceList; - for (auto& device : devices) { + for (auto &device : devices) { if (deviceList.empty()) { deviceList = std::to_string(device.index()); } else { @@ -32,9 +33,11 @@ static inline std::string getDevieceIds(const std::vector& devices) } static inline pair mapPGRank2P2P(int myRank, int peer) { - // ProcessGroupNCCL support send/recv self, but that seems only work with ncclGroup? - TORCH_CHECK(myRank != peer, "Invalid destination rank: should not be " - "the same as rank of the current process."); + // ProcessGroupNCCL support send/recv self, but that seems only work with + // ncclGroup? + TORCH_CHECK(myRank != peer, + "Invalid destination rank: should not be " + "the same as rank of the current process."); pair p2pRanks; // self p2p rank p2pRanks.first = myRank <= peer ? 0 : 1; @@ -43,24 +46,27 @@ static inline pair mapPGRank2P2P(int myRank, int peer) { return p2pRanks; } -// Get p2p sorted ranks as key, p2p only support 1 device tensor at a time and one comm endpoint -// can bind with either device. so use rank as comm key is enough. -static inline std::string getP2PRankIds(int myRank, int peer, const std::vector& devices) { +// Get p2p sorted ranks as key, p2p only support 1 device tensor at a time and +// one comm endpoint can bind with either device. so use rank as comm key is +// enough. +static inline std::string getP2PRankIds( + int myRank, int peer, const std::vector &devices) { int lowRank = myRank < peer ? myRank : peer; int highRank = myRank < peer ? 
peer : myRank; - return std::to_string(lowRank) + ":" + std::to_string(highRank); + return std::to_string(lowRank) + ":" + std::to_string(highRank); } -static inline std::vector getDeviceList(const std::vector& tensors) { +static inline std::vector getDeviceList( + const std::vector &tensors) { std::vector res; res.reserve(tensors.size()); - for (auto& tensor : tensors) { + for (auto &tensor : tensors) { res.push_back(tensor.device()); } return res; } -static inline void syncStreams(std::vector>& comms) { +static inline void syncStreams(std::vector> &comms) { for (size_t i = 0; i < comms.size(); ++i) { comms[i]->preSyncStream(); } @@ -73,13 +79,17 @@ static inline void syncStreams(std::vector>& comms) { // ProcessGroupDICL::WorkDICL::~WorkDICL() {} // currently DICL do not support error check -bool ProcessGroupDICL::WorkDICL::isCompleted() { return finishedDICLExecutionInternal(); } +bool ProcessGroupDICL::WorkDICL::isCompleted() { + return finishedDICLExecutionInternal(); +} // currently DICL do not support error check -bool ProcessGroupDICL::WorkDICL::isSuccess() const { return finishedDICLExecutionInternal(); } +bool ProcessGroupDICL::WorkDICL::isSuccess() const { + return finishedDICLExecutionInternal(); +} bool ProcessGroupDICL::WorkDICL::finishedDICLExecutionInternal() const { - for (auto& workEvent : workEvents_) { + for (auto &workEvent : workEvents_) { if (!workEvent.query()) { return false; } @@ -96,9 +106,10 @@ void ProcessGroupDICL::WorkDICL::record() { void ProcessGroupDICL::WorkDICL::synchronize() { for (auto i = 0; i < workEvents_.size(); i++) { - auto currentStream = dipu::getCurrentDIPUStream(diclComms_[i]->device_.index()); + auto currentStream = + dipu::getCurrentDIPUStream(diclComms_[i]->device_.index()); // Block the current stream(calculate stream) on the DICL comm stream event - workEvents_[i].wait(currentStream); + workEvents_[i].wait(currentStream); } // In case of blocking, wait for the operation to complete. @@ -115,11 +126,11 @@ void ProcessGroupDICL::WorkDICL::synchronize() { } } - // Device synchronize only after we've completed timeout checks. - // only barrier() call this + // Device synchronize only after we've completed timeout checks. + // only barrier() call this if (barrier_) { // If we use the work to do barrier, we should block here - for (auto& comm : diclComms_) { + for (auto &comm : diclComms_) { DIPUGuard dipuGuard(comm->device_); devproxy::syncDevice(); } @@ -136,40 +147,39 @@ std::vector ProcessGroupDICL::WorkDICL::result() { return *outputs_; } -c10::intrusive_ptr ProcessGroupDICL::WorkDICL:: - getFuture() { +c10::intrusive_ptr +ProcessGroupDICL::WorkDICL::getFuture() { return future_; } // end WorkDICL -ProcessGroupDICL::ProcessGroupDICL(const c10::intrusive_ptr& store, +ProcessGroupDICL::ProcessGroupDICL(const c10::intrusive_ptr &store, int rank, int size) - : c10d::Backend(rank, size), store_(store) { - char* blockingWait = getenv(DICL_BLOCKING_WAIT); - try { - if (blockingWait != nullptr) { - auto val = std::stoi(blockingWait); - if (val == 1) { - // Make wait() and synchronize() a blocking call. - blockingWait_ = true; - } else if (val != 0) { - throw std::runtime_error( - "Invalid value for environment variable: " + - std::string(DICL_BLOCKING_WAIT)); - } + : c10d::Backend(rank, size), store_(store) { + char *blockingWait = getenv(DICL_BLOCKING_WAIT); + try { + if (blockingWait != nullptr) { + auto val = std::stoi(blockingWait); + if (val == 1) { + // Make wait() and synchronize() a blocking call. 
+ blockingWait_ = true; + } else if (val != 0) { + throw std::runtime_error("Invalid value for environment variable: " + + std::string(DICL_BLOCKING_WAIT)); } - } catch (std::exception& e) { - throw std::runtime_error( - "Invalid value for environment variable: " + - std::string(DICL_BLOCKING_WAIT)); } + } catch (std::exception &e) { + throw std::runtime_error("Invalid value for environment variable: " + + std::string(DICL_BLOCKING_WAIT)); } +} ProcessGroupDICL::~ProcessGroupDICL() {} -void ProcessGroupDICL::broadcastUniqueID(commUniqueId* uniqueId, - const std::string& storeKey, int commRank) { +void ProcessGroupDICL::broadcastUniqueID(commUniqueId *uniqueId, + const std::string &storeKey, + int commRank) { // For collective operations: // For every DICL communicator that we create we need to broadcast // a unique ID from rank 0 to all other ranks. This broadcast is @@ -185,13 +195,13 @@ void ProcessGroupDICL::broadcastUniqueID(commUniqueId* uniqueId, // of sequence number for p2p communications. if (commRank == 0) { - auto vec = std::vector( - reinterpret_cast(uniqueId), - reinterpret_cast(uniqueId) + devapis::DICL_UNIQUE_ID_BYTES_SIZE); + auto vec = std::vector(reinterpret_cast(uniqueId), + reinterpret_cast(uniqueId) + + devapis::DICL_UNIQUE_ID_BYTES_SIZE); store_->set(storeKey, vec); } else { auto vec = store_->get(storeKey); - if (vec.size() != devapis::DICL_UNIQUE_ID_BYTES_SIZE) { + if (vec.size() != devapis::DICL_UNIQUE_ID_BYTES_SIZE) { throw std::runtime_error( "Unexpected DICL unique ID length received " "from the store"); @@ -200,9 +210,10 @@ void ProcessGroupDICL::broadcastUniqueID(commUniqueId* uniqueId, } } -std::vector>& ProcessGroupDICL::getDICLComms(const std::string& localCommsKey, - const std::vector& devices, int commsRank, OpType opType) { - // Sanity check +std::vector> &ProcessGroupDICL::getDICLComms( + const std::string &localCommsKey, const std::vector &devices, + int commsRank, OpType opType) { + // Sanity check if (localCommsKey.empty()) { throw std::runtime_error( "Not able to create/get the DICL Communicator since " @@ -225,17 +236,21 @@ std::vector>& ProcessGroupDICL::getDICLComms(const std if (commsRank == 0) { devproxy::diclGetUniqueId(&diclID); } - std::string bcastKey = isP2POp(opType, false) ? localCommsKey : std::to_string(diclCommCounter_++); + std::string bcastKey = isP2POp(opType, false) + ? localCommsKey + : std::to_string(diclCommCounter_++); broadcastUniqueID(&diclID, bcastKey, commsRank); OptionalDIPUGuard dipuGuard; - for (int i=0; i < devSize; i++) { - int deviceCommRank = isP2POp(opType, false) ? commsRank : getRank() * devSize + i; + for (int i = 0; i < devSize; i++) { + int deviceCommRank = + isP2POp(opType, false) ? commsRank : getRank() * devSize + i; dipuGuard.reset_device(devices[i]); // use pool stream, not current stream auto commStream = getDIPUStreamFromPool(devices[i].index()); - diclComms[i] = DICLComm::create(deviceWorldSize, deviceCommRank, diclID, commStream); + diclComms[i] = + DICLComm::create(deviceWorldSize, deviceCommRank, diclID, commStream); } // Hold the lock before modifying the cache. @@ -249,11 +264,12 @@ namespace { // Flatten each list in `tensor_lists' for a gather or scatter operation, and // ensure compatibility with the corresponding tensor in `other'. 
-static inline std::vector<at::Tensor> flatten_for_scatter_gather(std::vector<std::vector<at::Tensor>>& tensor_lists, - std::vector<at::Tensor>& other, size_t world_size) { +static inline std::vector<at::Tensor> flatten_for_scatter_gather( + std::vector<std::vector<at::Tensor>> &tensor_lists, + std::vector<at::Tensor> &other, size_t world_size) { if (tensor_lists.size() != other.size()) { throw std::runtime_error( - "Tensor list operands to scatter/gather must have the same length"); + "Tensor list operands to scatter/gather must have the same length"); } const auto num_devices = tensor_lists.size(); std::vector<at::Tensor> flattened; @@ -274,7 +290,7 @@ static inline std::vector<at::Tensor> flatten_for_scatter_gather(std::vector -static inline void copyInCommStream(std::shared_ptr<DICLComm>& diclComm, const Dest& dest, - const Src& src, int nums) { +static inline void copyInCommStream(std::shared_ptr<DICLComm> &diclComm, + const Dest &dest, const Src &src, + int nums) { auto diclStream = diclComm->diclStream_; DIPUStreamGuard guard(diclStream.unwrap()); for (size_t j = 0; j < nums; ++j) { @@ -301,37 +318,42 @@ static inline void copyInCommStream(std::shared_ptr<DICLComm>& diclComm, const D } } -static inline void copyInCurrentStream(std::shared_ptr<DICLComm>& diclComm, const std::vector<at::Tensor>& dest, - const at::Tensor& src) { +static inline void copyInCurrentStream(std::shared_ptr<DICLComm> &diclComm, + const std::vector<at::Tensor> &dest, + const at::Tensor &src) { auto diclStream = diclComm->diclStream_; auto currStream = dipu::getCurrentDIPUStream(diclStream.device_index()); diclComm->preCopyEvent_.record(diclStream); - // copy after comm finish, loss concurrency,assume all dest finish in one comm op + // copy after comm finishes; this loses concurrency, and assumes all dest + // copies finish in one comm op diclComm->preCopyEvent_.wait(currStream); for (size_t j = 0; j < dest.size(); ++j) { dest[j].copy_(src[j], true); } } -} // annoy namespace - +} // namespace -// Check that all `tensors', different device may need extend this func to do device specific check -void ProcessGroupDICL::checkDeviceTensors(const std::vector<at::Tensor>& tensors) { +// Check all `tensors' here; different devices may need to extend this func to +// do device-specific checks +void ProcessGroupDICL::checkDeviceTensors( + const std::vector<at::Tensor> &tensors) { if (tensors.size() == 0) { TORCH_CHECK(false, "Tensor list must be nonempty"); } if (tensors.size() > static_cast<size_t>(devproxy::getDeviceCount())) { - TORCH_CHECK(false, + TORCH_CHECK( + false, "Tensor list mustn't be larger than the number of available DIPUs"); } - const auto& first = tensors.front(); + const auto &first = tensors.front(); // Set for ensuring that tensors are on separate devices. 
std::unordered_set usedDevices; usedDevices.reserve(tensors.size()); - for (auto tensor: tensors) { - if (!dipu::isDeviceTensor(tensor) || !tensor.is_non_overlapping_and_dense()) { + for (auto tensor : tensors) { + if (!dipu::isDeviceTensor(tensor) || + !tensor.is_non_overlapping_and_dense()) { TORCH_CHECK(false, "Tensors must be DIPU and non-overlapping and dense"); } if (tensor.scalar_type() != first.scalar_type()) { @@ -351,12 +373,15 @@ void ProcessGroupDICL::checkDeviceTensors(const std::vector& tensors } template -c10::intrusive_ptr ProcessGroupDICL::doComm(std::vector& inputs, - std::vector& outputs, std::vector>& diclComms, - const std::vector& devices, Fn fn, PreProcess pre, PostProcess post, OpType opType) { - // First let DICL streams wait for input tensors allocation streams +c10::intrusive_ptr ProcessGroupDICL::doComm( + std::vector &inputs, std::vector &outputs, + std::vector> &diclComms, + const std::vector &devices, Fn fn, PreProcess pre, + PostProcess post, OpType opType) { + // First let DICL streams wait for input tensors allocation streams syncStreams(diclComms); - auto work = c10::make_intrusive(diclComms, blockingWait_, opTimeout_); + auto work = c10::make_intrusive( + diclComms, blockingWait_, opTimeout_); OptionalDIPUGuard dipuGuard; pre(diclComms); @@ -365,23 +390,26 @@ c10::intrusive_ptr ProcessGroupDICL::doComm(std::vector& input dipuGuard.reset_device(diclComms[i]->device_); // need add adapter to handle int64/double! camb not support double - fn(inputs[i], outputs[i], diclComms[i]->rawComm(), diclComms[i]->diclStream_); + fn(inputs[i], outputs[i], diclComms[i]->rawComm(), + diclComms[i]->diclStream_); dipu::recordStream(inputs[i], diclComms[i]->diclStream_); - if (inputs[i].storage().data_ptr().get() != outputs[i].storage().data_ptr().get()) { + if (inputs[i].storage().data_ptr().get() != + outputs[i].storage().data_ptr().get()) { dipu::recordStream(outputs[i], diclComms[i]->diclStream_); } // mock comm with just copy, used in standalone test. // DIPUStreamGuard guard(diclComms[i]->diclStream_.unwrap()); - // outputs[i].copy_(inputs[i], false); + // outputs[i].copy_(inputs[i], false); } post(diclComms); work->record(); work->outputs_ = std::make_shared>(outputs); - // todo:: dipu need support multistream guard & remove work->workEvents_(future already has events ). + // todo:: dipu need support multistream guard & remove + // work->workEvents_(future already has events ). 
{ DIPUStreamGuard streamGuard(diclComms[0]->diclStream_); @@ -392,18 +420,20 @@ c10::intrusive_ptr<Work> ProcessGroupDICL::doComm(std::vector<at::Tensor>& input return work; } -// std::function< diclResult_t(at::Tensor&, at::Tensor&, DiclComm, DIPUStream&) > -// enhance: need change template params to lamada, make collective() func overridable by sub class +// std::function< diclResult_t(at::Tensor&, at::Tensor&, DiclComm, DIPUStream&) +// > enhance: need to change template params to lambdas, make collective() func +// overridable by subclasses template <typename Fn, typename PreProcess, typename PostProcess> c10::intrusive_ptr<Work> ProcessGroupDICL::collective( - std::vector<at::Tensor>& inputs, std::vector<at::Tensor>& outputs, Fn fn, + std::vector<at::Tensor> &inputs, std::vector<at::Tensor> &outputs, Fn fn, PreProcess pre, PostProcess post, OpType opType) { const auto devices = getDeviceList(inputs); TORCH_CHECK(devices.size() == 1, - "dipu support one device per process only, nccl multidevices use ncclGroupStart/End, ", - "but we cannot support group based comm now."); - + "dipu support one device per process only, nccl multidevices use " + "ncclGroupStart/End, ", + "but we cannot support group based comm now."); + const auto localCommsKey = getDevieceIds(devices); // collective use PG.rank_ as comsBaseRank @@ -412,165 +442,149 @@ c10::intrusive_ptr<Work> ProcessGroupDICL::collective( } template <typename Fn> -c10::intrusive_ptr<Work> ProcessGroupDICL::collective(std::vector<at::Tensor>& inputs, - std::vector<at::Tensor>& outputs, Fn fn, OpType opType) { - return collective(inputs, outputs, fn, [](std::vector<std::shared_ptr<DICLComm>>&) {}, - [](std::vector<std::shared_ptr<DICLComm>>&) {}, opType); +c10::intrusive_ptr<Work> ProcessGroupDICL::collective( + std::vector<at::Tensor> &inputs, std::vector<at::Tensor> &outputs, Fn fn, + OpType opType) { + return collective( + inputs, outputs, fn, [](std::vector<std::shared_ptr<DICLComm>> &) {}, + [](std::vector<std::shared_ptr<DICLComm>> &) {}, opType); } template <typename Fn, typename PreProcess, typename PostProcess> c10::intrusive_ptr<Work> ProcessGroupDICL::pointToPoint( - std::vector<at::Tensor>& inputs, std::vector<at::Tensor>& outputs, int peerRank, - Fn fn, PreProcess pre, PostProcess post, OpType opType) { - + std::vector<at::Tensor> &inputs, std::vector<at::Tensor> &outputs, + int peerRank, Fn fn, PreProcess pre, PostProcess post, OpType opType) { const auto devices = getDeviceList(inputs); auto p2pPair = mapPGRank2P2P(rank_, peerRank); // pytorch nccl has same problem but not check - TORCH_CHECK(devices.size() == 1, "DICL P2P comm does not support multi-device tensor input"); - - // pytorch nccl always create new comm when new send/recv pair appear, here we follow this behavior. - // However, It's also works well by using the default collective commms to do pair comm, which cost lower resource - // in very big group size but may cause different pairs block in same stream. + TORCH_CHECK(devices.size() == 1, + "DICL P2P comm does not support multi-device tensor input"); + + // pytorch nccl always creates a new comm when a new send/recv pair appears; + // we follow this behavior. However, it also works to use the default + // collective comms to do pair comm, which costs fewer resources at very big + // group sizes but may cause different pairs to block on the same stream. 
const auto localCommsKey = getP2PRankIds(rank_, peerRank, devices); // p2p use self p2pRank as commsRank, one commsRank corresponds to one device - auto& diclComms = getDICLComms(localCommsKey, devices, p2pPair.first, opType); + auto &diclComms = getDICLComms(localCommsKey, devices, p2pPair.first, opType); return doComm(inputs, outputs, diclComms, devices, fn, pre, post, opType); } c10::intrusive_ptr ProcessGroupDICL::allreduce( - std::vector& tensors, const AllreduceOptions& opts) { + std::vector &tensors, const AllreduceOptions &opts) { // inplace in = out, every rank use both in&out. checkDeviceTensors(tensors); return collective( - tensors, - tensors, - [&](at::Tensor& input, - at::Tensor& output, - diclComm_t comm, - DIPUStream& stream) { + tensors, tensors, + [&](at::Tensor &input, at::Tensor &output, diclComm_t comm, + DIPUStream &stream) { RECORD_FUNCTION("DiclAllreduce", std::vector({input})); - profile::RecordBlockCreator _("DiclAllreduce", profile::ExtraRecordInfo(), stream.rawstream(), stream.id()); + profile::RecordBlockCreator _("DiclAllreduce", + profile::ExtraRecordInfo(), + stream.rawstream(), stream.id()); return devproxy::diclAllReduce( - input.data_ptr(), - output.data_ptr(), - (size_t)input.numel(), - input.scalar_type(), - opts.reduceOp, - comm, - stream.rawstream()); + input.data_ptr(), output.data_ptr(), (size_t)input.numel(), + input.scalar_type(), opts.reduceOp, comm, stream.rawstream()); }, OpType::ALLREDUCE); } c10::intrusive_ptr ProcessGroupDICL::broadcast( - std::vector& tensors, const BroadcastOptions& opts) { + std::vector &tensors, const BroadcastOptions &opts) { checkDeviceTensors(tensors); // inplace in = out, only rootRank use in. return collective( - tensors, - tensors, - [&](at::Tensor& input, - at::Tensor& output, - diclComm_t comm, - DIPUStream& stream) { - RECORD_FUNCTION("DiclBroadcast", std::vector({input})); - profile::RecordBlockCreator _("DiclBroadcast", profile::ExtraRecordInfo(), stream.rawstream(), stream.id()); - // only one root (root rank root device) - const auto root = opts.rootRank * tensors.size() + opts.rootTensor; - return devproxy::diclBroadcast( - input.data_ptr(), - input.data_ptr(), - (size_t)input.numel(), - input.scalar_type(), - root, - comm, - stream.rawstream()); - }, - OpType::BROADCAST); + tensors, tensors, + [&](at::Tensor &input, at::Tensor &output, diclComm_t comm, + DIPUStream &stream) { + RECORD_FUNCTION("DiclBroadcast", std::vector({input})); + profile::RecordBlockCreator _("DiclBroadcast", + profile::ExtraRecordInfo(), + stream.rawstream(), stream.id()); + // only one root (root rank root device) + const auto root = opts.rootRank * tensors.size() + opts.rootTensor; + return devproxy::diclBroadcast( + input.data_ptr(), input.data_ptr(), (size_t)input.numel(), + input.scalar_type(), root, comm, stream.rawstream()); + }, + OpType::BROADCAST); } c10::intrusive_ptr ProcessGroupDICL::reduce( - std::vector& tensors, const ReduceOptions& opts) { + std::vector &tensors, const ReduceOptions &opts) { // inplace in = out, only rootRank use out. 
checkDeviceTensors(tensors); auto tensor = tensors.back(); int dev_in_group = 0; return collective( - tensors, - tensors, - [&](at::Tensor& input, - at::Tensor& output, - diclComm_t comm, - DIPUStream& stream) { + tensors, tensors, + [&](at::Tensor &input, at::Tensor &output, diclComm_t comm, + DIPUStream &stream) { RECORD_FUNCTION("DiclReduce", std::vector({input})); - profile::RecordBlockCreator _("DiclReduce", profile::ExtraRecordInfo(), stream.rawstream(), stream.id()); + profile::RecordBlockCreator _("DiclReduce", profile::ExtraRecordInfo(), + stream.rawstream(), stream.id()); const auto root = opts.rootRank * tensors.size() + opts.rootTensor; return devproxy::diclReduce( - input.data_ptr(), - output.data_ptr(), - (size_t)input.numel(), - input.scalar_type(), - opts.reduceOp, - root, - comm, - stream.rawstream()); + input.data_ptr(), output.data_ptr(), (size_t)input.numel(), + input.scalar_type(), opts.reduceOp, root, comm, stream.rawstream()); }, OpType::REDUCE); } c10::intrusive_ptr ProcessGroupDICL::gather( - std::vector>& outputTensors, - std::vector& inputTensors, - const GatherOptions& opts) { + std::vector> &outputTensors, + std::vector &inputTensors, const GatherOptions &opts) { TORCH_CHECK(false, "ProcessGroupDICL does not support gather now"); } c10::intrusive_ptr ProcessGroupDICL::allgather( - std::vector>& outputs, - std::vector& inputs, const AllgatherOptions& opts) { + std::vector> &outputs, + std::vector &inputs, const AllgatherOptions &opts) { checkDeviceTensors(inputs); // output = input * ranks, no inplace. every ranks use both in&out. auto outputFlattened = flatten_for_scatter_gather(outputs, inputs, this->size_); - auto work = collective(inputs, outputFlattened, - [&](at::Tensor& input, - at::Tensor& output, - diclComm_t comm, - DIPUStream& stream) { - RECORD_FUNCTION("DiclAllgather", std::vector({input})); - profile::RecordBlockCreator _("DiclAllgather", profile::ExtraRecordInfo(), stream.rawstream(), stream.id()); - - return devproxy::diclAllGather( - input.data_ptr(), - output.data_ptr(), - (size_t)input.numel(), - input.scalar_type(), - comm, - stream.rawstream()); - }, - [&](std::vector>& diclComms) {}, - [&](std::vector>& diclComms) { - // Copy the flattened output tensors to the outputs. - for (size_t i = 0; i < outputs.size(); ++i) { - // warnning & todo:: copy in comm stream, - // record dest tensor outputs, because src tensor outputFlattened already recorded in collective. - copyInCommStream(diclComms[i], outputs[i], outputFlattened[i], outputs[i].size()); - // copyInCurrentStream(diclComms[i], outputs[i], outputFlattened[i]); - } - }, - OpType::ALLGATHER); - return work; + auto work = collective( + inputs, outputFlattened, + [&](at::Tensor &input, at::Tensor &output, diclComm_t comm, + DIPUStream &stream) { + RECORD_FUNCTION("DiclAllgather", std::vector({input})); + profile::RecordBlockCreator _("DiclAllgather", + profile::ExtraRecordInfo(), + stream.rawstream(), stream.id()); + + return devproxy::diclAllGather( + input.data_ptr(), output.data_ptr(), (size_t)input.numel(), + input.scalar_type(), comm, stream.rawstream()); + }, + [&](std::vector> &diclComms) {}, + [&](std::vector> &diclComms) { + // Copy the flattened output tensors to the outputs. + for (size_t i = 0; i < outputs.size(); ++i) { + // warnning & todo:: copy in comm stream, + // record dest tensor outputs, because src tensor outputFlattened + // already recorded in collective. 
+ copyInCommStream(diclComms[i], outputs[i], outputFlattened[i], + outputs[i].size()); + // copyInCurrentStream(diclComms[i], outputs[i], outputFlattened[i]); + } + }, + OpType::ALLGATHER); + return work; } c10::intrusive_ptr ProcessGroupDICL::_allgather_base( - at::Tensor& outputTensor, at::Tensor& inputTensor, const AllgatherOptions& opts) { + at::Tensor &outputTensor, at::Tensor &inputTensor, + const AllgatherOptions &opts) { // output = input * ranks. - TORCH_CHECK(inputTensor.dtype() == outputTensor.dtype(), "output tensor must have the same type as input tensor"); - TORCH_CHECK(inputTensor.numel() * this->size_ == outputTensor.numel(), + TORCH_CHECK(inputTensor.dtype() == outputTensor.dtype(), + "output tensor must have the same type as input tensor"); + TORCH_CHECK( + inputTensor.numel() * this->size_ == outputTensor.numel(), "output tensor size must be equal to world_size times input tensor size"); // just a wrapper to fit the collective interface @@ -578,63 +592,55 @@ c10::intrusive_ptr ProcessGroupDICL::_allgather_base( auto outputs = std::vector{outputTensor}; return collective( - inputs, - outputs, - [&](at::Tensor& input, - at::Tensor& output, - diclComm_t comm, - DIPUStream& stream) { - RECORD_FUNCTION("DiclAllgather_base", std::vector({input})); - profile::RecordBlockCreator _("DiclAllgather_base", profile::ExtraRecordInfo(), stream.rawstream(), stream.id()); + inputs, outputs, + [&](at::Tensor &input, at::Tensor &output, diclComm_t comm, + DIPUStream &stream) { + RECORD_FUNCTION("DiclAllgather_base", + std::vector({input})); + profile::RecordBlockCreator _("DiclAllgather_base", + profile::ExtraRecordInfo(), + stream.rawstream(), stream.id()); return devproxy::diclAllGather( - input.data_ptr(), - output.data_ptr(), - (size_t)input.numel(), - input.scalar_type(), - comm, - stream.rawstream()); + input.data_ptr(), output.data_ptr(), (size_t)input.numel(), + input.scalar_type(), comm, stream.rawstream()); }, OpType::_ALLGATHER_BASE); } c10::intrusive_ptr ProcessGroupDICL::_reduce_scatter_base( - at::Tensor& outputTensor, - at::Tensor& inputTensor, - const ReduceScatterOptions& opts) { + at::Tensor &outputTensor, at::Tensor &inputTensor, + const ReduceScatterOptions &opts) { // input = output * ranks, no inplace, output = reduced(input)[rank] - TORCH_CHECK(inputTensor.dtype() == outputTensor.dtype(), "output tensor must have the same type as input tensor"); - TORCH_CHECK(inputTensor.numel()== this->size_ * outputTensor.numel(), - "input tensor must be the same size as output size times world size") + TORCH_CHECK(inputTensor.dtype() == outputTensor.dtype(), + "output tensor must have the same type as input tensor"); + TORCH_CHECK( + inputTensor.numel() == this->size_ * outputTensor.numel(), + "input tensor must be the same size as output size times world size") auto inputs = std::vector{inputTensor}; auto outputs = std::vector{outputTensor}; return collective( - inputs, - outputs, - [&](at::Tensor& input, - at::Tensor& output, - diclComm_t comm, - DIPUStream& stream) { - RECORD_FUNCTION("DiclReduceScatter_base", std::vector({input})); - profile::RecordBlockCreator _("DiclReduceScatter_base", profile::ExtraRecordInfo(), stream.rawstream(), stream.id()); + inputs, outputs, + [&](at::Tensor &input, at::Tensor &output, diclComm_t comm, + DIPUStream &stream) { + RECORD_FUNCTION("DiclReduceScatter_base", + std::vector({input})); + profile::RecordBlockCreator _("DiclReduceScatter_base", + profile::ExtraRecordInfo(), + stream.rawstream(), stream.id()); return devproxy::diclReduceScatter( 
- input.data_ptr(), - output.data_ptr(), - (size_t)output.numel(), - input.scalar_type(), - opts.reduceOp, - comm, - stream.rawstream()); + input.data_ptr(), output.data_ptr(), (size_t)output.numel(), + input.scalar_type(), opts.reduceOp, comm, stream.rawstream()); }, OpType::_REDUCE_SCATTER_BASE); } c10::intrusive_ptr ProcessGroupDICL::reduce_scatter( - std::vector& outputs, - std::vector>& inputs, - const ReduceScatterOptions& opts) { + std::vector &outputs, + std::vector> &inputs, + const ReduceScatterOptions &opts) { // input = output * ranks, no inplace, output = reduced(input)[rank] checkDeviceTensors(outputs); auto inputFlattened = @@ -642,95 +648,75 @@ c10::intrusive_ptr ProcessGroupDICL::reduce_scatter( checkDeviceTensors(inputFlattened); auto work = collective( - inputFlattened, - outputs, - [&](at::Tensor& input, - at::Tensor& output, - diclComm_t comm, - DIPUStream& stream) { - RECORD_FUNCTION("DiclReduceScatter", std::vector({input})); - profile::RecordBlockCreator _("DiclReduceScatter", profile::ExtraRecordInfo(), stream.rawstream(), stream.id()); - return devproxy::diclReduceScatter( - input.data_ptr(), - output.data_ptr(), - (size_t)output.numel(), - input.scalar_type(), - opts.reduceOp, - comm, - stream.rawstream()); - }, - [&](std::vector>& diclComms) { - // Copy the inputs[i].size nums raw tensor intto flattened - for (size_t i = 0; i < inputs.size(); ++i) { - // record src tensor inputs, because dest tensor inputFlattened already recorded in collective - copyInCommStream(diclComms[i], inputFlattened[i], inputs[i], inputs[0].size()); - } - }, - [&](std::vector>& diclComms) {}, - OpType::REDUCE_SCATTER); + inputFlattened, outputs, + [&](at::Tensor &input, at::Tensor &output, diclComm_t comm, + DIPUStream &stream) { + RECORD_FUNCTION("DiclReduceScatter", std::vector({input})); + profile::RecordBlockCreator _("DiclReduceScatter", + profile::ExtraRecordInfo(), + stream.rawstream(), stream.id()); + return devproxy::diclReduceScatter( + input.data_ptr(), output.data_ptr(), (size_t)output.numel(), + input.scalar_type(), opts.reduceOp, comm, stream.rawstream()); + }, + [&](std::vector> &diclComms) { + // Copy the inputs[i].size nums raw tensor intto flattened + for (size_t i = 0; i < inputs.size(); ++i) { + // record src tensor inputs, because dest tensor inputFlattened + // already recorded in collective + copyInCommStream(diclComms[i], inputFlattened[i], inputs[i], + inputs[0].size()); + } + }, + [&](std::vector> &diclComms) {}, + OpType::REDUCE_SCATTER); return work; } c10::intrusive_ptr ProcessGroupDICL::send( - std::vector& tensors, int dstRank, int tag) { + std::vector &tensors, int dstRank, int tag) { checkDeviceTensors(tensors); auto p2pPair = mapPGRank2P2P(rank_, dstRank); return pointToPoint( - tensors, - tensors, - dstRank, - [&](at::Tensor& input, - at::Tensor& output, - diclComm_t comm, - DIPUStream& stream) { - RECORD_FUNCTION("diclSend", std::vector({input})); - profile::RecordBlockCreator _("diclSend", profile::ExtraRecordInfo(), stream.rawstream(), stream.id()); - return devproxy::diclSend( - input.data_ptr(), - (size_t)input.numel(), - input.scalar_type(), - p2pPair.second, - comm, - stream.rawstream()); - }, - [](std::vector>&) {}, - [](std::vector>&) {}, - OpType::SEND); + tensors, tensors, dstRank, + [&](at::Tensor &input, at::Tensor &output, diclComm_t comm, + DIPUStream &stream) { + RECORD_FUNCTION("diclSend", std::vector({input})); + profile::RecordBlockCreator _("diclSend", profile::ExtraRecordInfo(), + stream.rawstream(), stream.id()); + return 
devproxy::diclSend(input.data_ptr(), (size_t)input.numel(), + input.scalar_type(), p2pPair.second, comm, + stream.rawstream()); + }, + [](std::vector> &) {}, + [](std::vector> &) {}, OpType::SEND); } c10::intrusive_ptr ProcessGroupDICL::recv( - std::vector& tensors, int srcRank, int tag) { + std::vector &tensors, int srcRank, int tag) { checkDeviceTensors(tensors); auto p2pPair = mapPGRank2P2P(rank_, srcRank); return pointToPoint( - tensors, - tensors, - srcRank, - [&](at::Tensor& input, - at::Tensor& output, - diclComm_t comm, - DIPUStream& stream) { - RECORD_FUNCTION("diclRecv", std::vector({input})); - profile::RecordBlockCreator _("diclRecv", profile::ExtraRecordInfo(), stream.rawstream(), stream.id()); - return devproxy::diclRecv( - input.data_ptr(), - (size_t)input.numel(), - input.scalar_type(), - p2pPair.second, - comm, - stream.rawstream()); - }, - [](std::vector>&) {}, - [](std::vector>&) {}, - OpType::RECV); -} - -c10::intrusive_ptr ProcessGroupDICL::barrier( - const BarrierOptions& opts) { + tensors, tensors, srcRank, + [&](at::Tensor &input, at::Tensor &output, diclComm_t comm, + DIPUStream &stream) { + RECORD_FUNCTION("diclRecv", std::vector({input})); + profile::RecordBlockCreator _("diclRecv", profile::ExtraRecordInfo(), + stream.rawstream(), stream.id()); + return devproxy::diclRecv(input.data_ptr(), (size_t)input.numel(), + input.scalar_type(), p2pPair.second, comm, + stream.rawstream()); + }, + [](std::vector> &) {}, + [](std::vector> &) {}, OpType::RECV); +} + +c10::intrusive_ptr ProcessGroupDICL::barrier(const BarrierOptions &opts) { std::vector devices; if (usedDeviceIdxs_.empty()) { auto numDIPUs = devproxy::getDeviceCount(); - int16_t deviceIdx = static_cast(rank_ % std::max(static_cast(numDIPUs), 1)); + int16_t deviceIdx = + static_cast(rank_ % std::max(static_cast(numDIPUs), 1)); devices.push_back(at::Device(dipu::DIPU_DEVICE_TYPE, deviceIdx)); } else { for (auto usedDeviceIdx : usedDeviceIdxs_) { @@ -742,27 +728,26 @@ c10::intrusive_ptr ProcessGroupDICL::barrier( barrierTensors.reserve(devices.size()); OptionalDIPUGuard dipuGuard; - for (auto& device : devices) { + for (auto &device : devices) { dipuGuard.reset_device(device); - barrierTensors.push_back(at::empty({1}, + barrierTensors.push_back(at::empty( + {1}, at::TensorOptions().device(dipu::DIPU_DEVICE_TYPE).dtype(at::kFloat))); } auto work = allreduce(barrierTensors); - auto diclWork = dynamic_cast(work.get()); + auto diclWork = dynamic_cast(work.get()); diclWork->barrier_ = true; return work; } c10::intrusive_ptr createProcessGroupDICL( - const c10::intrusive_ptr<::c10d::Store> &store, - int rank, - int size, - const std::chrono::milliseconds& timeout) { + const c10::intrusive_ptr<::c10d::Store> &store, int rank, int size, + const std::chrono::milliseconds &timeout) { auto options = c10::make_intrusive(); options->timeout = timeout; return c10::make_intrusive(store, rank, size); } -} // namespace c10d +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.h b/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.h index 811f389ae..94d69977d 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.h @@ -1,39 +1,39 @@ // Copyright (c) 2023, DeepLink. 
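A hedged usage sketch of the createProcessGroupDICL factory above, assuming an existing c10d store supplied by the deployment; the helper name makeDiclBackend and the 30-second timeout are placeholders, not part of this patch:

c10::intrusive_ptr<c10d::Backend> makeDiclBackend(
    const c10::intrusive_ptr<c10d::Store> &store, int rank, int worldSize) {
  auto pg = dipu::createProcessGroupDICL(store, rank, worldSize,
                                         std::chrono::milliseconds(30000));
  // Sanity check: barrier() allreduces a 1-element device tensor per rank.
  pg->barrier()->wait();
  return pg;
}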
#pragma once -#include -#include #include +#include +#include #include #include #include -#include #include #include -#include #include +#include #include + #include "./DICLUtils.hpp" namespace dipu { -using c10d::Backend; -using c10d::Store; -using c10d::Work; -using c10d::OpType; -using c10d::BroadcastOptions; -using c10d::AllreduceOptions; -using c10d::ReduceOptions; using c10d::AllgatherOptions; +using c10d::AllreduceOptions; +using c10d::Backend; using c10d::BarrierOptions; +using c10d::BroadcastOptions; using c10d::GatherOptions; +using c10d::OpType; +using c10d::ReduceOptions; using c10d::ReduceScatterOptions; +using c10d::Store; +using c10d::Work; // Environment variable which controls whether or not wait() is blocking or // non-blocking. -constexpr const char* DICL_BLOCKING_WAIT = "DICL_BLOCKING_WAIT"; +constexpr const char *DICL_BLOCKING_WAIT = "DICL_BLOCKING_WAIT"; constexpr int64_t diclSyncBusyWaitMillis = 30; // ProcessGroupDICL implements DICLbindings for c10d. @@ -72,26 +72,29 @@ constexpr int64_t diclSyncBusyWaitMillis = 30; // std::shared_ptr work = pg.allreduce(tensors); // // // At this point, DICL kernel has already by queued successfully -// // Now, let current stream wait for the DICL to finish, originally this function is +// // Now, let current stream wait for the DICL to finish, originally this +// function is // // async operation as well, but currently DIPU is sync. // // work->wait() // // // Now continue on other work in the current stream. -// not support gather/ all _coalesced func func now, +// not support gather/ all _coalesced func func now, // If needed in the future, we will add class DIPU_API ProcessGroupDICL : public Backend { public: class WorkDICL : public Work { - public: + public: // Constructor takes a list of dicl comms - WorkDICL(std::vector>& comms, bool blockingWait, - std::chrono::milliseconds opTimeout): diclComms_(comms), - blockingWait_(blockingWait), opTimeout_(opTimeout), - workStartTime_(std::chrono::steady_clock::now()) { + WorkDICL(std::vector> &comms, bool blockingWait, + std::chrono::milliseconds opTimeout) + : diclComms_(comms), + blockingWait_(blockingWait), + opTimeout_(opTimeout), + workStartTime_(std::chrono::steady_clock::now()) { workEvents_.resize(diclComms_.size()); - } // NOLINT + } // NOLINT virtual ~WorkDICL() {} @@ -105,7 +108,8 @@ class DIPU_API ProcessGroupDICL : public Backend { void record(); // Same as calling synchronize() for DICL work. - bool wait(std::chrono::milliseconds timeout = kBackendDefaultTimeout) override; + bool wait( + std::chrono::milliseconds timeout = kBackendDefaultTimeout) override; // Let current stream wait on the completing of the DICL work // Throws on exceptions @@ -115,14 +119,14 @@ class DIPU_API ProcessGroupDICL : public Backend { c10::intrusive_ptr getFuture() override; - protected: + protected: // Store a reference to DICL collective's outputs, used by result and to // give a more descriptive message when representing the Work as a string. std::shared_ptr> outputs_; // The future returned by getFuture. c10::intrusive_ptr future_; - + // The DICL communicators used for this work item. std::vector> diclComms_; @@ -143,8 +147,8 @@ class DIPU_API ProcessGroupDICL : public Backend { // Time point representing when the work started. 
std::chrono::time_point workStartTime_; - private: - friend class ProcessGroupDICL; + private: + friend class ProcessGroupDICL; }; struct DIPU_API Options : Backend::Options { @@ -166,81 +170,93 @@ class DIPU_API ProcessGroupDICL : public Backend { // on-demand when a collective runs. If another collective is executed later, // against a different set of devices, the process group creates another DICL // communicator. These DICL communicators are cached and reused if possible. - ProcessGroupDICL(const c10::intrusive_ptr& store, int rank, int size); + ProcessGroupDICL(const c10::intrusive_ptr &store, int rank, int size); virtual ~ProcessGroupDICL(); - c10::intrusive_ptr broadcast(std::vector& tensors, - const BroadcastOptions& opts = BroadcastOptions()) override; + c10::intrusive_ptr broadcast( + std::vector &tensors, + const BroadcastOptions &opts = BroadcastOptions()) override; - c10::intrusive_ptr allreduce(std::vector& tensors, - const AllreduceOptions& opts = AllreduceOptions()) override; + c10::intrusive_ptr allreduce( + std::vector &tensors, + const AllreduceOptions &opts = AllreduceOptions()) override; - c10::intrusive_ptr reduce(std::vector& tensors, - const ReduceOptions& opts = ReduceOptions()) override; + c10::intrusive_ptr reduce( + std::vector &tensors, + const ReduceOptions &opts = ReduceOptions()) override; - c10::intrusive_ptr gather(std::vector>& outputTensors, - std::vector& inputTensors, - const GatherOptions& opts = GatherOptions()) override; + c10::intrusive_ptr gather( + std::vector> &outputTensors, + std::vector &inputTensors, + const GatherOptions &opts = GatherOptions()) override; - c10::intrusive_ptr allgather(std::vector>& output_tensors, - std::vector& input_tensors, const AllgatherOptions& opts = AllgatherOptions()) override; + c10::intrusive_ptr allgather( + std::vector> &output_tensors, + std::vector &input_tensors, + const AllgatherOptions &opts = AllgatherOptions()) override; - c10::intrusive_ptr _allgather_base(at::Tensor& output_tensor, - at::Tensor& input_tensor, const AllgatherOptions& opts = AllgatherOptions()) override; + c10::intrusive_ptr _allgather_base( + at::Tensor &output_tensor, at::Tensor &input_tensor, + const AllgatherOptions &opts = AllgatherOptions()) override; - c10::intrusive_ptr reduce_scatter(std::vector& outputs, - std::vector>& inputs, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - - c10::intrusive_ptr _reduce_scatter_base(at::Tensor& outputs, at::Tensor& inputs, - const ReduceScatterOptions& opts = ReduceScatterOptions()) override; + c10::intrusive_ptr reduce_scatter( + std::vector &outputs, + std::vector> &inputs, + const ReduceScatterOptions &opts = ReduceScatterOptions()) override; - c10::intrusive_ptr send(std::vector& tensors, - int dstRank, int tag) override; + c10::intrusive_ptr _reduce_scatter_base( + at::Tensor &outputs, at::Tensor &inputs, + const ReduceScatterOptions &opts = ReduceScatterOptions()) override; - c10::intrusive_ptr recv(std::vector& tensors, - int srcRank, int tag) override; + c10::intrusive_ptr send(std::vector &tensors, int dstRank, + int tag) override; - c10::intrusive_ptr barrier(const BarrierOptions& opts = BarrierOptions()) override; + c10::intrusive_ptr recv(std::vector &tensors, int srcRank, + int tag) override; - c10::intrusive_ptr getStore() { - return this->store_; - } + c10::intrusive_ptr barrier( + const BarrierOptions &opts = BarrierOptions()) override; + + c10::intrusive_ptr getStore() { return this->store_; } protected: // different device may need extend this func 
to do device specific check - virtual void checkDeviceTensors(const std::vector& tensors); + virtual void checkDeviceTensors(const std::vector &tensors); // Helper that broadcasts DICL clique ID to all ranks through the store - virtual void broadcastUniqueID(commUniqueId* uniqueId, - const std::string& storeKey, int commRank); + virtual void broadcastUniqueID(commUniqueId *uniqueId, + const std::string &storeKey, int commRank); // Helper that either looks up the cached DICL communicators or creates // a new set of DICL communicators as a cache entry - virtual std::vector>& getDICLComms( - const std::string& devicesKey, const std::vector& devices, - int commsRank, OpType opType); + virtual std::vector> &getDICLComms( + const std::string &devicesKey, const std::vector &devices, + int commsRank, OpType opType); template - c10::intrusive_ptr collective( - std::vector& input, std::vector& output, Fn fn, OpType opType); + c10::intrusive_ptr collective(std::vector &input, + std::vector &output, Fn fn, + OpType opType); template - c10::intrusive_ptr collective( - std::vector& inputs, std::vector& outputs, Fn fn, - PreProcess pre, PostProcess post, OpType opType); + c10::intrusive_ptr collective(std::vector &inputs, + std::vector &outputs, Fn fn, + PreProcess pre, PostProcess post, + OpType opType); template - c10::intrusive_ptr pointToPoint( - std::vector& inputs, std::vector& outputs, int peer, - Fn fn, PreProcess pre, PostProcess post, OpType opType); + c10::intrusive_ptr pointToPoint(std::vector &inputs, + std::vector &outputs, + int peer, Fn fn, PreProcess pre, + PostProcess post, OpType opType); template c10::intrusive_ptr doComm( - std::vector& inputs, std::vector& outputs, - std::vector>& diclComms, const std::vector& devices, - Fn fn, PreProcess pre, PostProcess post, OpType opType); + std::vector &inputs, std::vector &outputs, + std::vector> &diclComms, + const std::vector &devices, Fn fn, PreProcess pre, + PostProcess post, OpType opType); // The store is used to broadcast the DICL unique ID of rank 0. c10::intrusive_ptr store_; @@ -257,7 +273,7 @@ class DIPU_API ProcessGroupDICL : public Backend { // // e.g. If the process group op only uses device 0, then the value of // the used device string stored (value of the hashmap) would be "0". - // + // // If the process group op uses device 0 - 7 and the each tensor of the // input tensor list is on device, 0, 1, 2, 3, 4, 5, 6, 7 separately, // then the value of the used device string (key) stored would be @@ -269,7 +285,8 @@ class DIPU_API ProcessGroupDICL : public Backend { // "0,4,5,6,7,1,2,3" // // Note that the order of the device for the tensor list matters. - std::unordered_map>> devDICLCommsMap_; + std::unordered_map>> + devDICLCommsMap_; // Mutex to guard devDICLCommMap_. 
std::mutex devDICLCommMapLock_; @@ -285,9 +302,7 @@ class DIPU_API ProcessGroupDICL : public Backend { }; c10::intrusive_ptr createProcessGroupDICL( - const c10::intrusive_ptr<::c10d::Store> &store, - int rank, - int size, - const std::chrono::milliseconds& timeout) ; + const c10::intrusive_ptr<::c10d::Store> &store, int rank, int size, + const std::chrono::milliseconds &timeout); } // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/runtime/distributed/c10dOps.cpp b/dipu/torch_dipu/csrc_dipu/runtime/distributed/c10dOps.cpp index 5bbf3dc70..06875a33c 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/distributed/c10dOps.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/distributed/c10dOps.cpp @@ -16,8 +16,7 @@ namespace ops { c10::intrusive_ptr send_dipu( at::TensorList tensors, - const c10::intrusive_ptr& process_group, - int64_t dstRank, + const c10::intrusive_ptr &process_group, int64_t dstRank, int64_t tag) { auto tensor_vec = tensors.vec(); return process_group->getBackend(dipu::DIPU_DEVICE_TYPE) @@ -26,8 +25,7 @@ c10::intrusive_ptr send_dipu( c10::intrusive_ptr recv_dipu_( at::TensorList tensors, - const c10::intrusive_ptr& process_group, - int64_t srcRank, + const c10::intrusive_ptr &process_group, int64_t srcRank, int64_t tag) { auto tensor_vec = tensors.vec(); return process_group->getBackend(dipu::DIPU_DEVICE_TYPE) @@ -36,35 +34,26 @@ c10::intrusive_ptr recv_dipu_( c10::intrusive_ptr reduce_dipu_( at::TensorList tensors, - const c10::intrusive_ptr& process_group, - const c10::intrusive_ptr& reduce_op, - int64_t root_rank, - int64_t root_tensor, - int64_t timeout) { + const c10::intrusive_ptr &process_group, + const c10::intrusive_ptr &reduce_op, int64_t root_rank, + int64_t root_tensor, int64_t timeout) { auto tensor_vec = tensors.vec(); return process_group->getBackend(dipu::DIPU_DEVICE_TYPE) - ->reduce( - tensor_vec, - ReduceOptions{ - *reduce_op.get(), - root_rank, - root_tensor, - std::chrono::milliseconds(timeout)}); + ->reduce(tensor_vec, + ReduceOptions{*reduce_op.get(), root_rank, root_tensor, + std::chrono::milliseconds(timeout)}); } std::tuple, c10::intrusive_ptr> broadcast_dipu_( at::TensorList tensors, - const c10::intrusive_ptr& process_group, - int64_t root_rank, - int64_t root_tensor, - int64_t timeout) { + const c10::intrusive_ptr &process_group, int64_t root_rank, + int64_t root_tensor, int64_t timeout) { auto tensor_vec = tensors.vec(); auto work = process_group->getBackend(dipu::DIPU_DEVICE_TYPE) - ->broadcast( - tensor_vec, - BroadcastOptions{ - root_rank, root_tensor, std::chrono::milliseconds(timeout)}); + ->broadcast(tensor_vec, + BroadcastOptions{root_rank, root_tensor, + std::chrono::milliseconds(timeout)}); return std::tuple, c10::intrusive_ptr>( std::move(tensor_vec), work); @@ -72,16 +61,14 @@ std::tuple, c10::intrusive_ptr> broadcast_dipu_( std::tuple, c10::intrusive_ptr> allreduce_dipu_( at::TensorList tensors, - const c10::intrusive_ptr& process_group, - const c10::intrusive_ptr& reduce_op, - int64_t timeout) { + const c10::intrusive_ptr &process_group, + const c10::intrusive_ptr &reduce_op, int64_t timeout) { auto tensor_vec = tensors.vec(); auto work = process_group->getBackend(dipu::DIPU_DEVICE_TYPE) - ->allreduce( - tensor_vec, - AllreduceOptions{ - *reduce_op.get(), std::chrono::milliseconds(timeout)}); + ->allreduce(tensor_vec, + AllreduceOptions{*reduce_op.get(), + std::chrono::milliseconds(timeout)}); // Return input tensors as output tensors to make inplace allreduce look like // a functional API, so that make_fx can 
correctly build the dependencies in @@ -90,98 +77,91 @@ std::tuple, c10::intrusive_ptr> allreduce_dipu_( std::move(tensor_vec), work); } -std::tuple>, c10::intrusive_ptr> allgather_dipu_( - const std::vector>& output_tensors, - at::TensorList input_tensors, - const c10::intrusive_ptr& process_group, - int64_t timeout) { +std::tuple>, c10::intrusive_ptr> +allgather_dipu_(const std::vector> &output_tensors, + at::TensorList input_tensors, + const c10::intrusive_ptr &process_group, + int64_t timeout) { auto input_tensors_vec = input_tensors.vec(); - auto work = process_group->getBackend(dipu::DIPU_DEVICE_TYPE) - ->allgather( - const_cast>&>(output_tensors), - input_tensors_vec, - AllgatherOptions{std::chrono::milliseconds(timeout)}); + auto work = + process_group->getBackend(dipu::DIPU_DEVICE_TYPE) + ->allgather(const_cast> &>( + output_tensors), + input_tensors_vec, + AllgatherOptions{std::chrono::milliseconds(timeout)}); // Copy output tensors (not storage) so that this can be used in a functional // manner - return std::tuple>, c10::intrusive_ptr>( - output_tensors, work); + return std::tuple>, + c10::intrusive_ptr>(output_tensors, work); } -// refer to distributed/c10d/Ops.cpp +// refer to distributed/c10d/Ops.cpp std::tuple> _allgather_base_dipu_( - at::Tensor& output_tensor, - at::Tensor& input_tensor, - const c10::intrusive_ptr& process_group) { + at::Tensor &output_tensor, at::Tensor &input_tensor, + const c10::intrusive_ptr &process_group) { auto work = process_group->getBackend(dipu::DIPU_DEVICE_TYPE) ->_allgather_base(output_tensor, input_tensor); return std::tuple>(output_tensor, work); } -std::tuple, c10::intrusive_ptr> reduce_scatter_dipu_( - const at::TensorList& output_tensors, - const std::vector>& input_tensors, - const c10::intrusive_ptr& process_group, - const c10::intrusive_ptr& reduce_op, - int64_t timeout) { +std::tuple, c10::intrusive_ptr> +reduce_scatter_dipu_(const at::TensorList &output_tensors, + const std::vector> &input_tensors, + const c10::intrusive_ptr &process_group, + const c10::intrusive_ptr &reduce_op, + int64_t timeout) { auto output_tensors_vec = output_tensors.vec(); auto work = process_group->getBackend(dipu::DIPU_DEVICE_TYPE) ->reduce_scatter( output_tensors_vec, - const_cast>&>(input_tensors), - ReduceScatterOptions{ - *reduce_op.get(), std::chrono::milliseconds(timeout)}); + const_cast> &>(input_tensors), + ReduceScatterOptions{*reduce_op.get(), + std::chrono::milliseconds(timeout)}); return std::tuple, c10::intrusive_ptr>( output_tensors_vec, work); } std::tuple> _reduce_scatter_base_dipu_( - at::Tensor& output_tensor, - at::Tensor& input_tensor, - const c10::intrusive_ptr& process_group, - const c10::intrusive_ptr& reduce_op, - int64_t timeout) { - auto work = - process_group->getBackend(dipu::DIPU_DEVICE_TYPE) - ->_reduce_scatter_base( - output_tensor, - input_tensor, - ReduceScatterOptions{ - *reduce_op.get(), std::chrono::milliseconds(timeout)}); + at::Tensor &output_tensor, at::Tensor &input_tensor, + const c10::intrusive_ptr &process_group, + const c10::intrusive_ptr &reduce_op, int64_t timeout) { + auto work = process_group->getBackend(dipu::DIPU_DEVICE_TYPE) + ->_reduce_scatter_base( + output_tensor, input_tensor, + ReduceScatterOptions{*reduce_op.get(), + std::chrono::milliseconds(timeout)}); return std::tuple>(output_tensor, work); } - c10::intrusive_ptr gather_dipu_( - const std::vector>& output_tensors, - const at::TensorList& input_tensors, - const c10::intrusive_ptr& process_group, - int64_t root_rank, + const std::vector> 
&output_tensors, + const at::TensorList &input_tensors, + const c10::intrusive_ptr<ProcessGroup> &process_group, int64_t root_rank, int64_t timeout) { auto input_tensors_vec = input_tensors.vec(); return process_group->getBackend(dipu::DIPU_DEVICE_TYPE) ->gather( - const_cast<std::vector<std::vector<at::Tensor>>&>(output_tensors), + const_cast<std::vector<std::vector<at::Tensor>> &>(output_tensors), input_tensors_vec, GatherOptions{root_rank, std::chrono::milliseconds(timeout)}); } std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> scatter_dipu_( - const at::TensorList& output_tensors, - const std::vector<std::vector<at::Tensor>>& input_tensors, - const c10::intrusive_ptr<ProcessGroup>& process_group, - int64_t root_rank, + const at::TensorList &output_tensors, + const std::vector<std::vector<at::Tensor>> &input_tensors, + const c10::intrusive_ptr<ProcessGroup> &process_group, int64_t root_rank, int64_t timeout) { auto output_tensors_vec = output_tensors.vec(); auto work = process_group->getBackend(dipu::DIPU_DEVICE_TYPE) ->scatter( output_tensors_vec, - const_cast<std::vector<std::vector<at::Tensor>>&>(input_tensors), + const_cast<std::vector<std::vector<at::Tensor>> &>(input_tensors), ScatterOptions{root_rank, std::chrono::milliseconds(timeout)}); return std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>( @@ -190,9 +170,8 @@ std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> scatter_dipu_( c10::intrusive_ptr<Work> barrier_dipu( at::Tensor /* unused */, - const c10::intrusive_ptr<ProcessGroup>& process_group, - const std::vector<int64_t>& device_ids, - int64_t timeout) { + const c10::intrusive_ptr<ProcessGroup> &process_group, + const std::vector<int64_t> &device_ids, int64_t timeout) { return process_group->getBackend(dipu::DIPU_DEVICE_TYPE) ->barrier(BarrierOptions{device_ids, std::chrono::milliseconds(timeout)}); } @@ -214,9 +193,9 @@ TORCH_LIBRARY_IMPL(c10d, DIPU_DEVICE_TYPE_MACRO, m) { // not implement m.impl("gather_", gather_dipu_); - // unregistered op, we expect it can fallback to cpu, but it not work now (hard to sync). + // unregistered op: we expect it to fall back to CPU, but that does not + // work yet (hard to sync). } - -} // namespace ops -} // namespace c10d +} // namespace ops +} // namespace c10d diff --git a/dipu/torch_dipu/csrc_dipu/runtime/rthelper.h b/dipu/torch_dipu/csrc_dipu/runtime/rthelper.h index c681ba039..5c5bba95c 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/rthelper.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/rthelper.h @@ -1,11 +1,11 @@ // Copyright (c) 2023, DeepLink.
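For context, every wrapper in c10dOps.cpp follows the same shape: materialize the non-owning at::TensorList into a mutable std::vector<at::Tensor>, build the options struct with a std::chrono::milliseconds timeout, and forward to the backend registered for the DIPU device type. A minimal sketch of that pattern, assuming the c10d::Backend API used throughout this file (the helper name is illustrative, not part of the patch):

c10::intrusive_ptr<c10d::Work> call_allreduce_like(
    at::TensorList tensors, c10d::Backend &backend, int64_t timeout_ms) {
  // TensorList is a non-owning view; Backend methods want a mutable vector.
  // Copying tensor handles is cheap and storage stays shared, so the
  // collective still updates the caller's tensors in place.
  std::vector<at::Tensor> tensor_vec = tensors.vec();
  return backend.allreduce(
      tensor_vec,
      c10d::AllreduceOptions{c10d::ReduceOp::SUM,
                             std::chrono::milliseconds(timeout_ms)});
}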
#include -#include -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include +#include #include diff --git a/dipu/torch_dipu/csrc_dipu/stub.cpp b/dipu/torch_dipu/csrc_dipu/stub.cpp index 4cd386a2e..15cd19c5e 100644 --- a/dipu/torch_dipu/csrc_dipu/stub.cpp +++ b/dipu/torch_dipu/csrc_dipu/stub.cpp @@ -3,8 +3,8 @@ static std::vector<PyMethodDef> methods; -static void AddPyMethodDefs(std::vector<PyMethodDef>& vector, PyMethodDef* methods) -{ +static void AddPyMethodDefs(std::vector<PyMethodDef> &vector, + PyMethodDef *methods) { if (!vector.empty()) { // remove nullptr terminator vector.pop_back(); @@ -18,25 +18,15 @@ static void AddPyMethodDefs(std::vector<PyMethodDef>& vector, PyMethodDef* metho } } -extern "C" PyObject* initModule() { - +extern "C" PyObject *initModule() { AddPyMethodDefs(methods, dipu::exportTensorFunctions()); static struct PyModuleDef torchdipu_module = { - PyModuleDef_HEAD_INIT, - "torch_dipu._C", - nullptr, - -1, - methods.data() - }; - PyObject* module = PyModule_Create(&torchdipu_module); + PyModuleDef_HEAD_INIT, "torch_dipu._C", nullptr, -1, methods.data()}; + PyObject *module = PyModule_Create(&torchdipu_module); dipu::exportDIPURuntime(module); dipu::exportProfiler(module); return module; } -PyMODINIT_FUNC PyInit__C(void){ - return initModule(); -} - - +PyMODINIT_FUNC PyInit__C(void) { return initModule(); } diff --git a/dipu/torch_dipu/csrc_dipu/utils/Log.h b/dipu/torch_dipu/csrc_dipu/utils/Log.h index cff170813..142d2ea5f 100644 --- a/dipu/torch_dipu/csrc_dipu/utils/Log.h +++ b/dipu/torch_dipu/csrc_dipu/utils/Log.h @@ -1,17 +1,19 @@ // Copyright (c) 2023, DeepLink. #pragma once -#include #include +#include #define CONCAT_(prefix, suffix) prefix##suffix #define CONCAT(prefix, suffix) CONCAT_(prefix, suffix) #define MAKE_UNIQUE_VARIABLE_NAME(prefix) CONCAT(prefix##_, __LINE__) #define DIPU_LOG std::cout << __FILE__ << ":" << __LINE__ << " " -#define DIPU_LOG_ONCE \ - static auto& __attribute__((unused)) MAKE_UNIQUE_VARIABLE_NAME(__func__) = DIPU_LOG +#define DIPU_LOG_ONCE \ + static auto &__attribute__((unused)) MAKE_UNIQUE_VARIABLE_NAME(__func__) = \ + DIPU_LOG #define DIPU_LOG_ERROR std::cerr << __FILE__ << ":" << __LINE__ << " " -#define DIPU_LOG_ERROR_ONCE \ - static auto& __attribute__((unused)) MAKE_UNIQUE_VARIABLE_NAME(__func__) = DIPU_LOG_ERROR +#define DIPU_LOG_ERROR_ONCE \ + static auto &__attribute__((unused)) MAKE_UNIQUE_VARIABLE_NAME(__func__) = \ + DIPU_LOG_ERROR diff --git a/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp b/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp index 752156a17..578d88106 100644 --- a/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp +++ b/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp @@ -13,25 +13,22 @@ bool isDeviceTensor(const at::Tensor &tensor) { } static bool in_bad_fork = false; -bool is_in_bad_fork() { - return in_bad_fork; -} +bool is_in_bad_fork() { return in_bad_fork; } #ifndef WIN32 // Called in the forked child if device has already been initialized -static void forked_child() { - in_bad_fork = true; -} +static void forked_child() { in_bad_fork = true; } #endif // Should be called before the first device call. -// Note: This is distinct from initExtension because a stub device implementation -// has some working functions (e.g. device_count) but cannot fully initialize. +// Note: This is distinct from initExtension because a stub device +// implementation has some working functions (e.g. device_count) but cannot +// fully initialize.
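For context, a minimal sketch of the call pattern the fork guard below supports, built only on the poison_fork()/is_in_bad_fork() pair declared in helpfunc.hpp (the caller is hypothetical, not part of the patch):

void ensure_usable_after_fork() {  // hypothetical caller
  dipu::poison_fork();             // safe to call repeatedly: std::call_once
  if (dipu::is_in_bad_fork()) {
    // The pthread_atfork child handler flipped in_bad_fork in this process:
    // a device context inherited across fork() must not be reused here.
    throw std::runtime_error(
        "device was initialized before fork(); re-initialize in the child");
  }
}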
void poison_fork() { #ifndef WIN32 static std::once_flag flag; - std::call_once(flag, []{ pthread_atfork(nullptr, nullptr, forked_child); }); + std::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); }); #endif } -} //end dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/utils/helpfunc.hpp b/dipu/torch_dipu/csrc_dipu/utils/helpfunc.hpp index eb4674e2f..2bd77cd53 100644 --- a/dipu/torch_dipu/csrc_dipu/utils/helpfunc.hpp +++ b/dipu/torch_dipu/csrc_dipu/utils/helpfunc.hpp @@ -6,14 +6,20 @@ using dipu::devapis::VendorDeviceType; namespace dipu { -constexpr const char* VendorTypeToStr(VendorDeviceType t) noexcept { +constexpr const char *VendorTypeToStr(VendorDeviceType t) noexcept { switch (t) { - case VendorDeviceType::MLU: return "MLU"; - case VendorDeviceType::CUDA: return "CUDA"; - case VendorDeviceType::NPU: return "NPU"; - case VendorDeviceType::GCU: return "GCU"; - case VendorDeviceType::SUPA: return "SUPA"; - case VendorDeviceType::DROPLET: return "DROPLET"; + case VendorDeviceType::MLU: + return "MLU"; + case VendorDeviceType::CUDA: + return "CUDA"; + case VendorDeviceType::NPU: + return "NPU"; + case VendorDeviceType::GCU: + return "GCU"; + case VendorDeviceType::SUPA: + return "SUPA"; + case VendorDeviceType::DROPLET: + return "DROPLET"; } return "null"; } @@ -23,4 +29,4 @@ DIPU_API bool isDeviceTensor(const at::Tensor &tensor); DIPU_API bool is_in_bad_fork(); void poison_fork(); -} // end dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/vendor/ascend/AscendGeneratorImpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/ascend/AscendGeneratorImpl.cpp index 646a877ca..903389dcf 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/ascend/AscendGeneratorImpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/ascend/AscendGeneratorImpl.cpp @@ -1,10 +1,10 @@ // Copyright (c) 2023, DeepLink. 
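For context, VendorTypeToStr above in helpfunc.hpp is constexpr and total over VendorDeviceType, so the name lookup is usable both at compile time and in logging; a small usage sketch (both call sites are illustrative, not part of the patch):

// Compile time: the string literals returned by the switch are constant
// expressions, so their characters can be inspected in a static_assert.
static_assert(dipu::VendorTypeToStr(VendorDeviceType::CUDA)[0] == 'C',
              "vendor name resolved at compile time");

// Run time: DIPU_LOG is the file/line-prefixed stream macro from utils/Log.h.
inline void log_vendor(VendorDeviceType t) {  // hypothetical helper
  DIPU_LOG << "vendor: " << dipu::VendorTypeToStr(t) << std::endl;
}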
-#include #include +#include -#include #include #include +#include namespace dipu { @@ -13,25 +13,31 @@ static const size_t offset_size = sizeof(int64_t); static const size_t total_size = seed_size + offset_size; class NPUGeneratorImpl : public dipu::DIPUGeneratorImpl { -protected: + protected: mutable std::once_flag init_state_flag; -public: - NPUGeneratorImpl(at::DeviceIndex device_index): dipu::DIPUGeneratorImpl(device_index) { - } - void set_state(const c10::TensorImpl& state) override { + public: + NPUGeneratorImpl(at::DeviceIndex device_index) + : dipu::DIPUGeneratorImpl(device_index) {} + + void set_state(const c10::TensorImpl &state) override { at::detail::check_rng_state(state); auto state_size = state.numel(); - TORCH_CHECK(state_size == total_size || state_size == total_size - offset_size, "RNG state is wrong size"); + TORCH_CHECK( + state_size == total_size || state_size == total_size - offset_size, + "RNG state is wrong size"); - at::Tensor state_tmp(state.shallow_copy_and_detach(state.version_counter(), true)); + at::Tensor state_tmp( + state.shallow_copy_and_detach(state.version_counter(), true)); state_ = state_tmp; state_need_reset_ = false; } void update_state() const override { if (state_need_reset_) { - state_ = at::detail::empty_cpu({(int64_t)total_size}, c10::ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); + state_ = at::detail::empty_cpu({(int64_t)total_size}, + c10::ScalarType::Byte, c10::nullopt, + c10::nullopt, c10::nullopt, c10::nullopt); auto rng_state = state_.data_ptr(); uint64_t seed = this->current_seed(); int64_t offset = 0; @@ -46,4 +52,4 @@ const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) { return at::make_generator(device_index); } -} // namespace torch_dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/vendor/ascend/basecommimpl.hpp b/dipu/torch_dipu/csrc_dipu/vendor/ascend/basecommimpl.hpp index 8840236b8..2b01a5c2f 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/ascend/basecommimpl.hpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/ascend/basecommimpl.hpp @@ -2,6 +2,7 @@ #include #include + #include #include @@ -27,25 +28,25 @@ struct Map { // HCCL ReduceOp mapping std::map hcclOp = { - {ReduceOp::MIN, HCCL_REDUCE_MIN}, - {ReduceOp::MAX, HCCL_REDUCE_MAX}, - {ReduceOp::SUM, HCCL_REDUCE_SUM}, - {ReduceOp::PRODUCT, HCCL_REDUCE_PROD}, + {ReduceOp::MIN, HCCL_REDUCE_MIN}, + {ReduceOp::MAX, HCCL_REDUCE_MAX}, + {ReduceOp::SUM, HCCL_REDUCE_SUM}, + {ReduceOp::PRODUCT, HCCL_REDUCE_PROD}, }; -bool isPinnedPtr(const void *p) -{ - TORCH_CHECK(false, "isPinnedPtr not implemented for ascend.\n"); +bool isPinnedPtr(const void *p) { + TORCH_CHECK(false, "isPinnedPtr not implemented for ascend.\n"); return false; } -#define HCCL_THROW(cmd) \ - do { \ - TORCH_CHECK(cmd == HCCL_SUCCESS, "HCCL error in: " + \ - std::string(__FILE__) + ":" + std::to_string(__LINE__) + \ - ".\n" + "And see details in Ascend logs.\n" + \ - aclGetRecentErrMsg()); \ +#define HCCL_THROW(cmd) \ + do { \ + TORCH_CHECK(cmd == HCCL_SUCCESS, \ + "HCCL error in: " + std::string(__FILE__) + ":" + \ + std::to_string(__LINE__) + ".\n" + \ + "And see details in Ascend logs.\n" + \ + aclGetRecentErrMsg()); \ } while (0) -} // ns devapis -} // ns dipu \ No newline at end of file +} // namespace devapis +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/vendor/ascend/communicatorimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/ascend/communicatorimpl.cpp index 
ef7129571..546d3a2e5 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/ascend/communicatorimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/ascend/communicatorimpl.cpp @@ -4,23 +4,22 @@ namespace dipu { namespace devapis { // HCCL DataType mapping -static constexpr std::array, 9> hcclDataTypes{ - { - {at::kByte, HCCL_DATA_TYPE_UINT8}, - {at::kChar, HCCL_DATA_TYPE_INT8}, - {at::kShort, HCCL_DATA_TYPE_INT16}, - {at::kInt, HCCL_DATA_TYPE_INT32}, - {at::kLong, HCCL_DATA_TYPE_INT64}, - {at::kHalf, HCCL_DATA_TYPE_FP16}, - {at::kFloat, HCCL_DATA_TYPE_FP32}, - {at::kDouble, HCCL_DATA_TYPE_FP64}, - {at::kBool, HCCL_DATA_TYPE_UINT8}, - } -}; +static constexpr std::array, 9> + hcclDataTypes{{ + {at::kByte, HCCL_DATA_TYPE_UINT8}, + {at::kChar, HCCL_DATA_TYPE_INT8}, + {at::kShort, HCCL_DATA_TYPE_INT16}, + {at::kInt, HCCL_DATA_TYPE_INT32}, + {at::kLong, HCCL_DATA_TYPE_INT64}, + {at::kHalf, HCCL_DATA_TYPE_FP16}, + {at::kFloat, HCCL_DATA_TYPE_FP32}, + {at::kDouble, HCCL_DATA_TYPE_FP64}, + {at::kBool, HCCL_DATA_TYPE_UINT8}, + }}; HcclDataType getHcclDataType(const at::ScalarType type) { static constexpr auto map = - Map{{hcclDataTypes}}; + Map{{hcclDataTypes}}; return map.at(type); } @@ -31,13 +30,14 @@ DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { TORCH_CHECK(false, "ascend Not implement diclGetCommAsyncError"); } -DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId) { +DIPU_API diclResult_t diclGetUniqueId(commUniqueId *uniqueId) { HCCL_THROW(HcclGetRootInfo(uniqueId)); return DICL_SUCCESS; } -DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId, - int rank, int localDeviceId) { +DIPU_API diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, + commUniqueId uniqueId, int rank, + int localDeviceId) { HCCL_THROW(HcclCommInitRootInfo(nranks, &uniqueId, rank, comm)); return DICL_SUCCESS; } @@ -47,58 +47,69 @@ DIPU_API diclResult_t diclCommDestroy(diclComm_t comm) { return DICL_SUCCESS; } -DIPU_API diclResult_t diclAllReduce(const void *sendBuff, void *recvBuff, size_t count, - at::ScalarType dataType, const ReduceOp& reduceOp, - diclComm_t comm, deviceStream_t stream) { +DIPU_API diclResult_t diclAllReduce(const void *sendBuff, void *recvBuff, + size_t count, at::ScalarType dataType, + const ReduceOp &reduceOp, diclComm_t comm, + deviceStream_t stream) { HCCL_THROW(HcclAllReduce(const_cast(sendBuff), recvBuff, count, - getHcclDataType(dataType), hcclOp[reduceOp], comm, stream)); + getHcclDataType(dataType), hcclOp[reduceOp], comm, + stream)); return DICL_SUCCESS; } -DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, size_t count, - at::ScalarType dataType, diclComm_t comm, - deviceStream_t stream) { - HCCL_THROW(HcclAllGather(const_cast(sendBuf), recvBuf, count, +DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, + size_t count, at::ScalarType dataType, + diclComm_t comm, deviceStream_t stream) { + HCCL_THROW(HcclAllGather(const_cast(sendBuf), recvBuf, count, getHcclDataType(dataType), comm, stream)); return DICL_SUCCESS; } -DIPU_API diclResult_t diclReduce(const void* sendBuf, void* recvBuf, size_t count, - at::ScalarType dataType, const ReduceOp& reduceOp, - int root, diclComm_t comm, deviceStream_t stream) { - - HCCL_THROW(HcclReduce(const_cast(sendBuf), recvBuf, count, getHcclDataType(dataType), - hcclOp[reduceOp], root, comm, stream)); +DIPU_API diclResult_t diclReduce(const void *sendBuf, void *recvBuf, + size_t count, at::ScalarType dataType, + const ReduceOp &reduceOp, int root, + diclComm_t comm, 
deviceStream_t stream) { + HCCL_THROW(HcclReduce(const_cast(sendBuf), recvBuf, count, + getHcclDataType(dataType), hcclOp[reduceOp], root, comm, + stream)); return DICL_SUCCESS; } -DIPU_API diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, size_t recvCount, - at::ScalarType dataType, const ReduceOp& op, - diclComm_t comm, deviceStream_t stream) { - HCCL_THROW(HcclReduceScatter(sendBuf, recvBuf, recvCount, getHcclDataType(dataType), - hcclOp[op], comm, stream)); +DIPU_API diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, + size_t recvCount, + at::ScalarType dataType, + const ReduceOp &op, diclComm_t comm, + deviceStream_t stream) { + HCCL_THROW(HcclReduceScatter(sendBuf, recvBuf, recvCount, + getHcclDataType(dataType), hcclOp[op], comm, + stream)); return DICL_SUCCESS; } -DIPU_API diclResult_t diclSend(void* sendBuf, size_t count, at::ScalarType dataType, int peer, +DIPU_API diclResult_t diclSend(void *sendBuf, size_t count, + at::ScalarType dataType, int peer, diclComm_t comm, deviceStream_t stream) { - HCCL_THROW(HcclSend(sendBuf, count, getHcclDataType(dataType), peer, comm, stream)); + HCCL_THROW( + HcclSend(sendBuf, count, getHcclDataType(dataType), peer, comm, stream)); return DICL_SUCCESS; } -DIPU_API diclResult_t diclRecv(void* recvBuf, size_t count, at::ScalarType dataType, int peer, +DIPU_API diclResult_t diclRecv(void *recvBuf, size_t count, + at::ScalarType dataType, int peer, diclComm_t comm, deviceStream_t stream) { - HCCL_THROW(HcclRecv(recvBuf, count, getHcclDataType(dataType), peer, comm, stream)); + HCCL_THROW( + HcclRecv(recvBuf, count, getHcclDataType(dataType), peer, comm, stream)); return DICL_SUCCESS; } -DIPU_API diclResult_t diclBroadcast(const void *sendBuf, void* recvBuf, size_t count, - at::ScalarType dataType, int root, diclComm_t comm, +DIPU_API diclResult_t diclBroadcast(const void *sendBuf, void *recvBuf, + size_t count, at::ScalarType dataType, + int root, diclComm_t comm, deviceStream_t stream) { - HCCL_THROW(HcclBroadcast(const_cast(sendBuf), count, getHcclDataType(dataType), - root, comm, stream)); + HCCL_THROW(HcclBroadcast(const_cast(sendBuf), count, + getHcclDataType(dataType), root, comm, stream)); return DICL_SUCCESS; } -} // end namespace devapis -} // end namespace dipu \ No newline at end of file +} // end namespace devapis +} // end namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp index 7efb7ea18..e31cf2a07 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp @@ -3,23 +3,18 @@ #include #include -#include #include +#include -namespace dipu -{ +namespace dipu { DIPU_API devapis::VendorDeviceType VENDOR_TYPE = devapis::VendorDeviceType::NPU; namespace devapis { -void initializeVendor() { - -} - -void finalizeVendor() { +void initializeVendor() {} -} +void finalizeVendor() {} // ===================== // Device class related @@ -27,7 +22,7 @@ void finalizeVendor() { using ascend_deviceId = int32_t; thread_local bool setDevFlag = false; -static int initValue = [](){ +static int initValue = []() { DIPU_CALLACLRT(aclInit(nullptr)); DIPU_CALLACLRT(aclrtSetDevice(0)); setDevFlag = true; @@ -44,7 +39,7 @@ deviceId_t current_device() { return static_cast(devId_); } DIPUDeviceProperties getDeviceProperties(int32_t device_index) { - const char* device_name; + const char *device_name; size_t device_free; size_t device_total; device_name = 
aclrtGetSocName(); @@ -71,13 +66,9 @@ void setDevice(deviceId_t devId) { setDevFlag = true; } -void resetDevice(deviceId_t devId) { - DIPU_CALLACLRT(::aclrtResetDevice(devId)) -} +void resetDevice(deviceId_t devId) { DIPU_CALLACLRT(::aclrtResetDevice(devId)) } -void syncDevice() { - DIPU_CALLACLRT(::aclrtSynchronizeDevice()) -} +void syncDevice() { DIPU_CALLACLRT(::aclrtSynchronizeDevice()) } int getDeviceCount() { unsigned int num = -1; @@ -129,66 +120,74 @@ void freeDevice(void *p) { } // (synchronous) copy from device to a device -void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, deviceId_t srcDevId, const void *src) { +void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src) { syncDevice(); - DIPU_CALLACLRT(::aclrtMemcpy(dst, nbytes, src, nbytes, ACL_MEMCPY_DEVICE_TO_DEVICE)); + DIPU_CALLACLRT( + ::aclrtMemcpy(dst, nbytes, src, nbytes, ACL_MEMCPY_DEVICE_TO_DEVICE)); } // (synchronous) copy from host to a device void memCopyH2D(size_t nbytes, void *dst, const void *src) { syncDevice(); - DIPU_CALLACLRT(::aclrtMemcpy(dst, nbytes, src, nbytes, ACL_MEMCPY_HOST_TO_DEVICE)); + DIPU_CALLACLRT( + ::aclrtMemcpy(dst, nbytes, src, nbytes, ACL_MEMCPY_HOST_TO_DEVICE)); } // (synchronous) copy from a device to host void memCopyD2H(size_t nbytes, void *dst, const void *src) { syncDevice(); - DIPU_CALLACLRT(::aclrtMemcpy(dst, nbytes, src, nbytes, ACL_MEMCPY_DEVICE_TO_HOST)); + DIPU_CALLACLRT( + ::aclrtMemcpy(dst, nbytes, src, nbytes, ACL_MEMCPY_DEVICE_TO_HOST)); } // (asynchronous) copy from device to a device void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, - deviceId_t dstDevId, void *dst, deviceId_t srcDevId, const void *src) { - DIPU_CALLACLRT(::aclrtMemcpyAsync(dst, nbytes, src, nbytes, ACL_MEMCPY_DEVICE_TO_DEVICE, stream)); + deviceId_t dstDevId, void *dst, deviceId_t srcDevId, + const void *src) { + DIPU_CALLACLRT(::aclrtMemcpyAsync(dst, nbytes, src, nbytes, + ACL_MEMCPY_DEVICE_TO_DEVICE, stream)); } // (asynchronous) copy from host to a device -void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void *dst, const void *src){ - DIPU_CALLACLRT(::aclrtMemcpyAsync(dst, nbytes, src, nbytes, ACL_MEMCPY_HOST_TO_DEVICE, stream)); +void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void *dst, + const void *src) { + DIPU_CALLACLRT(::aclrtMemcpyAsync(dst, nbytes, src, nbytes, + ACL_MEMCPY_HOST_TO_DEVICE, stream)); } // (asynchronous) copy from a device to host -void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void *dst, const void *src) { - DIPU_CALLACLRT(::aclrtMemcpyAsync(dst, nbytes, src, nbytes, ACL_MEMCPY_DEVICE_TO_HOST, stream)); +void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void *dst, + const void *src) { + DIPU_CALLACLRT(::aclrtMemcpyAsync(dst, nbytes, src, nbytes, + ACL_MEMCPY_DEVICE_TO_HOST, stream)); } // (asynchronous) set val -void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) { - DIPU_CALLACLRT(aclrtMemsetAsync(ptr, size, val, size, stream)); +void memSetAsync(const deviceStream_t stream, void *ptr, int val, size_t size) { + DIPU_CALLACLRT(aclrtMemsetAsync(ptr, size, val, size, stream)); } // check last launch succ or not, throw if fail void checkLastError() { - const char* erroInfo = aclGetRecentErrMsg(); - if (erroInfo == nullptr) - { + const char *erroInfo = aclGetRecentErrMsg(); + if (erroInfo == nullptr) { return; } printf("%s\n", erroInfo); } void getRuntimeVersion(int *version) { - int major, minor, patch; - 
DIPU_CALLACLRT(::aclrtGetVersion(&major, &minor, &patch)) - *version = major * 10000 + minor * 100 + patch; + int major, minor, patch; + DIPU_CALLACLRT(::aclrtGetVersion(&major, &minor, &patch)) + *version = major * 10000 + minor * 100 + patch; } // ===================== // device stream related // ===================== void createStream(deviceStream_t *stream, bool prior) { - if (prior) - { + if (prior) { DIPU_LOGW( "Ascend device doesn't support prior queue(stream)." " Fall back on creating queue without priority."); @@ -201,8 +200,8 @@ void destroyStream(deviceStream_t stream) { } void destroyStream(deviceStream_t stream, deviceId_t devId) { - setDevice(devId); - destroyStream(stream); + setDevice(devId); + destroyStream(stream); } void syncStream(deviceStream_t stream) { @@ -210,7 +209,7 @@ void syncStream(deviceStream_t stream) { } bool isStreamEmpty(deviceStream_t stream) { - //aclrtSynchronizeStreamWithTimeout(stream); + // aclrtSynchronizeStreamWithTimeout(stream); return false; } @@ -218,9 +217,7 @@ bool isStreamEmpty(deviceStream_t stream) { // device event related // ===================== -void releaseStream() { - return; -} +void releaseStream() { return; } bool streamNotNull(deviceStream_t stream) { return stream != deviceDefaultStreamLiteral; @@ -230,7 +227,6 @@ void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) { DIPU_CALLACLRT(::aclrtSynchronizeEvent(event)) } - // ===================== // device event related // ===================== @@ -242,28 +238,27 @@ void recordEvent(deviceEvent_t event, deviceStream_t stream) { DIPU_CALLACLRT(::aclrtRecordEvent(event, stream)); } -void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end) { - DIPU_CALLACLRT(aclrtEventElapsedTime(time, start, end)) -} +void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end){ + DIPU_CALLACLRT(aclrtEventElapsedTime(time, start, end))} EventStatus getEventStatus(deviceEvent_t event) { aclrtEventRecordedStatus status; DIPU_CALLACLRT(aclrtQueryEventStatus(event, &status)) if (status == ::ACL_EVENT_RECORDED_STATUS_COMPLETE) { return devapis::EventStatus::READY; - } else if (status == ::ACL_EVENT_RECORDED_STATUS_NOT_READY){ + } else if (status == ::ACL_EVENT_RECORDED_STATUS_NOT_READY) { return devapis::EventStatus::PENDING; } throw std::runtime_error("dipu device error"); } -void createEvent(deviceEvent_t* event) { - DIPU_CALLACLRT(::aclrtCreateEvent(event)) +void createEvent(deviceEvent_t *event) { + DIPU_CALLACLRT(::aclrtCreateEvent(event)) } void destroyEvent(deviceEvent_t event) { - DIPU_CALLACLRT(::aclrtDestroyEvent(event)) + DIPU_CALLACLRT(::aclrtDestroyEvent(event)) } -} // end namespace devapis -} // end namespace dipu +} // end namespace devapis +} // end namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/ascend/vendorapi.h b/dipu/torch_dipu/csrc_dipu/vendor/ascend/vendorapi.h index 83baaaadc..8204dfb04 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/ascend/vendorapi.h +++ b/dipu/torch_dipu/csrc_dipu/vendor/ascend/vendorapi.h @@ -4,37 +4,37 @@ #include #include #include -#include + #include #include -namespace dipu -{ - -#define TRACK_ACL(x) \ - { \ - static bool enable = std::getenv("DIPU_TRACK_ACL") != nullptr; \ - if (enable) \ - { \ - printf("[%s: %d]:%s\n", __FILE__, __LINE__, x); \ - } \ - } - -#define DIPU_CALLACLRT(Expr) \ - { \ - TRACK_ACL(#Expr); \ - ::aclError ret = Expr; \ - if (ret != ::ACL_SUCCESS) \ - { \ - throw std::runtime_error(std::string("ascend device error:") + aclGetRecentErrMsg()); \ - } \ - } +#include + +namespace 
dipu { + +#define TRACK_ACL(x) \ + { \ + static bool enable = std::getenv("DIPU_TRACK_ACL") != nullptr; \ + if (enable) { \ + printf("[%s: %d]:%s\n", __FILE__, __LINE__, x); \ + } \ + } + +#define DIPU_CALLACLRT(Expr) \ + { \ + TRACK_ACL(#Expr); \ + ::aclError ret = Expr; \ + if (ret != ::ACL_SUCCESS) { \ + throw std::runtime_error(std::string("ascend device error:") + \ + aclGetRecentErrMsg()); \ + } \ + } using deviceStream_t = aclrtStream; #define deviceDefaultStreamLiteral nullptr; using deviceEvent_t = aclrtEvent; -using deviceHandle_t = aclrtContext*; +using deviceHandle_t = aclrtContext *; using diclComm_t = HcclComm; using commUniqueId = HcclRootInfo; -} +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/camb/CambGeneratorImpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/camb/CambGeneratorImpl.cpp index 95412540c..30599049a 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/camb/CambGeneratorImpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/camb/CambGeneratorImpl.cpp @@ -1,14 +1,14 @@ // Copyright (c) 2023, DeepLink. -#include +#include + +#include + #include +#include -#include #include #include - -#include - -#include +#include namespace dipu { @@ -28,25 +28,26 @@ static deviceHandle_t getDeviceHandler(c10::DeviceIndex device_index) { DIPU_CALLCNNL(cnnlSetQueue(handle, stream.rawstream())); return handle; } - + // Discriminate floating device type. static bool is_floating_device = true; class MLUGeneratorImpl : public dipu::DIPUGeneratorImpl { -public: - MLUGeneratorImpl(at::DeviceIndex device_index): dipu::DIPUGeneratorImpl(device_index) { - } + public: + MLUGeneratorImpl(at::DeviceIndex device_index) + : dipu::DIPUGeneratorImpl(device_index) {} /** - * set state - * - * See Note [Acquire lock when using random generators] - */ - void set_state(const c10::TensorImpl& state) override { + * set state + * + * See Note [Acquire lock when using random generators] + */ + void set_state(const c10::TensorImpl &state) override { auto state_size = state.numel(); TORCH_CHECK(state_size == mlu_state_size, "RNG state is wrong size"); - at::Tensor state_tmp(state.shallow_copy_and_detach(state.version_counter(), true)); + at::Tensor state_tmp( + state.shallow_copy_and_detach(state.version_counter(), true)); state_ = state_tmp.to(device_); state_need_reset_ = false; } @@ -55,7 +56,7 @@ class MLUGeneratorImpl : public dipu::DIPUGeneratorImpl { * update_state * * See Note [Acquire lock when using random generators] - */ + */ void update_state() const override { // update the state tensor. 
TORCH_CHECK(is_floating_device, "is_floating_device must be true"); @@ -70,7 +71,8 @@ class MLUGeneratorImpl : public dipu::DIPUGeneratorImpl { auto state_ptr = state_.tensor_data().data_ptr(); dipu::DIPUGuard guard(state_.device()); auto handle = getDeviceHandler(state_.device().index()); - DIPU_CALLCNNL(cnnlRandMakeMTGP32KernelState(handle, state_ptr, nullptr, nullptr, seed_)); + DIPU_CALLCNNL(cnnlRandMakeMTGP32KernelState(handle, state_ptr, nullptr, + nullptr, seed_)); state_need_reset_ = false; } }; @@ -79,4 +81,4 @@ const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) { return at::make_generator(device_index); } -} // namespace torch_dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/vendor/camb/basecommimpl.hpp b/dipu/torch_dipu/csrc_dipu/vendor/camb/basecommimpl.hpp index b23ce197a..cb6d8a4d2 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/camb/basecommimpl.hpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/camb/basecommimpl.hpp @@ -9,58 +9,54 @@ namespace dipu { namespace devapis { - // using CambProcessGroupDICL = ProcessGroupDICL; - - // c10::intrusive_ptr createProcessGroupDICL(const c10::intrusive_ptr &store, - // int rank, int size, const std::chrono::duration &timeout) { - // return c10::make_intrusive(store, rank, size); - // } - - // CNCL op mapping - static std::map cncl_op = { - {ReduceOp::MIN, cnclMin}, - {ReduceOp::MAX, cnclMax}, - {ReduceOp::SUM, cnclSum}, - {ReduceOp::PRODUCT, cnclProd}, - }; - - - #define CNCL_THROW(cmd) \ - do { \ - cnclResult_t error = cmd; \ - if (error != CNCL_RET_SUCCESS) { \ - std::string err = "CNCL error in: " + std::string(__FILE__) + ":" + \ - std::to_string(__LINE__) + ", " + \ - std::string(cnclGetErrorStr(error)); \ - TORCH_CHECK(false, err); \ - } \ - } while (0) - - #define CNCL_ASSERT(cmd) \ - do { \ - cnclResult_t res = cmd; \ - if (res != CNCL_RET_SUCCESS) { \ - std::string err = cnclGetErrorStr(res); \ - fprintf( \ - stderr, \ - "CNCL error in: %s:%d, %s\n", \ - __FILE__, \ - __LINE__, \ - err.c_str()); \ - abort(); \ - } \ - } while (0) - - #define CNCL_RET(cmd) \ - do { \ - cnclResult_t error = cmd; \ - if (error != CNCL_RET_SUCCESS) { \ - std::string err = "CNCL error in: " + std::string(__FILE__) + ":" + \ - std::to_string(__LINE__) + ", " + \ - std::string(cnclGetErrorStr(error)); \ - TORCH_CHECK(false, err); \ - } \ - } while (0) - -} -} \ No newline at end of file +// using CambProcessGroupDICL = ProcessGroupDICL; + +// c10::intrusive_ptr createProcessGroupDICL(const +// c10::intrusive_ptr &store, +// int rank, int size, const std::chrono::duration &timeout) { +// return c10::make_intrusive(store, rank, size); +// } + +// CNCL op mapping +static std::map cncl_op = { + {ReduceOp::MIN, cnclMin}, + {ReduceOp::MAX, cnclMax}, + {ReduceOp::SUM, cnclSum}, + {ReduceOp::PRODUCT, cnclProd}, +}; + +#define CNCL_THROW(cmd) \ + do { \ + cnclResult_t error = cmd; \ + if (error != CNCL_RET_SUCCESS) { \ + std::string err = "CNCL error in: " + std::string(__FILE__) + ":" + \ + std::to_string(__LINE__) + ", " + \ + std::string(cnclGetErrorStr(error)); \ + TORCH_CHECK(false, err); \ + } \ + } while (0) + +#define CNCL_ASSERT(cmd) \ + do { \ + cnclResult_t res = cmd; \ + if (res != CNCL_RET_SUCCESS) { \ + std::string err = cnclGetErrorStr(res); \ + fprintf(stderr, "CNCL error in: %s:%d, %s\n", __FILE__, __LINE__, \ + err.c_str()); \ + abort(); \ + } \ + } while (0) + +#define CNCL_RET(cmd) \ + do { \ + cnclResult_t error = cmd; \ + if (error != CNCL_RET_SUCCESS) { \ + 
std::string err = "CNCL error in: " + std::string(__FILE__) + ":" + \ + std::to_string(__LINE__) + ", " + \ + std::string(cnclGetErrorStr(error)); \ + TORCH_CHECK(false, err); \ + } \ + } while (0) + +} // namespace devapis +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/vendor/camb/basedeviceimpl.hpp b/dipu/torch_dipu/csrc_dipu/vendor/camb/basedeviceimpl.hpp index ae666b1fe..849538b36 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/camb/basedeviceimpl.hpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/camb/basedeviceimpl.hpp @@ -1,12 +1,13 @@ // Copyright (c) 2023, DeepLink. #pragma once -#include #include #include +#include + #include -#include #include +#include // use header file as common is weird. change to use base class in future. // only vendor deviceimpl.cpp can include this header @@ -25,13 +26,9 @@ deviceId_t current_device() { return static_cast(devId_); } -void resetDevice(deviceId_t devId) { - DIPU_CALLCNRT(::cnrtDeviceReset()) -} +void resetDevice(deviceId_t devId) { DIPU_CALLCNRT(::cnrtDeviceReset()) } -void syncDevice() { - DIPU_CALLCNRT(::cnrtSyncDevice()) -} +void syncDevice() { DIPU_CALLCNRT(::cnrtSyncDevice()) } int getDeviceCount() { int num = -1; @@ -45,9 +42,7 @@ void getDriverVersion(int *version) { *version = verInfo.version; } -void releaseStream() { - return; -} +void releaseStream() { return; } bool streamNotNull(deviceStream_t stream) { return stream != deviceDefaultStreamLiteral; @@ -57,32 +52,24 @@ void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) { DIPU_CALLCNRT(::cnrtQueueWaitNotifier(event, stream, 0)) } - - // ===================== // device event related // ===================== -void waitEvent(deviceEvent_t event) { - DIPU_CALLCNRT(::cnrtWaitNotifier(event)) -} +void waitEvent(deviceEvent_t event) { DIPU_CALLCNRT(::cnrtWaitNotifier(event)) } void recordEvent(deviceEvent_t event, deviceStream_t stream) { DIPU_CALLCNRT(::cnrtPlaceNotifier(event, stream)); } -void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end) { - DIPU_CALLCNRT(cnrtNotifierElapsedTime(start, end, time)) -} +void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end){ + DIPU_CALLCNRT(cnrtNotifierElapsedTime(start, end, time))} EventStatus getEventStatus(deviceEvent_t event) { ::cnrtRet_t ret = ::cnrtQueryNotifier(event); - if (ret == ::cnrtSuccess) - { + if (ret == ::cnrtSuccess) { return devapis::EventStatus::READY; - } - else if (ret == ::cnrtErrorBusy || ret == ::cnrtErrorNotReady) - { + } else if (ret == ::cnrtErrorBusy || ret == ::cnrtErrorNotReady) { checkLastError(); /* reset internal error state*/ return devapis::EventStatus::PENDING; } @@ -93,56 +80,43 @@ EventStatus getEventStatus(deviceEvent_t event) { // mem related // ===================== -void freeHost(void *p) { - DIPU_CALLCNRT(cnrtFreeHost(p)) -} +void freeHost(void *p){DIPU_CALLCNRT(cnrtFreeHost(p))} OpStatus mallocDevice(void **p, size_t nbytes, bool throwExcepion) { ::cnrtRet_t r = ::cnrtMalloc(p, nbytes); - if (r != ::cnrtSuccess) - { - if (throwExcepion) - { + if (r != ::cnrtSuccess) { + if (throwExcepion) { checkLastError(); /* reset internal error state*/ TORCH_CHECK(false, "alloc failed in mallocDevice, ret = ", r); - } - else if ((r == ::cnrtErrorNoMem)) - { + } else if ((r == ::cnrtErrorNoMem)) { return OpStatus::ERR_NOMEM; - } - else - { + } else { return OpStatus::ERR_UNKNOWN; } } return OpStatus::SUCCESS; } -void freeDevice(void *p) { - DIPU_CALLCNRT(::cnrtFree(p)) -} +void freeDevice(void *p) { DIPU_CALLCNRT(::cnrtFree(p)) } 
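For context, mallocDevice above returns OpStatus instead of always throwing so that a caching allocator can probe for device memory and retry after releasing its cache; a minimal sketch of that retry loop (release_cached_blocks() is a hypothetical stand-in for the allocator's cleanup, and the parameter spelling follows this file):

void release_cached_blocks();  // hypothetical: returns cached memory to device

void *alloc_with_retry(size_t nbytes) {
  void *ptr = nullptr;
  // First attempt: non-throwing, so ERR_NOMEM can be handled locally.
  if (mallocDevice(&ptr, nbytes, /*throwExcepion=*/false) ==
      OpStatus::SUCCESS) {
    return ptr;
  }
  release_cached_blocks();
  // Second attempt: throwing, so a genuine out-of-memory surfaces to callers.
  mallocDevice(&ptr, nbytes, /*throwExcepion=*/true);
  return ptr;
}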
bool isPinnedPtr(const void *p) { - ::cnrtPointerAttributes_t attr; - DIPU_CALLCNRT(::cnrtPointerGetAttributes(&attr, p)) - return attr.type == cnrtMemTypeHost; + ::cnrtPointerAttributes_t attr; + DIPU_CALLCNRT(::cnrtPointerGetAttributes(&attr, p)) + return attr.type == cnrtMemTypeHost; } // (synchronous) copy from device to a device -void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, deviceId_t srcDevId, const void *src) { +void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src) { // TODO(zhaoxiujia) : check src const syncDevice(); - if (srcDevId != dstDevId) - { - DIPU_CALLCNRT(::cnrtMemcpyPeer( - dst, dstDevId, const_cast(src), srcDevId, nbytes)) - } - else - { - if (dst != src) - { - DIPU_CALLCNRT(::cnrtMemcpy( - dst, const_cast(src), nbytes, CNRT_MEM_TRANS_DIR_DEV2DEV)) + if (srcDevId != dstDevId) { + DIPU_CALLCNRT(::cnrtMemcpyPeer(dst, dstDevId, const_cast(src), + srcDevId, nbytes)) + } else { + if (dst != src) { + DIPU_CALLCNRT(::cnrtMemcpy(dst, const_cast(src), nbytes, + CNRT_MEM_TRANS_DIR_DEV2DEV)) } } } @@ -150,46 +124,45 @@ void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, deviceId_t srcDev // (synchronous) copy from host to a device void memCopyH2D(size_t nbytes, void *dst, const void *src) { syncDevice(); - DIPU_CALLCNRT(::cnrtMemcpy( - dst, const_cast(src), nbytes, CNRT_MEM_TRANS_DIR_HOST2DEV)) + DIPU_CALLCNRT(::cnrtMemcpy(dst, const_cast(src), nbytes, + CNRT_MEM_TRANS_DIR_HOST2DEV)) } // (synchronous) copy from a device to host void memCopyD2H(size_t nbytes, void *dst, const void *src) { syncDevice(); - DIPU_CALLCNRT(::cnrtMemcpy( - dst, const_cast(src), nbytes, CNRT_MEM_TRANS_DIR_DEV2HOST)) + DIPU_CALLCNRT(::cnrtMemcpy(dst, const_cast(src), nbytes, + CNRT_MEM_TRANS_DIR_DEV2HOST)) } // (asynchronous) copy from device to a device void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, - deviceId_t dstDevId, void *dst, deviceId_t srcDevId, const void *src) { - if (dstDevId == srcDevId) - { - if (dst != src) - { - DIPU_CALLCNRT(::cnrtMemcpyAsync( - dst, const_cast(src), nbytes, stream, CNRT_MEM_TRANS_DIR_DEV2DEV)) + deviceId_t dstDevId, void *dst, deviceId_t srcDevId, + const void *src) { + if (dstDevId == srcDevId) { + if (dst != src) { + DIPU_CALLCNRT(::cnrtMemcpyAsync(dst, const_cast(src), nbytes, + stream, CNRT_MEM_TRANS_DIR_DEV2DEV)) } - } - else - { - DIPU_CALLCNRT(cnrtMemcpyPeerAsync( - dst, dstDevId, const_cast(src), srcDevId, nbytes, stream)) + } else { + DIPU_CALLCNRT(cnrtMemcpyPeerAsync(dst, dstDevId, const_cast(src), + srcDevId, nbytes, stream)) } } // (asynchronous) copy from host to a device -void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void *dst, const void *src){ - DIPU_CALLCNRT(::cnrtMemcpyAsync( - dst, const_cast(src), nbytes, stream, CNRT_MEM_TRANS_DIR_HOST2DEV)) +void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void *dst, + const void *src) { + DIPU_CALLCNRT(::cnrtMemcpyAsync(dst, const_cast(src), nbytes, stream, + CNRT_MEM_TRANS_DIR_HOST2DEV)) } // (asynchronous) copy from a device to host -void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void *dst, const void *src) { - DIPU_CALLCNRT(::cnrtMemcpyAsync( - dst, const_cast(src), nbytes, stream, CNRT_MEM_TRANS_DIR_DEV2HOST)) +void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void *dst, + const void *src) { + DIPU_CALLCNRT(::cnrtMemcpyAsync(dst, const_cast(src), nbytes, stream, + CNRT_MEM_TRANS_DIR_DEV2HOST)) } -} // end namespace devapis -} // end namespace dipu +} 
// end namespace devapis +} // end namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_5.x/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_5.x/deviceimpl.cpp index d5b0afe5e..d91d84ecb 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_5.x/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_5.x/deviceimpl.cpp @@ -1,17 +1,12 @@ // Copyright (c) 2023, DeepLink. #include "../basedeviceimpl.hpp" -namespace dipu -{ +namespace dipu { namespace devapis { -void initializeVendor() { +void initializeVendor() {} -} - -void finalizeVendor() { - -} +void finalizeVendor() {} // camb5.8.2 // set current device given device according to id @@ -23,9 +18,7 @@ void setDevice(deviceId_t devId) { } // check last launch succ or not, throw if fail -void checkLastError() { - DIPU_CALLCNRT(::cnrtGetLastErr()) -} +void checkLastError() { DIPU_CALLCNRT(::cnrtGetLastErr()) } void getRuntimeVersion(int *version) { DIPU_CALLCNRT(::cnrtGetVersion(reinterpret_cast(version))) @@ -35,8 +28,7 @@ void getRuntimeVersion(int *version) { // device stream related // ===================== void createStream(deviceStream_t *stream, bool prior) { - if (prior) - { + if (prior) { DIPU_LOGW( "Camb device doesn't support prior queue(stream)." " Fall back on creating queue without priority."); @@ -84,5 +76,5 @@ void memSetAsync(const deviceStream_t stream, void *ptr, int val, size_t size) { DIPU_CALLCNRT(cnrtMemsetD8Async(ptr, val, size, stream)) } -} // end namespace devapis -} // end namespace dipu +} // end namespace devapis +} // end namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_6.x/communiatorimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_6.x/communiatorimpl.cpp index 4cd668293..bb004e7ec 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_6.x/communiatorimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_6.x/communiatorimpl.cpp @@ -4,104 +4,121 @@ namespace dipu { namespace devapis { - // CNCL type typing - static std::unordered_map cncl_data_type = { - {at::kChar, cnclInt8}, {at::kByte, cnclUint8}, {at::kHalf, cnclHalf}, - {at::kFloat, cnclFloat}, {at::kInt, cnclInt32}, {at::kBool, cnclInt8}, - {at::kLong, cnclInvalid}, {at::kDouble, cnclInvalid} - }; - - static void convertTypeSize(size_t& count, at::ScalarType& datatype) { - if (datatype == at::ScalarType::Long || datatype == at::ScalarType::Double) { - datatype = at::kByte; - count = count * sizeof(long); - } +// CNCL type typing +static std::unordered_map cncl_data_type = { + {at::kChar, cnclInt8}, {at::kByte, cnclUint8}, {at::kHalf, cnclHalf}, + {at::kFloat, cnclFloat}, {at::kInt, cnclInt32}, {at::kBool, cnclInt8}, + {at::kLong, cnclInvalid}, {at::kDouble, cnclInvalid}}; + +static void convertTypeSize(size_t &count, at::ScalarType &datatype) { + if (datatype == at::ScalarType::Long || datatype == at::ScalarType::Double) { + datatype = at::kByte; + count = count * sizeof(long); } +} - const int DICL_UNIQUE_ID_BYTES_SIZE = CNCL_CLIQUE_ID_BYTES_SIZE; +const int DICL_UNIQUE_ID_BYTES_SIZE = CNCL_CLIQUE_ID_BYTES_SIZE; - DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { - cnclResult_t result = cnclGetCommAsyncError(comm); - if (result != CNCL_RET_SUCCESS) { - return DICL_SUCCESS; - } else { - return DICL_ERR_UNDEF; - } - } - - DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId) { - CNCL_THROW(cnclGetCliqueId(uniqueId)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId, - int rank, int localDeviceId) 
{ - CNCL_THROW(cnclInitComms(comm, 1, &localDeviceId, &rank, nranks, &uniqueId)); - return DICL_SUCCESS; - } - - // // DIPU_API diclResult_t diclCommInitAll(diclComm_t* comms, int ndev, const int* devlist); - - DIPU_API diclResult_t diclCommDestroy(diclComm_t comm) { - CNCL_THROW(cnclDestroyComms(&comm, 1)); - return DICL_SUCCESS; - } - - // DIPU_API diclResult_t diclCommFinalize(diclComm_t comm); - - // DIPU_API diclResult_t diclCommAbort(diclComm_t comm); - - DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { - convertTypeSize(count, datatype); - CNCL_THROW(cnclAllReduce(sendbuff, recvbuff, count, cncl_data_type[datatype], cncl_op[reduceOp], - comm, stream)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - int root, diclComm_t comm, deviceStream_t stream) { - convertTypeSize(count, datatype); - CNCL_THROW(cnclBroadcast(sendbuff, recvbuff, count, cncl_data_type[datatype], root, - comm, stream)); +DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { + cnclResult_t result = cnclGetCommAsyncError(comm); + if (result != CNCL_RET_SUCCESS) { return DICL_SUCCESS; + } else { + return DICL_ERR_UNDEF; } - - DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, size_t count, at::ScalarType datatype, - diclComm_t comm, deviceStream_t stream) { - convertTypeSize(count, datatype); - CNCL_THROW(cnclAllGather(sendBuf, recvBuf, count, cncl_data_type[datatype], comm, stream)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclReduce(const void* sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream) { - convertTypeSize(count, datatype); - CNCL_THROW(cnclReduce(sendbuff, recvbuff, count, cncl_data_type[datatype], cncl_op[reduceOp], - root, comm, stream)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, size_t count, at::ScalarType datatype, - const ReduceOp& op, diclComm_t comm, deviceStream_t stream) { - convertTypeSize(count, datatype); - CNCL_THROW(cnclReduceScatter(sendBuf, recvBuf, count, cncl_data_type[datatype], cncl_op[op], comm, stream)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclSend(void* sendbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream){ - convertTypeSize(count, datatype); - CNCL_THROW(cnclSend(sendbuff, count, cncl_data_type[datatype], peer, comm, stream)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclRecv(void* recvbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream) { - convertTypeSize(count, datatype); - CNCL_THROW(cnclRecv(recvbuff, count, cncl_data_type[datatype], peer, comm, stream)); - return DICL_SUCCESS; - } - -} // end namespace devapis -} // end namespace dipu +} + +DIPU_API diclResult_t diclGetUniqueId(commUniqueId *uniqueId) { + CNCL_THROW(cnclGetCliqueId(uniqueId)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, + commUniqueId uniqueId, int rank, + int localDeviceId) { + CNCL_THROW(cnclInitComms(comm, 1, &localDeviceId, &rank, nranks, &uniqueId)); + return DICL_SUCCESS; +} + +// // DIPU_API diclResult_t diclCommInitAll(diclComm_t* comms, int ndev, const +// int* devlist); + +DIPU_API diclResult_t 
diclCommDestroy(diclComm_t comm) { + CNCL_THROW(cnclDestroyComms(&comm, 1)); + return DICL_SUCCESS; +} + +// DIPU_API diclResult_t diclCommFinalize(diclComm_t comm); + +// DIPU_API diclResult_t diclCommAbort(diclComm_t comm); + +DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, diclComm_t comm, + deviceStream_t stream) { + convertTypeSize(count, datatype); + CNCL_THROW(cnclAllReduce(sendbuff, recvbuff, count, cncl_data_type[datatype], + cncl_op[reduceOp], comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + int root, diclComm_t comm, + deviceStream_t stream) { + convertTypeSize(count, datatype); + CNCL_THROW(cnclBroadcast(sendbuff, recvbuff, count, cncl_data_type[datatype], + root, comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, + size_t count, at::ScalarType datatype, + diclComm_t comm, deviceStream_t stream) { + convertTypeSize(count, datatype); + CNCL_THROW(cnclAllGather(sendBuf, recvBuf, count, cncl_data_type[datatype], + comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclReduce(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, int root, + diclComm_t comm, deviceStream_t stream) { + convertTypeSize(count, datatype); + CNCL_THROW(cnclReduce(sendbuff, recvbuff, count, cncl_data_type[datatype], + cncl_op[reduceOp], root, comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, + size_t count, at::ScalarType datatype, + const ReduceOp &op, diclComm_t comm, + deviceStream_t stream) { + convertTypeSize(count, datatype); + CNCL_THROW(cnclReduceScatter(sendBuf, recvBuf, count, + cncl_data_type[datatype], cncl_op[op], comm, + stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclSend(void *sendbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + convertTypeSize(count, datatype); + CNCL_THROW( + cnclSend(sendbuff, count, cncl_data_type[datatype], peer, comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclRecv(void *recvbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + convertTypeSize(count, datatype); + CNCL_THROW( + cnclRecv(recvbuff, count, cncl_data_type[datatype], peer, comm, stream)); + return DICL_SUCCESS; +} + +} // end namespace devapis +} // end namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_6.x/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_6.x/deviceimpl.cpp index f216fb834..4a73d60b1 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_6.x/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/camb/cnrt_6.x/deviceimpl.cpp @@ -1,8 +1,7 @@ // Copyright (c) 2023, DeepLink. 
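For context, convertTypeSize in the CNCL communicator file above re-describes 64-bit element types (mapped to cnclInvalid in the table) as byte payloads; a worked example of the rewrite, assuming LP64 where sizeof(long) == 8 (the wrapper function is illustrative only):

void example_long_conversion() {
  size_t count = 3;  // three 64-bit elements to communicate
  at::ScalarType datatype = at::ScalarType::Long;
  convertTypeSize(count, datatype);  // static helper defined above
  // datatype is now at::kByte and count == 3 * sizeof(long) == 24, so CNCL
  // moves the buffer as raw bytes even though it has no native int64 type.
  // Byte transport is bit-exact for broadcast/gather-style traffic; for
  // arithmetic reductions the bytes themselves are what get reduced, so
  // 64-bit reduce semantics deserve separate verification.
}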
#include "../basedeviceimpl.hpp" -namespace dipu -{ +namespace dipu { namespace devapis { #define DIPU_INIT_CNDEV_VERSION(info) info.version = CNDEV_VERSION_5; @@ -14,14 +13,9 @@ void setDevice(deviceId_t devId) { DIPU_CALLCNRT(::cnrtSetDevice(devId_)) } -void initializeVendor() { - DIPU_CALLCNDEV(::cndevInit(0)); -} - -void finalizeVendor() { - ::cndevRelease(); -} +void initializeVendor() { DIPU_CALLCNDEV(::cndevInit(0)); } +void finalizeVendor() { ::cndevRelease(); } DIPUDeviceProperties getDeviceProperties(int32_t device_index) { ::cnrtDeviceProp_t device_prop; @@ -30,9 +24,12 @@ DIPUDeviceProperties getDeviceProperties(int32_t device_index) { int32_t multi_processor_cnt = 1; ::cndevMemoryInfo_t mem_info; DIPU_CALLCNRT(::cnrtGetDeviceProperties(&device_prop, device_index)); - DIPU_CALLCNRT(::cnrtDeviceGetAttribute(&major, ::cnrtAttrComputeCapabilityMajor, device_index)); - DIPU_CALLCNRT(::cnrtDeviceGetAttribute(&minor, ::cnrtAttrComputeCapabilityMinor, device_index)); - // DIPU_CALLCNRT(::cnrtDeviceGetAttribute(&multi_processor_cnt, ::cnrtAttrConcurrentKernels, device_index)); + DIPU_CALLCNRT(::cnrtDeviceGetAttribute( + &major, ::cnrtAttrComputeCapabilityMajor, device_index)); + DIPU_CALLCNRT(::cnrtDeviceGetAttribute( + &minor, ::cnrtAttrComputeCapabilityMinor, device_index)); + // DIPU_CALLCNRT(::cnrtDeviceGetAttribute(&multi_processor_cnt, + // ::cnrtAttrConcurrentKernels, device_index)); DIPU_INIT_CNDEV_VERSION(mem_info); DIPU_CALLCNDEV(::cndevGetMemoryUsage(&mem_info, device_index)); @@ -45,30 +42,28 @@ DIPUDeviceProperties getDeviceProperties(int32_t device_index) { return prop; } -/* - both cndevMemoryInfo_t.physicalMemoryUsed from cndevGetMemoryUsage and cndevProcessInfo_t from cndevGetProcessInfo seems not correct, - value always zero, need further investigation. -DIPUDeviceStatus getDeviceStatus(int32_t device_index) { +/* + both cndevMemoryInfo_t.physicalMemoryUsed from cndevGetMemoryUsage and +cndevProcessInfo_t from cndevGetProcessInfo seems not correct, value always +zero, need further investigation. DIPUDeviceStatus getDeviceStatus(int32_t +device_index) { } */ // check last launch succ or not, throw if fail -void checkLastError() { - DIPU_CALLCNRT(::cnrtGetLastError()) -} +void checkLastError() { DIPU_CALLCNRT(::cnrtGetLastError()) } void getRuntimeVersion(int *version) { - int major, minor, patch; - DIPU_CALLCNRT(::cnrtGetLibVersion(&major, &minor, &patch)) - *version = major * 10000 + minor * 100 + patch; + int major, minor, patch; + DIPU_CALLCNRT(::cnrtGetLibVersion(&major, &minor, &patch)) + *version = major * 10000 + minor * 100 + patch; } // ===================== // device stream related // ===================== void createStream(deviceStream_t *stream, bool prior) { - if (prior) - { + if (prior) { DIPU_LOGW( "Camb device doesn't support prior queue(stream)." 
" Fall back on creating queue without priority."); @@ -81,8 +76,8 @@ void destroyStream(deviceStream_t stream) { } void destroyStream(deviceStream_t stream, deviceId_t devId) { - setDevice(devId); - destroyStream(stream); + setDevice(devId); + destroyStream(stream); } void syncStream(deviceStream_t stream) { @@ -101,12 +96,12 @@ bool isStreamEmpty(deviceStream_t stream) { // device event related // ===================== -void createEvent(deviceEvent_t* event) { - DIPU_CALLCNRT(::cnrtNotifierCreate(event)) +void createEvent(deviceEvent_t *event) { + DIPU_CALLCNRT(::cnrtNotifierCreate(event)) } void destroyEvent(deviceEvent_t event) { - DIPU_CALLCNRT(::cnrtNotifierDestroy(event)) + DIPU_CALLCNRT(::cnrtNotifierDestroy(event)) } // ===================== @@ -117,9 +112,9 @@ void mallocHost(void **p, size_t nbytes) { } // (asynchronous) set val -void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) { - DIPU_CALLCNRT(cnrtMemsetAsync(ptr, val, size, stream)) +void memSetAsync(const deviceStream_t stream, void *ptr, int val, size_t size) { + DIPU_CALLCNRT(cnrtMemsetAsync(ptr, val, size, stream)) } -} // end namespace devapis -} // end namespace dipu +} // end namespace devapis +} // end namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/camb/vendorapi.h b/dipu/torch_dipu/csrc_dipu/vendor/camb/vendorapi.h index 6c297e8dd..ec5151d53 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/camb/vendorapi.h +++ b/dipu/torch_dipu/csrc_dipu/vendor/camb/vendorapi.h @@ -1,34 +1,37 @@ // Copyright (c) 2023, DeepLink. #pragma once -#include +#include #include #include -#include +#include + #include #include namespace dipu { -#define DIPU_CALLCNRT(Expr) \ - { \ - ::cnrtRet_t ret = Expr; \ - TORCH_CHECK(ret == ::CNRT_RET_SUCCESS, "call cnrt error, expr = ", #Expr, ", ret = ", ret); \ - } - -#define DIPU_CALLCNDEV(Expr) \ - { \ - ::cndevRet_t ret = Expr; \ - TORCH_CHECK(ret == ::CNDEV_SUCCESS, "call cndev error, expr = ", #Expr, ", ret = ", ret); \ - } - -#define DIPU_CALLCNNL(Expr) \ - { \ - ::cnnlStatus_t ret = Expr; \ - TORCH_CHECK(ret == ::CNNL_STATUS_SUCCESS, "call cnnl error, expr = ", #Expr, ", ret = ", ret); \ - } - +#define DIPU_CALLCNRT(Expr) \ + { \ + ::cnrtRet_t ret = Expr; \ + TORCH_CHECK(ret == ::CNRT_RET_SUCCESS, "call cnrt error, expr = ", #Expr, \ + ", ret = ", ret); \ + } + +#define DIPU_CALLCNDEV(Expr) \ + { \ + ::cndevRet_t ret = Expr; \ + TORCH_CHECK(ret == ::CNDEV_SUCCESS, "call cndev error, expr = ", #Expr, \ + ", ret = ", ret); \ + } + +#define DIPU_CALLCNNL(Expr) \ + { \ + ::cnnlStatus_t ret = Expr; \ + TORCH_CHECK(ret == ::CNNL_STATUS_SUCCESS, \ + "call cnnl error, expr = ", #Expr, ", ret = ", ret); \ + } using deviceStream_t = cnrtQueue_t; #define deviceDefaultStreamLiteral nullptr @@ -38,4 +41,4 @@ using deviceHandle_t = cnnlHandle_t; using diclComm_t = cnclComm_t; using commUniqueId = cnclCliqueId; -} +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/CUDACopyInplace.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/CUDACopyInplace.cpp index 7e2b9d8b6..e7264f9a0 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/CUDACopyInplace.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/CUDACopyInplace.cpp @@ -1,14 +1,15 @@ // Copyright (c) 2023, DeepLink. 
-#include -#include -#include #include + +#include +#include #include +#include namespace dipu { -at::Tensor& copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { +at::Tensor &copy_(at::Tensor &self, const at::Tensor &src, bool non_blocking) { if (self.numel() == 0) { return self; } @@ -17,10 +18,15 @@ at::Tensor& copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { ::diopiContext context(stream.rawstream()); auto ctx = &context; - ::diopiConstTensorHandle_t srcDiopiTensorHandle = dipu::diopi_helper::toDiopiTensorHandle(src); - ::diopiTensorHandle_t selfDiopiTensorHandle = dipu::diopi_helper::toDiopiTensorHandle(self); - ::diopiError_t ret = ::diopiCopyInp(ctx, srcDiopiTensorHandle, selfDiopiTensorHandle); - TORCH_CHECK(ret == ::diopiSuccess, __FILE__, ":", __LINE__, R"(::diopiCopyInp(ctx, src, dst);)", " error, error code is ", ret, "error message is ", diopiGetLastErrorString()); + ::diopiConstTensorHandle_t srcDiopiTensorHandle = + dipu::diopi_helper::toDiopiTensorHandle(src); + ::diopiTensorHandle_t selfDiopiTensorHandle = + dipu::diopi_helper::toDiopiTensorHandle(self); + ::diopiError_t ret = + ::diopiCopyInp(ctx, srcDiopiTensorHandle, selfDiopiTensorHandle); + TORCH_CHECK(ret == ::diopiSuccess, __FILE__, ":", __LINE__, + R"(::diopiCopyInp(ctx, src, dst);)", " error, error code is ", + ret, "error message is ", diopiGetLastErrorString()); if (!non_blocking) { dipu::devapis::syncStream(stream.rawstream()); @@ -29,23 +35,30 @@ at::Tensor& copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { } class CUDACopyInplace : public DIPUCopyInplace { -public: + public: CUDACopyInplace() = default; ~CUDACopyInplace() = default; - at::Tensor& run(at::Tensor& self, const at::Tensor& src, bool non_blocking) override { + at::Tensor &run(at::Tensor &self, const at::Tensor &src, + bool non_blocking) override { return copy_(self, src, non_blocking); } - at::Tensor& copy_between_devices(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking) override { + at::Tensor &copy_between_devices(at::TensorIterator &iter, at::Tensor &self, + const at::Tensor &src, + bool non_blocking) override { return copy_(self, src, non_blocking); } - at::Tensor& copy_contiguous(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking) override { + at::Tensor &copy_contiguous(at::TensorIterator &iter, at::Tensor &self, + const at::Tensor &src, + bool non_blocking) override { return copy_(self, src, non_blocking); } - at::Tensor& copy_uncontiguous(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking) override { + at::Tensor &copy_uncontiguous(at::TensorIterator &iter, at::Tensor &self, + const at::Tensor &src, + bool non_blocking) override { return copy_(self, src, non_blocking); } }; diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp index 7eb1e266d..da195f4b0 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp @@ -11,26 +11,32 @@ static const size_t offset_size = sizeof(int64_t); static const size_t total_size = states_size + seed_size + offset_size; class CUDAGeneratorImpl : public dipu::DIPUGeneratorImpl { -public: - CUDAGeneratorImpl(at::DeviceIndex device_index): dipu::DIPUGeneratorImpl(device_index) { - } + public: + CUDAGeneratorImpl(at::DeviceIndex device_index) + : dipu::DIPUGeneratorImpl(device_index) {} - void set_state(const 
c10::TensorImpl& state) override { + void set_state(const c10::TensorImpl &state) override { at::detail::check_rng_state(state); auto state_size = state.numel(); - TORCH_CHECK(state_size == total_size || state_size == total_size - offset_size, "RNG state is wrong size"); + TORCH_CHECK( + state_size == total_size || state_size == total_size - offset_size, + "RNG state is wrong size"); - at::Tensor state_tmp(state.shallow_copy_and_detach(state.version_counter(), true)); + at::Tensor state_tmp( + state.shallow_copy_and_detach(state.version_counter(), true)); state_ = state_tmp; state_need_reset_ = false; } void update_state() const override { if (state_need_reset_) { - state_ = at::detail::empty_cpu({(int64_t)total_size}, c10::ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); + state_ = at::detail::empty_cpu({(int64_t)total_size}, + c10::ScalarType::Byte, c10::nullopt, + c10::nullopt, c10::nullopt, c10::nullopt); auto rng_state = state_.data_ptr(); - // since curandStateMTGP is not used anymore, fill gen_states of THCGenerator with deterministic garbage value of -1 - // gen_states in THCGenerator struct was an array of curandStateMtgp32s. + // since curandStateMTGP is not used anymore, fill gen_states of + // THCGenerator with deterministic garbage value of -1. + // gen_states in THCGenerator struct was an array of curandStateMtgp32s. memset(rng_state, -1, states_size); uint64_t current_seed = this->current_seed(); int64_t offset = 0; @@ -45,4 +51,4 @@ const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) { return at::make_generator<CUDAGeneratorImpl>(device_index); } -} // namespace torch_dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/communiatorimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/communiatorimpl.cpp index 4a9cb55e0..b0f523960 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/communiatorimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/communiatorimpl.cpp @@ -1,4 +1,5 @@ #include + #include #include @@ -6,118 +7,125 @@ namespace dipu { namespace devapis { - // NCCL op mapping - static std::map<ReduceOp, ncclRedOp_t> ncclOp = { - {ReduceOp::MIN, ncclMin}, - {ReduceOp::MAX, ncclMax}, - {ReduceOp::SUM, ncclSum}, - {ReduceOp::PRODUCT, ncclProd}, - #ifdef NCCL_HAS_AVG +// NCCL op mapping +static std::map<ReduceOp, ncclRedOp_t> ncclOp = { + {ReduceOp::MIN, ncclMin}, {ReduceOp::MAX, ncclMax}, + {ReduceOp::SUM, ncclSum}, {ReduceOp::PRODUCT, ncclProd}, +#ifdef NCCL_HAS_AVG {ReduceOp::AVG, ncclAvg}, - #endif - }; - - // NCCL type typing - static std::map<at::ScalarType, ncclDataType_t> ncclDataType = { - {at::kChar, ncclInt8}, - {at::kByte, ncclUint8}, - {at::kFloat, ncclFloat}, - {at::kDouble, ncclDouble}, - {at::kInt, ncclInt32}, - {at::kLong, ncclInt64}, - {at::kHalf, ncclHalf}, - {at::kBool, ncclUint8}, - #if HAS_NCCL_BF16_DATATYPE +#endif +}; + +// NCCL type typing +static std::map<at::ScalarType, ncclDataType_t> ncclDataType = { + {at::kChar, ncclInt8}, {at::kByte, ncclUint8}, + {at::kFloat, ncclFloat}, {at::kDouble, ncclDouble}, + {at::kInt, ncclInt32}, {at::kLong, ncclInt64}, + {at::kHalf, ncclHalf}, {at::kBool, ncclUint8}, +#if HAS_NCCL_BF16_DATATYPE {at::kBFloat16, ncclBfloat16}, - #endif - }; - +#endif +}; // Macro to print and abort on a non-successful NCCL return value. 
-#define NCCL_THROW(cmd) \ - do { \ - ncclResult_t result = cmd; \ - if (result != ncclSuccess) { \ - std::string err = ncclGetErrorString(result); \ - fprintf( \ - stderr, \ - "NCCL error in: %s:%d, %s\n", \ - __FILE__, \ - __LINE__, \ - err.c_str()); \ - TORCH_CHECK(false, err); \ - } \ +#define NCCL_THROW(cmd) \ + do { \ + ncclResult_t result = cmd; \ + if (result != ncclSuccess) { \ + std::string err = ncclGetErrorString(result); \ + fprintf(stderr, "NCCL error in: %s:%d, %s\n", __FILE__, __LINE__, \ + err.c_str()); \ + TORCH_CHECK(false, err); \ + } \ } while (0) +const int DICL_UNIQUE_ID_BYTES_SIZE = NCCL_UNIQUE_ID_BYTES; - const int DICL_UNIQUE_ID_BYTES_SIZE = NCCL_UNIQUE_ID_BYTES; - - DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { - ncclResult_t ncclAsyncErr_; - NCCL_THROW(ncclCommGetAsyncError(comm, &ncclAsyncErr_)); - if (ncclAsyncErr_ != ncclSuccess) { - return DICL_SUCCESS; - } else { - return DICL_ERR_UNDEF; - } - } - - DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId) { - NCCL_THROW(ncclGetUniqueId(uniqueId)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId, - int rank, int localDeviceId) { - NCCL_THROW(ncclCommInitRank(comm, nranks, uniqueId, rank)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclCommDestroy(ncclComm_t comm) { - NCCL_THROW(ncclCommDestroy(comm)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { - NCCL_THROW(ncclAllReduce(sendbuff, recvbuff, count, ncclDataType[datatype], ncclOp[reduceOp], comm, stream)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - int root, diclComm_t comm, deviceStream_t stream) { - NCCL_THROW(ncclBroadcast(sendbuff, recvbuff, count, ncclDataType[datatype], root, comm, stream)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, size_t sendCount, at::ScalarType datatype, - diclComm_t comm, deviceStream_t stream) { - NCCL_THROW(ncclAllGather(sendBuf, recvBuf, sendCount, ncclDataType[datatype], comm, stream)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclReduce(const void* sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream) { - NCCL_THROW(ncclReduce(sendbuff, recvbuff, count, ncclDataType[datatype], ncclOp[reduceOp], root, comm, stream)); - return DICL_SUCCESS; - } - - DIPU_API diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, size_t recvCount, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { - NCCL_THROW(ncclReduceScatter(sendBuf, recvBuf, recvCount, ncclDataType[datatype], ncclOp[reduceOp], comm, stream)); - } - - DIPU_API diclResult_t diclSend(void* sendbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream){ - NCCL_THROW(ncclSend(sendbuff, count, ncclDataType[datatype], peer, comm, stream)); +DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { + ncclResult_t ncclAsyncErr_; + NCCL_THROW(ncclCommGetAsyncError(comm, &ncclAsyncErr_)); + if (ncclAsyncErr_ != ncclSuccess) { return DICL_SUCCESS; + } else { + return DICL_ERR_UNDEF; } - - DIPU_API diclResult_t diclRecv(void* recvbuff, size_t count, at::ScalarType 
datatype, int peer, - diclComm_t comm, deviceStream_t stream) { - NCCL_THROW(ncclRecv(recvbuff, count, ncclDataType[datatype], peer, comm, stream)); - return DICL_SUCCESS; - } - -} // end namespace devapis -} // end namespace dipu +} + +DIPU_API diclResult_t diclGetUniqueId(commUniqueId *uniqueId) { + NCCL_THROW(ncclGetUniqueId(uniqueId)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, + commUniqueId uniqueId, int rank, + int localDeviceId) { + NCCL_THROW(ncclCommInitRank(comm, nranks, uniqueId, rank)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclCommDestroy(ncclComm_t comm) { + NCCL_THROW(ncclCommDestroy(comm)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, diclComm_t comm, + deviceStream_t stream) { + NCCL_THROW(ncclAllReduce(sendbuff, recvbuff, count, ncclDataType[datatype], + ncclOp[reduceOp], comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + int root, diclComm_t comm, + deviceStream_t stream) { + NCCL_THROW(ncclBroadcast(sendbuff, recvbuff, count, ncclDataType[datatype], + root, comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, + size_t sendCount, at::ScalarType datatype, + diclComm_t comm, deviceStream_t stream) { + NCCL_THROW(ncclAllGather(sendBuf, recvBuf, sendCount, ncclDataType[datatype], + comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclReduce(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, int root, + diclComm_t comm, deviceStream_t stream) { + NCCL_THROW(ncclReduce(sendbuff, recvbuff, count, ncclDataType[datatype], + ncclOp[reduceOp], root, comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclReduceScatter( + void *sendBuf, void *recvBuf, size_t recvCount, at::ScalarType datatype, + const ReduceOp &reduceOp, diclComm_t comm, deviceStream_t stream) { + NCCL_THROW(ncclReduceScatter(sendBuf, recvBuf, recvCount, + ncclDataType[datatype], ncclOp[reduceOp], comm, + stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclSend(void *sendbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + NCCL_THROW( + ncclSend(sendbuff, count, ncclDataType[datatype], peer, comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclRecv(void *recvbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + NCCL_THROW( + ncclRecv(recvbuff, count, ncclDataType[datatype], peer, comm, stream)); + return DICL_SUCCESS; +} + +} // end namespace devapis +} // end namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/deviceimpl.cpp index 97363a502..45f60b95a 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/deviceimpl.cpp @@ -1,11 +1,14 @@ // Copyright (c) 2023, DeepLink. 
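One sharp edge in the NCCL wrappers above: ncclOp[reduceOp] and ncclDataType[datatype] use std::map::operator[], which default-inserts a zero-initialized value for an unmapped key, so an unsupported dtype would silently become enumerator 0 (ncclInt8) and an unmapped op would become ncclSum rather than fail. A hedged alternative, not part of this patch, assuming the tables stay as defined above:

  // Throwing lookup; rejects dtypes missing from the table instead of
  // silently default-inserting enumerator 0.
  inline ncclDataType_t toNcclDataType(at::ScalarType type) {
    auto it = ncclDataType.find(type);
    TORCH_CHECK(it != ncclDataType.end(),
                "unsupported ScalarType for NCCL: ", c10::toString(type));
    return it->second;
  }

The droplet backend further down takes exactly this approach with its ScalarTypeToPcclDataType_t macro.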
#include -#include -#include + #include +#include +#include + namespace dipu { -DIPU_API devapis::VendorDeviceType VENDOR_TYPE = devapis::VendorDeviceType::CUDA; +DIPU_API devapis::VendorDeviceType VENDOR_TYPE = + devapis::VendorDeviceType::CUDA; namespace devapis { @@ -14,14 +17,9 @@ using cuda_deviceId = int; // Device class related // ===================== -void initializeVendor() { - -} - -void finalizeVendor() { - -} +void initializeVendor() {} +void finalizeVendor() {} deviceId_t current_device() { cuda_deviceId devId_; @@ -51,71 +49,64 @@ DIPUDeviceStatus getDeviceStatus(int32_t device_index) { // in cuda_runtime_api.h // set current device given device according to id void setDevice(deviceId_t devId) { - cuda_deviceId devId_ = static_cast(devId); - DIPU_CALLCUDA(::cudaSetDevice(devId_)) + cuda_deviceId devId_ = static_cast(devId); + DIPU_CALLCUDA(::cudaSetDevice(devId_)) } -void resetDevice(deviceId_t devId) { - DIPU_CALLCUDA(::cudaDeviceReset()) -} +void resetDevice(deviceId_t devId) { DIPU_CALLCUDA(::cudaDeviceReset()) } -void syncDevice() { - DIPU_CALLCUDA(::cudaDeviceSynchronize()) -} +void syncDevice() { DIPU_CALLCUDA(::cudaDeviceSynchronize()) } // check last launch succ or not, throw if fail -void checkLastError() { - DIPU_CALLCUDA(::cudaGetLastError()) -} +void checkLastError() { DIPU_CALLCUDA(::cudaGetLastError()) } int getDeviceCount() { int num = -1; - DIPU_CALLCUDA(::cudaGetDeviceCount(reinterpret_cast(&num))) + DIPU_CALLCUDA(::cudaGetDeviceCount(reinterpret_cast(&num))) return num; } -void getDriverVersion(int* version) { - DIPU_CALLCUDA(::cudaDriverGetVersion(version)) +void getDriverVersion(int *version) { + DIPU_CALLCUDA(::cudaDriverGetVersion(version)) } -void getRuntimeVersion(int* version) { - DIPU_CALLCUDA(::cudaRuntimeGetVersion(version)) +void getRuntimeVersion(int *version) { + DIPU_CALLCUDA(::cudaRuntimeGetVersion(version)) } // ===================== // device stream related // ===================== -void createStream(deviceStream_t* stream, bool prior) { - if (prior) { - DIPU_CALLCUDA(::cudaStreamCreateWithPriority(stream, cudaStreamDefault, -1)) - } else { - DIPU_CALLCUDA(::cudaStreamCreate(stream)) - } +void createStream(deviceStream_t *stream, bool prior) { + if (prior) { + DIPU_CALLCUDA(::cudaStreamCreateWithPriority(stream, cudaStreamDefault, -1)) + } else { + DIPU_CALLCUDA(::cudaStreamCreate(stream)) + } } void destroyStream(deviceStream_t stream) { - DIPU_CALLCUDA(::cudaStreamDestroy(stream)) + DIPU_CALLCUDA(::cudaStreamDestroy(stream)) } void destroyStream(deviceStream_t stream, deviceId_t devId) { - setDevice(devId); - destroyStream(stream); + setDevice(devId); + destroyStream(stream); } -void releaseStream() { - return; -} +void releaseStream() { return; } bool streamNotNull(deviceStream_t stream) { - return (stream != nullptr && stream != cudaStreamLegacy && stream != cudaStreamPerThread); + return (stream != nullptr && stream != cudaStreamLegacy && + stream != cudaStreamPerThread); } void syncStream(deviceStream_t stream) { - DIPU_CALLCUDA(::cudaStreamSynchronize(stream)); + DIPU_CALLCUDA(::cudaStreamSynchronize(stream)); } void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) { - DIPU_CALLCUDA(::cudaStreamWaitEvent(stream, event, 0)) + DIPU_CALLCUDA(::cudaStreamWaitEvent(stream, event, 0)) } bool isStreamEmpty(deviceStream_t stream) { @@ -126,145 +117,135 @@ bool isStreamEmpty(deviceStream_t stream) { return false; } - // ===================== // device event related // ===================== -void createEvent(deviceEvent_t* event) { 
- static bool enableTiming = []() { - const char* env = std::getenv("DIPU_CUDA_EVENT_TIMING"); - if (env) { - return std::atoi(env) > 0; - } else { - return true; - } - }(); - - DIPU_CALLCUDA(::cudaEventCreateWithFlags(event, enableTiming ? cudaEventDefault : cudaEventDisableTiming)) +void createEvent(deviceEvent_t *event) { + static bool enableTiming = []() { + const char *env = std::getenv("DIPU_CUDA_EVENT_TIMING"); + if (env) { + return std::atoi(env) > 0; + } else { + return true; + } + }(); + + DIPU_CALLCUDA(::cudaEventCreateWithFlags( + event, enableTiming ? cudaEventDefault : cudaEventDisableTiming)) } void destroyEvent(deviceEvent_t event) { - DIPU_CALLCUDA(::cudaEventDestroy(event)) + DIPU_CALLCUDA(::cudaEventDestroy(event)) } void waitEvent(deviceEvent_t event) { - DIPU_CALLCUDA(::cudaEventSynchronize(event)) + DIPU_CALLCUDA(::cudaEventSynchronize(event)) } void recordEvent(deviceEvent_t event, deviceStream_t stream) { - DIPU_CALLCUDA(::cudaEventRecord(event, stream)) + DIPU_CALLCUDA(::cudaEventRecord(event, stream)) } -void eventElapsedTime(float* time, deviceEvent_t start, deviceEvent_t end) { - DIPU_CALLCUDA(cudaEventElapsedTime(time, start, end)) -} +void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end) { + DIPU_CALLCUDA(cudaEventElapsedTime(time, start, end)) +} EventStatus getEventStatus(deviceEvent_t event) { - ::cudaError_t ret = ::cudaEventQuery(event); - if (ret == ::cudaSuccess) { - return devapis::EventStatus::READY; - } else if (ret == ::cudaErrorNotReady) { - ::cudaGetLastError(); /* reset internal error state*/ - return devapis::EventStatus::PENDING; - } else { - TORCH_CHECK(false, "unexpected event status in getEventStatus, ret = ", ret); - } + ::cudaError_t ret = ::cudaEventQuery(event); + if (ret == ::cudaSuccess) { + return devapis::EventStatus::READY; + } else if (ret == ::cudaErrorNotReady) { + ::cudaGetLastError(); /* reset internal error state*/ + return devapis::EventStatus::PENDING; + } else { + TORCH_CHECK(false, + "unexpected event status in getEventStatus, ret = ", ret); + } } // ===================== // mem related // ===================== -void mallocHost(void** p, size_t nbytes) { - DIPU_CALLCUDA(::cudaMallocHost(p, nbytes)) +void mallocHost(void **p, size_t nbytes) { + DIPU_CALLCUDA(::cudaMallocHost(p, nbytes)) } -void freeHost(void* p) { - DIPU_CALLCUDA(::cudaFreeHost(p)) -} +void freeHost(void *p) { DIPU_CALLCUDA(::cudaFreeHost(p)) } OpStatus mallocDevice(void **p, size_t nbytes, bool throwExcepion) { - ::cudaError_t r = ::cudaMalloc(p, nbytes); - if (r != ::cudaSuccess) { - if(throwExcepion) { - ::cudaGetLastError(); /* reset internal error state*/ - TORCH_CHECK(false, "alloc failed in mallocDevice, ret = ", r, " size= ", nbytes); - } - else if(r == ::cudaErrorMemoryAllocation) { - return OpStatus::ERR_NOMEM; - } - else { - return OpStatus::ERR_UNKNOWN; - } + ::cudaError_t r = ::cudaMalloc(p, nbytes); + if (r != ::cudaSuccess) { + if (throwExcepion) { + ::cudaGetLastError(); /* reset internal error state*/ + TORCH_CHECK(false, "alloc failed in mallocDevice, ret = ", r, + " size= ", nbytes); + } else if (r == ::cudaErrorMemoryAllocation) { + return OpStatus::ERR_NOMEM; + } else { + return OpStatus::ERR_UNKNOWN; } - return OpStatus::SUCCESS; + } + return OpStatus::SUCCESS; } -void freeDevice(void* p) { - DIPU_CALLCUDA(::cudaFree(p)) -} +void freeDevice(void *p) { DIPU_CALLCUDA(::cudaFree(p)) } bool isPinnedPtr(const void *p) { - ::cudaPointerAttributes attr; - DIPU_CALLCUDA(::cudaPointerGetAttributes(&attr, p)) - return attr.type == 
cudaMemoryTypeHost; + ::cudaPointerAttributes attr; + DIPU_CALLCUDA(::cudaPointerGetAttributes(&attr, p)) + return attr.type == cudaMemoryTypeHost; } -void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) { - DIPU_CALLCUDA(::cudaMemsetAsync(ptr, val, size, stream)) +void memSetAsync(const deviceStream_t stream, void *ptr, int val, size_t size) { + DIPU_CALLCUDA(::cudaMemsetAsync(ptr, val, size, stream)) } -void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { - if (dstDevId == srcDevId) { - DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyDeviceToDevice)) - } else { - DIPU_CALLCUDA(::cudaMemcpyPeer(dst, dstDevId, src, srcDevId, nbytes)) - } +void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src) { + if (dstDevId == srcDevId) { + DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyDeviceToDevice)) + } else { + DIPU_CALLCUDA(::cudaMemcpyPeer(dst, dstDevId, src, srcDevId, nbytes)) + } } // (synchronous) copy from host to a CUDA device -void memCopyH2D(size_t nbytes, void* dst, const void* src) { - DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyHostToDevice)) +void memCopyH2D(size_t nbytes, void *dst, const void *src) { + DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyHostToDevice)) } // (synchronous) copy from a CUDA device to host -void memCopyD2H(size_t nbytes, void* dst, const void* src) { - DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyDeviceToHost)) +void memCopyD2H(size_t nbytes, void *dst, const void *src) { + DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyDeviceToHost)) } // (asynchronous) copy from device to a device void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, - deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { + deviceId_t dstDevId, void *dst, deviceId_t srcDevId, + const void *src) { if (dstDevId == srcDevId) { - DIPU_CALLCUDA(::cudaMemcpyAsync( - dst, src, nbytes, cudaMemcpyDeviceToDevice, stream)) + DIPU_CALLCUDA( + ::cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, stream)) } else { - DIPU_CALLCUDA(::cudaMemcpyPeerAsync( - dst, dstDevId, src, srcDevId, nbytes, stream)) + DIPU_CALLCUDA( + ::cudaMemcpyPeerAsync(dst, dstDevId, src, srcDevId, nbytes, stream)) } } // (asynchronous) copy from host to a device -void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void* dst, const void* src) { - DIPU_CALLCUDA(::cudaMemcpyAsync( - dst, src, nbytes, cudaMemcpyHostToDevice, stream)) +void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void *dst, + const void *src) { + DIPU_CALLCUDA( + ::cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, stream)) } // (asynchronous) copy from a device to host -void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void* dst, const void* src) { - DIPU_CALLCUDA(::cudaMemcpyAsync( - dst, src, nbytes, cudaMemcpyDeviceToHost, stream)); +void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void *dst, + const void *src) { + DIPU_CALLCUDA( + ::cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToHost, stream)); } -} // end namespace devapis +} // end namespace devapis } // namespace dipu - - - - - - - - - diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/DIPUPatchCudaAllocator.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/DIPUPatchCudaAllocator.cpp index dd29fefb0..2033c0df7 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/DIPUPatchCudaAllocator.cpp +++ 
b/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/DIPUPatchCudaAllocator.cpp @@ -1,7 +1,9 @@ -#include -#include "c10/cuda/CUDACachingAllocator.h" -#include #include +#include + +#include "c10/cuda/CUDACachingAllocator.h" + +#include namespace c10 { @@ -9,55 +11,89 @@ namespace cuda { namespace CUDACachingAllocator { - -#define DIPU_PATCH_CUDA_ALLOCATOR(x) \ - std::cout << __FUNCTION__ << ":" << __LINE__ << " this function should not be called!" x << std::endl; +#define DIPU_PATCH_CUDA_ALLOCATOR(x) \ + std::cout << __FUNCTION__ << ":" << __LINE__ \ + << " this function should not be called!" x << std::endl; class DIPUCUDAAllocatorProxy : public CUDAAllocator { - std::unordered_map tempMemBlock; + std::unordered_map tempMemBlock; using mutex_t = std::mutex; mutable mutex_t mut_; + public: - virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override {DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void setMemoryFraction(double fraction, int device) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void* getBaseAllocation(void* ptr, size_t* size) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void recordStream(const DataPtr&, CUDAStream stream) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual DeviceStats getDeviceStats(int device) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void resetAccumulatedStats(int device) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void resetPeakStats(int device) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual SnapshotInfo snapshot() override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void notifyCaptureBegin(int device, CaptureId_t graph_id, MempoolId_t mempool_id) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void notifyCaptureEnded(int device, CaptureId_t graph_id) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void notifyCaptureDestroy(int device, MempoolId_t mempool_id) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual std::shared_ptr getIpcDevPtr(std::string handle) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void recordHistory(bool enabled, CreateContextFn context_recorder, size_t alloc_trace_max_entries, bool alloc_trace_record_context) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual std::string name() override { DIPU_PATCH_CUDA_ALLOCATOR();} - virtual void cacheInfo(int dev_id, size_t* largestBlock) override {DIPU_PATCH_CUDA_ALLOCATOR();} - - virtual void* raw_alloc(size_t nbytes) override { + virtual void *raw_alloc_with_stream(size_t nbytes, + cudaStream_t stream) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual void setMemoryFraction(double fraction, int device) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual void *getBaseAllocation(void *ptr, size_t *size) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual void recordStream(const DataPtr &, CUDAStream stream) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual DeviceStats getDeviceStats(int device) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual void resetAccumulatedStats(int device) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual void resetPeakStats(int device) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual SnapshotInfo snapshot() override { DIPU_PATCH_CUDA_ALLOCATOR(); } + virtual void notifyCaptureBegin(int device, CaptureId_t graph_id, + MempoolId_t mempool_id) override { + 
DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual void notifyCaptureAboutToEnd(int device, + CaptureId_t graph_id) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual void notifyCaptureEnded(int device, CaptureId_t graph_id) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual void notifyCaptureDestroy(int device, + MempoolId_t mempool_id) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual std::shared_ptr getIpcDevPtr(std::string handle) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual void recordHistory(bool enabled, CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + bool alloc_trace_record_context) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual void attachOutOfMemoryObserver( + OutOfMemoryObserver observer) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + virtual std::string name() override { DIPU_PATCH_CUDA_ALLOCATOR(); } + virtual void cacheInfo(int dev_id, size_t *largestBlock) override { + DIPU_PATCH_CUDA_ALLOCATOR(); + } + + virtual void *raw_alloc(size_t nbytes) override { auto data_ptr = this->allocate(nbytes); - void* ptr = data_ptr.get(); + void *ptr = data_ptr.get(); std::lock_guard lk(mut_); tempMemBlock.emplace(ptr, std::move(data_ptr)); return ptr; } - virtual void raw_delete(void* ptr) override { + virtual void raw_delete(void *ptr) override { std::lock_guard lk(mut_); tempMemBlock.erase(ptr); } virtual void init(int device_count) override {} - virtual bool initialized() override { - return true; - } + virtual bool initialized() override { return true; } - virtual void emptyCache() override { - dipu::emptyCachedMem(); - } + virtual void emptyCache() override { dipu::emptyCachedMem(); } virtual bool needsPoolSpecificPeerAccess() override { // DIPU_PATCH_CUDA_ALLOCATOR(); @@ -65,23 +101,24 @@ class DIPUCUDAAllocatorProxy : public CUDAAllocator { } virtual DataPtr allocate(size_t n) const override { - //DIPU_PATCH_CUDA_ALLOCATOR(); + // DIPU_PATCH_CUDA_ALLOCATOR(); auto data_ptr = c10::GetAllocator(dipu::DIPU_DEVICE_TYPE)->allocate(n); - data_ptr.unsafe_set_device(c10::Device(c10::DeviceType::CUDA, data_ptr.device().index())); + data_ptr.unsafe_set_device( + c10::Device(c10::DeviceType::CUDA, data_ptr.device().index())); return data_ptr; } }; -} // namespace CUDACachingAllocator +} // namespace CUDACachingAllocator -} // namespace cuda +} // namespace cuda -} // namespace c10 +} // namespace c10 namespace dipu { int patchCachingAllocator() { - const char* env = std::getenv("DIPU_PATCH_CUDA_CACHED_ALLOCATOR"); + const char *env = std::getenv("DIPU_PATCH_CUDA_CACHED_ALLOCATOR"); if (env != nullptr) { if (std::atoi(env) <= 0) { return 0; @@ -91,15 +128,20 @@ int patchCachingAllocator() { } /* Our implementation idea is different from the native pytorch implementation, - so the interface cannot be fully aligned. We only implement the most basic and necessary functions. + so the interface cannot be fully aligned. We only implement the most basic + and necessary functions. 
*/ - static c10::cuda::CUDACachingAllocator::DIPUCUDAAllocatorProxy cuda_allocator_proxy; - c10::cuda::CUDACachingAllocator::allocator.store(dynamic_cast(&cuda_allocator_proxy)); + static c10::cuda::CUDACachingAllocator::DIPUCUDAAllocatorProxy + cuda_allocator_proxy; + c10::cuda::CUDACachingAllocator::allocator.store( + dynamic_cast( + &cuda_allocator_proxy)); return 0; } /*This order is really unrequired and unimportant, -and this compilation unit may not be compiled, so it is still initialized with global variables +and this compilation unit may not be compiled, so it is still initialized with +global variables */ static int n = patchCachingAllocator(); -} // namespace dipu \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/wrapperRegister.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/wrapperRegister.cpp index cacad16c9..569261030 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/wrapperRegister.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/wrapperRegister.cpp @@ -1,7 +1,7 @@ // Copyright (c) 2023, DeepLink. #include -#include #include +#include namespace at { diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/vendorapi.h b/dipu/torch_dipu/csrc_dipu/vendor/cuda/vendorapi.h index 50260ec46..c4220a294 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/vendorapi.h +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/vendorapi.h @@ -11,11 +11,12 @@ namespace dipu { -#define DIPU_CALLCUDA(Expr) \ -{ \ - cudaError_t ret = Expr; \ - TORCH_CHECK(ret == ::cudaSuccess, "call cuda error, expr = ", #Expr, ", ret = ", ret); \ -} +#define DIPU_CALLCUDA(Expr) \ + { \ + cudaError_t ret = Expr; \ + TORCH_CHECK(ret == ::cudaSuccess, "call cuda error, expr = ", #Expr, \ + ", ret = ", ret); \ + } using deviceStream_t = cudaStream_t; #define deviceDefaultStreamLiteral cudaStreamLegacy @@ -24,9 +25,4 @@ using deviceEvent_t = cudaEvent_t; using diclComm_t = ncclComm_t; using commUniqueId = ncclUniqueId; -} - - - - - +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp index c731ce39d..c6b10723e 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp @@ -1,10 +1,9 @@ -#include #include +#include -#include #include #include -#include +#include namespace dipu { @@ -14,19 +13,17 @@ namespace dipu { // just an example // not implemented now class DROPLETGeneratorImpl : public dipu::DIPUGeneratorImpl { -public: - DROPLETGeneratorImpl(at::DeviceIndex device_index): dipu::DIPUGeneratorImpl(device_index) { - } + public: + DROPLETGeneratorImpl(at::DeviceIndex device_index) + : dipu::DIPUGeneratorImpl(device_index) {} - void set_state(const c10::TensorImpl& state) override { - } + void set_state(const c10::TensorImpl &state) override {} - void update_state() const override { - } + void update_state() const override {} }; const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) { return at::make_generator(device_index); } -} \ No newline at end of file +} // namespace dipu \ No newline at end of file diff --git a/dipu/torch_dipu/csrc_dipu/vendor/droplet/communicatorimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/droplet/communicatorimpl.cpp index 8ed8126f0..9b04379b1 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/droplet/communicatorimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/droplet/communicatorimpl.cpp @@ -4,7 
+4,7 @@ #include #ifdef USE_PCCL #include -#endif // USE_PCCL +#endif // USE_PCCL #include #include @@ -24,24 +24,23 @@ const int DICL_UNIQUE_ID_BYTES_SIZE = PCCL_UNIQUE_ID_BYTES; // PCCL reduce-op mapping static const std::map toPcclReduceOp = { - {ReduceOp::MIN, pcclMin}, - {ReduceOp::MAX, pcclMax}, - {ReduceOp::SUM, pcclSum}, - {ReduceOp::PRODUCT, pcclProd}, + {ReduceOp::MIN, pcclMin}, {ReduceOp::MAX, pcclMax}, + {ReduceOp::SUM, pcclSum}, {ReduceOp::PRODUCT, pcclProd}, {ReduceOp::AVG, pcclAvg}, }; // TODO: find a better function to get reduce-op's name -#define RedOpTypeToPcclRedOp_t(op_type, pccl_op) \ -pcclRedOp_t pccl_op; \ -{ \ - auto p = toPcclReduceOp.find(op_type); \ - if (p == toPcclReduceOp.end()) { \ - std::string err = "Unsupported reduce op " + std::to_string(op_type) + " at: " LOCATION "\n"; \ - throw std::runtime_error(err); \ - } \ - pccl_op = p->second; \ -} +#define RedOpTypeToPcclRedOp_t(op_type, pccl_op) \ + pcclRedOp_t pccl_op; \ + { \ + auto p = toPcclReduceOp.find(op_type); \ + if (p == toPcclReduceOp.end()) { \ + std::string err = "Unsupported reduce op " + std::to_string(op_type) + \ + " at: " LOCATION "\n"; \ + throw std::runtime_error(err); \ + } \ + pccl_op = p->second; \ + } // PCCL dtype mapping static const std::map toPcclDataType = { @@ -57,166 +56,193 @@ static const std::map toPcclDataType = { {at::kBFloat16, pcclBfloat16}, }; -#define ScalarTypeToPcclDataType_t(scalar_type, pccl_data_type) \ -pcclDataType_t pccl_data_type; \ -{ \ - auto p = toPcclDataType.find(scalar_type); \ - if (p == toPcclDataType.end()) { \ - std::string err = std::string("Not supported ScalarType ") + c10::toString(scalar_type) + \ - " at: " LOCATION "\n"; \ - throw std::runtime_error(err); \ - } \ - pccl_data_type = p->second; \ -} +#define ScalarTypeToPcclDataType_t(scalar_type, pccl_data_type) \ + pcclDataType_t pccl_data_type; \ + { \ + auto p = toPcclDataType.find(scalar_type); \ + if (p == toPcclDataType.end()) { \ + std::string err = std::string("Not supported ScalarType ") + \ + c10::toString(scalar_type) + " at: " LOCATION "\n"; \ + throw std::runtime_error(err); \ + } \ + pccl_data_type = p->second; \ + } // Macro to print and abort on a non-successful PCCL return value. -#define CALL_PCCL(expr) \ -do { \ - pcclResult_t result = expr; \ - if (result != pcclSuccess) { \ - std::string err = "PCCL error at: " LOCATION ", return code=" + std::to_string(result) + \ - ", err_str:" + pcclGetErrorString(result); \ - throw std::runtime_error(err); \ - } \ -} while (0) +#define CALL_PCCL(expr) \ + do { \ + pcclResult_t result = expr; \ + if (result != pcclSuccess) { \ + std::string err = "PCCL error at: " LOCATION ", return code=" + \ + std::to_string(result) + \ + ", err_str:" + pcclGetErrorString(result); \ + throw std::runtime_error(err); \ + } \ + } while (0) DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { - pcclResult_t pcclAsyncErr; - CALL_PCCL(pcclCommGetAsyncError(comm, &pcclAsyncErr)); - // shuold we return pcclInProgress as success or not? 
- if (pcclAsyncErr != pcclSuccess) { - return DICL_ERR_UNDEF; - } else { - return DICL_SUCCESS; - } -} - -DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId) { - CALL_PCCL(pcclGetUniqueId(uniqueId)); - return DICL_SUCCESS; -} - -DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId, - int rank, int localDeviceId) { - CALL_PCCL(pcclCommInitRank(comm, nranks, uniqueId, rank)); - return DICL_SUCCESS; -} - -DIPU_API diclResult_t diclCommDestroy(diclComm_t comm) { - CALL_PCCL(pcclCommDestroy(comm)); - return DICL_SUCCESS; -} - -DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { - ScalarTypeToPcclDataType_t(datatype, pcclDataType); - RedOpTypeToPcclRedOp_t(reduceOp, pcclReduceOp); - CALL_PCCL(pcclAllReduce(sendbuff, recvbuff, count, pcclDataType, pcclReduceOp, comm, stream)); - return DICL_SUCCESS; -} - -DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - int root, diclComm_t comm, deviceStream_t stream) { - ScalarTypeToPcclDataType_t(datatype, pcclDataType); - CALL_PCCL(pcclBroadcast(sendbuff, recvbuff, count, pcclDataType, root, comm, stream)); - return DICL_SUCCESS; -} - -DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, size_t count, at::ScalarType datatype, - diclComm_t comm, deviceStream_t stream) { - ScalarTypeToPcclDataType_t(datatype, pcclDataType); - CALL_PCCL(pcclAllGather(sendBuf, recvBuf, count, pcclDataType, comm, stream)); - return DICL_SUCCESS; -} - -DIPU_API diclResult_t diclReduce(const void* sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream) { - ScalarTypeToPcclDataType_t(datatype, pcclDataType); - RedOpTypeToPcclRedOp_t(reduceOp, pcclReduceOp); - CALL_PCCL(pcclReduce(sendbuff, recvbuff, count, pcclDataType, pcclReduceOp, root, comm, stream)); - return DICL_SUCCESS; -} - -DIPU_API diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, size_t recvCount, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { - ScalarTypeToPcclDataType_t(datatype, pcclDataType); - RedOpTypeToPcclRedOp_t(reduceOp, pcclReduceOp); - CALL_PCCL(pcclReduceScatter(sendBuf, recvBuf, recvCount, pcclDataType, pcclReduceOp, comm, stream)); + pcclResult_t pcclAsyncErr; + CALL_PCCL(pcclCommGetAsyncError(comm, &pcclAsyncErr)); + // should we return pcclInProgress as success or not? 
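On the question in the comment above: if PCCL mirrors NCCL's nonblocking semantics, pcclInProgress means an operation is still running rather than failed, so mapping it to success would look like the sketch below (illustrative only; it assumes PCCL defines pcclInProgress analogously to NCCL's ncclInProgress):

  // Treat "still in progress" as healthy; anything else is an error.
  if (pcclAsyncErr == pcclSuccess || pcclAsyncErr == pcclInProgress) {
    return DICL_SUCCESS;
  }
  return DICL_ERR_UNDEF;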
+ if (pcclAsyncErr != pcclSuccess) { + return DICL_ERR_UNDEF; + } else { return DICL_SUCCESS; + } } -DIPU_API diclResult_t diclSend(void* sendbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream){ - ScalarTypeToPcclDataType_t(datatype, pcclDataType); - CALL_PCCL(pcclSend(sendbuff, count, pcclDataType, peer, comm, stream)); - return DICL_SUCCESS; +DIPU_API diclResult_t diclGetUniqueId(commUniqueId *uniqueId) { + CALL_PCCL(pcclGetUniqueId(uniqueId)); + return DICL_SUCCESS; } -DIPU_API diclResult_t diclRecv(void* recvbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream) { - ScalarTypeToPcclDataType_t(datatype, pcclDataType); - CALL_PCCL(pcclRecv(recvbuff, count, pcclDataType, peer, comm, stream)); - return DICL_SUCCESS; +DIPU_API diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, + commUniqueId uniqueId, int rank, + int localDeviceId) { + CALL_PCCL(pcclCommInitRank(comm, nranks, uniqueId, rank)); + return DICL_SUCCESS; } -#else // USE_PCCL +DIPU_API diclResult_t diclCommDestroy(diclComm_t comm) { + CALL_PCCL(pcclCommDestroy(comm)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, diclComm_t comm, + deviceStream_t stream) { + ScalarTypeToPcclDataType_t(datatype, pcclDataType); + RedOpTypeToPcclRedOp_t(reduceOp, pcclReduceOp); + CALL_PCCL(pcclAllReduce(sendbuff, recvbuff, count, pcclDataType, pcclReduceOp, + comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + int root, diclComm_t comm, + deviceStream_t stream) { + ScalarTypeToPcclDataType_t(datatype, pcclDataType); + CALL_PCCL(pcclBroadcast(sendbuff, recvbuff, count, pcclDataType, root, comm, + stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, + size_t count, at::ScalarType datatype, + diclComm_t comm, deviceStream_t stream) { + ScalarTypeToPcclDataType_t(datatype, pcclDataType); + CALL_PCCL(pcclAllGather(sendBuf, recvBuf, count, pcclDataType, comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclReduce(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, int root, + diclComm_t comm, deviceStream_t stream) { + ScalarTypeToPcclDataType_t(datatype, pcclDataType); + RedOpTypeToPcclRedOp_t(reduceOp, pcclReduceOp); + CALL_PCCL(pcclReduce(sendbuff, recvbuff, count, pcclDataType, pcclReduceOp, + root, comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclReduceScatter( + void *sendBuf, void *recvBuf, size_t recvCount, at::ScalarType datatype, + const ReduceOp &reduceOp, diclComm_t comm, deviceStream_t stream) { + ScalarTypeToPcclDataType_t(datatype, pcclDataType); + RedOpTypeToPcclRedOp_t(reduceOp, pcclReduceOp); + CALL_PCCL(pcclReduceScatter(sendBuf, recvBuf, recvCount, pcclDataType, + pcclReduceOp, comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclSend(void *sendbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + ScalarTypeToPcclDataType_t(datatype, pcclDataType); + CALL_PCCL(pcclSend(sendbuff, count, pcclDataType, peer, comm, stream)); + return DICL_SUCCESS; +} + +DIPU_API diclResult_t diclRecv(void *recvbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + 
ScalarTypeToPcclDataType_t(datatype, pcclDataType); + CALL_PCCL(pcclRecv(recvbuff, count, pcclDataType, peer, comm, stream)); + return DICL_SUCCESS; +} + +#else // USE_PCCL const int DICL_UNIQUE_ID_BYTES_SIZE = 0; DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { - return DICL_ERR_UNDEF; + return DICL_ERR_UNDEF; } -DIPU_API diclResult_t diclGetUniqueId(pcclUniqueId* uniqueId) { - return DICL_ERR_UNDEF; +DIPU_API diclResult_t diclGetUniqueId(pcclUniqueId *uniqueId) { + return DICL_ERR_UNDEF; } -DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, pcclUniqueId uniqueId, - int rank, int localDeviceId) { - return DICL_ERR_UNDEF; +DIPU_API diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, + pcclUniqueId uniqueId, int rank, + int localDeviceId) { + return DICL_ERR_UNDEF; } DIPU_API diclResult_t diclCommDestroy(diclComm_t comm) { - return DICL_ERR_UNDEF; + return DICL_ERR_UNDEF; } -DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { - return DICL_ERR_UNDEF; +DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, diclComm_t comm, + deviceStream_t stream) { + return DICL_ERR_UNDEF; } -DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - int root, diclComm_t comm, deviceStream_t stream) { - return DICL_ERR_UNDEF; +DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + int root, diclComm_t comm, + deviceStream_t stream) { + return DICL_ERR_UNDEF; } -DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, size_t count, at::ScalarType datatype, - diclComm_t comm, deviceStream_t stream) { - return DICL_ERR_UNDEF; +DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, + size_t count, at::ScalarType datatype, + diclComm_t comm, deviceStream_t stream) { + return DICL_ERR_UNDEF; } -DIPU_API diclResult_t diclReduce(const void* sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream) { - return DICL_ERR_UNDEF; +DIPU_API diclResult_t diclReduce(const void *sendbuff, void *recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp &reduceOp, int root, + diclComm_t comm, deviceStream_t stream) { + return DICL_ERR_UNDEF; } -DIPU_API diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, size_t recvCount, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { - return DICL_ERR_UNDEF; +DIPU_API diclResult_t diclReduceScatter( + void *sendBuf, void *recvBuf, size_t recvCount, at::ScalarType datatype, + const ReduceOp &reduceOp, diclComm_t comm, deviceStream_t stream) { + return DICL_ERR_UNDEF; } -DIPU_API diclResult_t diclSend(void* sendbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream){ - return DICL_ERR_UNDEF; +DIPU_API diclResult_t diclSend(void *sendbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + return DICL_ERR_UNDEF; } -DIPU_API diclResult_t diclRecv(void* recvbuff, size_t count, at::ScalarType datatype, int peer, - diclComm_t comm, deviceStream_t stream) { - return DICL_ERR_UNDEF; +DIPU_API diclResult_t diclRecv(void *recvbuff, size_t count, + at::ScalarType 
datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + return DICL_ERR_UNDEF; } -#endif // USE_PCCL +#endif // USE_PCCL -} // end namespace devapis +} // end namespace devapis -} // end namespace dipu +} // end namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/droplet/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/droplet/deviceimpl.cpp index 2e6795e4a..33744df79 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/droplet/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/droplet/deviceimpl.cpp @@ -1,10 +1,11 @@ -#include #include +#include namespace dipu { -DIPU_API devapis::VendorDeviceType VENDOR_TYPE = devapis::VendorDeviceType::DROPLET; +DIPU_API devapis::VendorDeviceType VENDOR_TYPE = + devapis::VendorDeviceType::DROPLET; namespace devapis { @@ -14,15 +15,13 @@ using DROPLET_deviceId = int; // ===================== void initializeVendor() { -// according to the discussion with droplet team, make a random runtime call -// to make sure droplet runtime software is initialized correctly + // according to the discussion with droplet team, make a random runtime call + // to make sure droplet runtime software is initialized correctly int num = -1; - DIPU_CALLDROPLET(::tangGetDeviceCount(reinterpret_cast(&num))) + DIPU_CALLDROPLET(::tangGetDeviceCount(reinterpret_cast(&num))) } -void finalizeVendor() { - -} +void finalizeVendor() {} deviceId_t current_device() { DROPLET_deviceId devId_; @@ -31,79 +30,74 @@ deviceId_t current_device() { } DIPUDeviceProperties getDeviceProperties(int32_t device_index) { - DIPUDeviceProperties prop; return prop; } // set current device given device according to id void setDevice(deviceId_t devId) { - DROPLET_deviceId devId_ = static_cast(devId); - DIPU_CALLDROPLET(::tangSetDevice(devId_)) + DROPLET_deviceId devId_ = static_cast(devId); + DIPU_CALLDROPLET(::tangSetDevice(devId_)) } void resetDevice(deviceId_t devId) { - // DIPU_CALLDROPLET(::tangDeviceReset()) + // DIPU_CALLDROPLET(::tangDeviceReset()) } -void syncDevice() { - DIPU_CALLDROPLET(::tangDeviceSynchronize()) -} +void syncDevice() { DIPU_CALLDROPLET(::tangDeviceSynchronize()) } // check last launch succ or not, throw if fail -void checkLastError() { - DIPU_CALLDROPLET(::tangGetLastError()) -} +void checkLastError() { DIPU_CALLDROPLET(::tangGetLastError()) } int getDeviceCount() { int num = -1; - DIPU_CALLDROPLET(::tangGetDeviceCount(reinterpret_cast(&num))) + DIPU_CALLDROPLET(::tangGetDeviceCount(reinterpret_cast(&num))) return num; } -void getDriverVersion(int* version) { - // DIPU_CALLDROPLET(::tangDriverGetVersion(version)) +void getDriverVersion(int *version) { + // DIPU_CALLDROPLET(::tangDriverGetVersion(version)) } -void getRuntimeVersion(int* version) { - // DIPU_CALLDROPLET(::tangRuntimeGetVersion(version)) +void getRuntimeVersion(int *version) { + // DIPU_CALLDROPLET(::tangRuntimeGetVersion(version)) } // ===================== // device stream related // ===================== -void createStream(deviceStream_t* stream, bool prior) { - if (prior) { - DIPU_CALLDROPLET(::tangStreamCreateWithPriority(stream, tangStreamDefault, -1)) - } else { - DIPU_CALLDROPLET(::tangStreamCreate(stream)) - } +void createStream(deviceStream_t *stream, bool prior) { + if (prior) { + DIPU_CALLDROPLET( + ::tangStreamCreateWithPriority(stream, tangStreamDefault, -1)) + } else { + DIPU_CALLDROPLET(::tangStreamCreate(stream)) + } } void destroyStream(deviceStream_t stream) { - DIPU_CALLDROPLET(::tangStreamDestroy(stream)) + DIPU_CALLDROPLET(::tangStreamDestroy(stream)) } void 
destroyStream(deviceStream_t stream, deviceId_t devId) { - setDevice(devId); - destroyStream(stream); + setDevice(devId); + destroyStream(stream); } -void releaseStream() { - return; -} +void releaseStream() { return; } bool streamNotNull(deviceStream_t stream) { - return stream != nullptr; - // return (stream != nullptr && stream != tangStreamLegacy && stream != tangStreamPerThread); + return stream != nullptr; + // return (stream != nullptr && stream != tangStreamLegacy && stream != + // tangStreamPerThread); } void syncStream(deviceStream_t stream) { - DIPU_CALLDROPLET(::tangStreamSynchronize(stream)); + DIPU_CALLDROPLET(::tangStreamSynchronize(stream)); } void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) { - DIPU_CALLDROPLET(::tangStreamWaitEvent(stream, event, 0)) + DIPU_CALLDROPLET(::tangStreamWaitEvent(stream, event, 0)) } bool isStreamEmpty(deviceStream_t stream) { @@ -114,138 +108,125 @@ bool isStreamEmpty(deviceStream_t stream) { return false; } - // ===================== // device event related // ===================== -void createEvent(deviceEvent_t* event) { - DIPU_CALLDROPLET(::tangEventCreateWithFlags(event, tangEventDisableTiming)) +void createEvent(deviceEvent_t *event) { + DIPU_CALLDROPLET(::tangEventCreateWithFlags(event, tangEventDisableTiming)) } void destroyEvent(deviceEvent_t event) { - DIPU_CALLDROPLET(::tangEventDestroy(event)) + DIPU_CALLDROPLET(::tangEventDestroy(event)) } void waitEvent(deviceEvent_t event) { - DIPU_CALLDROPLET(::tangEventSynchronize(event)) + DIPU_CALLDROPLET(::tangEventSynchronize(event)) } void recordEvent(deviceEvent_t event, deviceStream_t stream) { - DIPU_CALLDROPLET(::tangEventRecord(event, stream)) + DIPU_CALLDROPLET(::tangEventRecord(event, stream)) } -void eventElapsedTime(float* time, deviceEvent_t start, deviceEvent_t end) { - DIPU_CALLDROPLET(tangEventElapsedTime(time, start, end)) -} +void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end) { + DIPU_CALLDROPLET(tangEventElapsedTime(time, start, end)) +} EventStatus getEventStatus(deviceEvent_t event) { - ::tangError_t ret = ::tangEventQuery(event); - if (ret == ::tangSuccess) { - return devapis::EventStatus::READY; - } else if (ret == ::tangErrorNotReady) { - ::tangGetLastError(); /* reset internal error state*/ - return devapis::EventStatus::PENDING; - } else { - throw std::runtime_error("dipu device error"); - } + ::tangError_t ret = ::tangEventQuery(event); + if (ret == ::tangSuccess) { + return devapis::EventStatus::READY; + } else if (ret == ::tangErrorNotReady) { + ::tangGetLastError(); /* reset internal error state*/ + return devapis::EventStatus::PENDING; + } else { + throw std::runtime_error("dipu device error"); + } } // ===================== // mem related // ===================== -void mallocHost(void** p, size_t nbytes) { - DIPU_CALLDROPLET(::tangMallocHost(p, nbytes)) +void mallocHost(void **p, size_t nbytes) { + DIPU_CALLDROPLET(::tangMallocHost(p, nbytes)) } -void freeHost(void* p) { - DIPU_CALLDROPLET(::tangFreeHost(p)) -} +void freeHost(void *p) { DIPU_CALLDROPLET(::tangFreeHost(p)) } OpStatus mallocDevice(void **p, size_t nbytes, bool throwExcepion) { - if (nbytes == 0) return OpStatus::SUCCESS; - ::tangError_t r = ::tangMalloc(p, nbytes); - if (r != ::tangSuccess) { - if(throwExcepion) { - printf("call a tangrt function failed. 
return code=%d %d", r, nbytes); - ::tangGetLastError(); /* reset internal error state*/ - throw std::runtime_error("alloc failed in dipu"); - } - else if(r == ::tangErrorMemoryAllocation) { - return OpStatus::ERR_NOMEM; - } - else { - return OpStatus::ERR_UNKNOWN; - } + if (nbytes == 0) return OpStatus::SUCCESS; + ::tangError_t r = ::tangMalloc(p, nbytes); + if (r != ::tangSuccess) { + if (throwExcepion) { + printf("call a tangrt function failed. return code=%d %d", r, nbytes); + ::tangGetLastError(); /* reset internal error state*/ + throw std::runtime_error("alloc failed in dipu"); + } else if (r == ::tangErrorMemoryAllocation) { + return OpStatus::ERR_NOMEM; + } else { + return OpStatus::ERR_UNKNOWN; } - return OpStatus::SUCCESS; + } + return OpStatus::SUCCESS; } -void freeDevice(void* p) { - DIPU_CALLDROPLET(::tangFree(p)) -} +void freeDevice(void *p) { DIPU_CALLDROPLET(::tangFree(p)) } -bool isPinnedPtr(const void *p) { - return true; -} +bool isPinnedPtr(const void *p) { return true; } -void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) { - DIPU_CALLDROPLET(::tangMemsetAsync(ptr, val, size, stream)) +void memSetAsync(const deviceStream_t stream, void *ptr, int val, size_t size) { + DIPU_CALLDROPLET(::tangMemsetAsync(ptr, val, size, stream)) } -void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { - if (dstDevId == srcDevId) { - DIPU_CALLDROPLET(::tangMemcpy(dst, src, nbytes, ::tangMemcpyDeviceToDevice)) - } else { - // DIPU_CALLDROPLET(::tangMemcpyPeer(dst, dstDevId, src, srcDevId, nbytes)) - throw std::runtime_error("dipu device error with tangMemcpyPeer not supported"); - } +void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src) { + if (dstDevId == srcDevId) { + DIPU_CALLDROPLET(::tangMemcpy(dst, src, nbytes, ::tangMemcpyDeviceToDevice)) + } else { + // DIPU_CALLDROPLET(::tangMemcpyPeer(dst, dstDevId, src, srcDevId, nbytes)) + throw std::runtime_error( + "dipu device error with tangMemcpyPeer not supported"); + } } // (synchronous) copy from host to a DROPLET device -void memCopyH2D(size_t nbytes, void* dst, const void* src) { - DIPU_CALLDROPLET(::tangMemcpy(dst, src, nbytes, ::tangMemcpyHostToDevice)) +void memCopyH2D(size_t nbytes, void *dst, const void *src) { + DIPU_CALLDROPLET(::tangMemcpy(dst, src, nbytes, ::tangMemcpyHostToDevice)) } // (synchronous) copy from a DROPLET device to host -void memCopyD2H(size_t nbytes, void* dst, const void* src) { - DIPU_CALLDROPLET(::tangMemcpy(dst, src, nbytes, ::tangMemcpyDeviceToHost)) +void memCopyD2H(size_t nbytes, void *dst, const void *src) { + DIPU_CALLDROPLET(::tangMemcpy(dst, src, nbytes, ::tangMemcpyDeviceToHost)) } // (asynchronous) copy from device to a device void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, - deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { + deviceId_t dstDevId, void *dst, deviceId_t srcDevId, + const void *src) { if (dstDevId == srcDevId) { - DIPU_CALLDROPLET(::tangMemcpyAsync( - dst, src, nbytes, tangMemcpyDeviceToDevice, stream)) + DIPU_CALLDROPLET( + ::tangMemcpyAsync(dst, src, nbytes, tangMemcpyDeviceToDevice, stream)) } else { - throw std::runtime_error("dipu device error with tangMemcpyPeerAsync not supported"); + throw std::runtime_error( + "dipu device error with tangMemcpyPeerAsync not supported"); // DIPU_CALLDROPLET(::tangMemcpyPeerAsync( // dst, dstDevId, src, srcDevId, nbytes, stream)) } } // (asynchronous) copy from host to a device 
-void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void* dst, const void* src) { - DIPU_CALLDROPLET(::tangMemcpyAsync( - dst, src, nbytes, tangMemcpyHostToDevice, stream)) +void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void *dst, + const void *src) { + DIPU_CALLDROPLET( + ::tangMemcpyAsync(dst, src, nbytes, tangMemcpyHostToDevice, stream)) } // (asynchronous) copy from a device to host -void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void* dst, const void* src) { - DIPU_CALLDROPLET(::tangMemcpyAsync( - dst, src, nbytes, tangMemcpyDeviceToHost, stream)); +void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void *dst, + const void *src) { + DIPU_CALLDROPLET( + ::tangMemcpyAsync(dst, src, nbytes, tangMemcpyDeviceToHost, stream)); } -} // end namespace devapis - -} // namespace parrots - - - - - - - - +} // end namespace devapis +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/droplet/vendorapi.h b/dipu/torch_dipu/csrc_dipu/vendor/droplet/vendorapi.h index ab08cd35e..8630a0bc1 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/droplet/vendorapi.h +++ b/dipu/torch_dipu/csrc_dipu/vendor/droplet/vendorapi.h @@ -2,40 +2,35 @@ #include #ifdef USE_PCCL #include -#endif // USE_PCCL +#endif // USE_PCCL #include #include namespace dipu { - -#define DIPU_CALLDROPLET(Expr) { \ - tangError_t ret = Expr; \ - if (ret != tangSuccess) { \ - printf("call a tangrt function (%s) failed. return code=%d", #Expr, ret); \ - throw std::runtime_error("dipu device error"); \ - } \ -} +#define DIPU_CALLDROPLET(Expr) \ + { \ + tangError_t ret = Expr; \ + if (ret != tangSuccess) { \ + printf("call a tangrt function (%s) failed. return code=%d", #Expr, \ + ret); \ + throw std::runtime_error("dipu device error"); \ + } \ + } using deviceStream_t = tangStream_t; #define deviceDefaultStreamLiteral nullptr using deviceEvent_t = tangEvent_t; -using deviceHandle_t = tangContext_t*; +using deviceHandle_t = tangContext_t *; #ifdef USE_PCCL using diclComm_t = pcclComm_t; using commUniqueId = pcclUniqueId; -#else // USE_PCCL +#else // USE_PCCL class pcclComm_t {}; -using diclComm_t = pcclComm_t*; +using diclComm_t = pcclComm_t *; class pcclUniqueId {}; using commUniqueId = pcclUniqueId; -#endif // USE_PCCL - -} // namespace dipu - - - - - +#endif // USE_PCCL +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/commimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/commimpl.cpp index e4896519e..532fcadf0 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/supa/commimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/commimpl.cpp @@ -5,15 +5,16 @@ namespace dipu { namespace devapis { -#define SUCCL_CALL(Expr) \ - { \ - succlResult_t ret = Expr; \ - if (ret != succlSuccess) { \ - std::cout << "call a succl function (" << #Expr << ") failed. return code= " << ret << std::endl; \ - return DICL_ERR_UNDEF; \ - } else { \ - return DICL_SUCCESS; \ - } \ +#define SUCCL_CALL(Expr) \ + { \ + succlResult_t ret = Expr; \ + if (ret != succlSuccess) { \ + std::cout << "call a succl function (" << #Expr \ + << ") failed. 
return code= " << ret << std::endl; \ + return DICL_ERR_UNDEF; \ + } else { \ + return DICL_SUCCESS; \ + } \ } const int DICL_UNIQUE_ID_BYTES_SIZE = SUCCL_UNIQUE_ID_BYTES; @@ -23,15 +24,21 @@ diclResult_t diclGetCommAsyncError(diclComm_t comm) { SUCCL_CALL(succlCommGetAsyncError(comm, &result)); }; -diclResult_t diclGetUniqueId(commUniqueId *uniqueId) { SUCCL_CALL(succlGetUniqueId(uniqueId)); } +diclResult_t diclGetUniqueId(commUniqueId *uniqueId) { + SUCCL_CALL(succlGetUniqueId(uniqueId)); +} -diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, commUniqueId uniqueId, int rank, int localDeviceId) { +diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, + commUniqueId uniqueId, int rank, + int localDeviceId) { SUCCL_CALL(succlCommInitRank(comm, nranks, uniqueId, rank)); } // void diclCommInitAll(diclComm_t* comms, int ndev, const int* devlist); -diclResult_t diclCommDestroy(diclComm_t comm) { SUCCL_CALL(succlCommDestroy(comm)); }; +diclResult_t diclCommDestroy(diclComm_t comm) { + SUCCL_CALL(succlCommDestroy(comm)); +}; // diclResult_t diclCommFinalize(diclComm_t comm); @@ -39,14 +46,10 @@ diclResult_t diclCommDestroy(diclComm_t comm) { SUCCL_CALL(succlCommDestroy(comm static bool toSucclDataType(at::ScalarType type, succlDataType_t &out) { static std::map succlDataType = { - {at::kChar, succlInt8}, - {at::kByte, succlUint8}, - {at::kFloat, succlFloat}, - {at::kDouble, succlFloat}, - {at::kInt, succlInt32}, - {at::kLong, succlInt32}, - {at::kBool, succlUint8}, - {at::kBFloat16, succlBfloat16}, + {at::kChar, succlInt8}, {at::kByte, succlUint8}, + {at::kFloat, succlFloat}, {at::kDouble, succlFloat}, + {at::kInt, succlInt32}, {at::kLong, succlInt32}, + {at::kBool, succlUint8}, {at::kBFloat16, succlBfloat16}, }; auto it = succlDataType.find(type); if (it == succlDataType.end()) { @@ -71,63 +74,72 @@ static bool toSucclOpType(ReduceOp type, succlRedOp_t &out) { return true; }; -#define ConvertScalarType(x) \ - succlDataType_t suDataType; \ - if (!toSucclDataType(x, suDataType)) { \ - return DICL_ERR_UNDEF; \ +#define ConvertScalarType(x) \ + succlDataType_t suDataType; \ + if (!toSucclDataType(x, suDataType)) { \ + return DICL_ERR_UNDEF; \ } -#define ConvertOpType(x) \ - succlRedOp_t suOp; \ - if (!toSucclOpType(x, suOp)) { \ - return DICL_ERR_UNDEF; \ +#define ConvertOpType(x) \ + succlRedOp_t suOp; \ + if (!toSucclOpType(x, suOp)) { \ + return DICL_ERR_UNDEF; \ } // SCCL op mapping -diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp &reduceOp, diclComm_t comm, deviceStream_t stream) { +diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count, + at::ScalarType datatype, const ReduceOp &reduceOp, + diclComm_t comm, deviceStream_t stream) { ConvertScalarType(datatype); ConvertOpType(reduceOp); - SUCCL_CALL(succlAllReduce(sendbuff, recvbuff, count, suDataType, suOp, comm, stream)); + SUCCL_CALL(succlAllReduce(sendbuff, recvbuff, count, suDataType, suOp, comm, + stream)); } -diclResult_t diclBroadcast(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, int root, - diclComm_t comm, deviceStream_t stream) { +diclResult_t diclBroadcast(const void *sendbuff, void *recvbuff, size_t count, + at::ScalarType datatype, int root, diclComm_t comm, + deviceStream_t stream) { ConvertScalarType(datatype); - SUCCL_CALL(succlBroadcast(sendbuff, recvbuff, count, suDataType, root, comm, stream)); + SUCCL_CALL(succlBroadcast(sendbuff, recvbuff, count, suDataType, root, comm, + stream)); } 
-diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, size_t count, at::ScalarType datatype, diclComm_t comm, +diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, size_t count, + at::ScalarType datatype, diclComm_t comm, deviceStream_t stream) { ConvertScalarType(datatype); SUCCL_CALL(succlAllGather(sendBuf, recvBuf, count, suDataType, comm, stream)); } -diclResult_t diclReduce(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp &reduceOp, int root, diclComm_t comm, deviceStream_t stream) { +diclResult_t diclReduce(const void *sendbuff, void *recvbuff, size_t count, + at::ScalarType datatype, const ReduceOp &reduceOp, + int root, diclComm_t comm, deviceStream_t stream) { ConvertScalarType(datatype); ConvertOpType(reduceOp); - SUCCL_CALL(succlReduce(sendbuff, recvbuff, count, suDataType, suOp, root, comm, stream)); + SUCCL_CALL(succlReduce(sendbuff, recvbuff, count, suDataType, suOp, root, + comm, stream)); } -diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, size_t recvCount, at::ScalarType dataType, - const ReduceOp &op, diclComm_t comm, deviceStream_t stream) { +diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, size_t recvCount, + at::ScalarType dataType, const ReduceOp &op, + diclComm_t comm, deviceStream_t stream) { ConvertScalarType(dataType); ConvertOpType(op); - SUCCL_CALL(succlReduceScatter(sendBuf, recvBuf, recvCount, suDataType, suOp, comm, stream)); + SUCCL_CALL(succlReduceScatter(sendBuf, recvBuf, recvCount, suDataType, suOp, + comm, stream)); } -diclResult_t diclSend(void *sendbuff, size_t count, at::ScalarType datatype, int peer, diclComm_t comm, - deviceStream_t stream) { +diclResult_t diclSend(void *sendbuff, size_t count, at::ScalarType datatype, + int peer, diclComm_t comm, deviceStream_t stream) { ConvertScalarType(datatype); SUCCL_CALL(succlSend(sendbuff, count, suDataType, peer, comm, stream)); } -diclResult_t diclRecv(void *recvbuff, size_t count, at::ScalarType datatype, int peer, diclComm_t comm, - deviceStream_t stream) { +diclResult_t diclRecv(void *recvbuff, size_t count, at::ScalarType datatype, + int peer, diclComm_t comm, deviceStream_t stream) { ConvertScalarType(datatype); SUCCL_CALL(succlRecv(recvbuff, count, suDataType, peer, comm, stream)); } -} // end namespace devapis -} // end namespace dipu +} // end namespace devapis +} // end namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp index 992c61c24..d6139c89e 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp @@ -8,25 +8,22 @@ namespace dipu { namespace supa { -at::Tensor& copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { +at::Tensor &copy_(at::Tensor &self, const at::Tensor &src, bool non_blocking) { if (self.numel() == 0) { return self; } dipu::DIPUStream stream = getCurrentDIPUStream(); ::diopiContext context(stream.rawstream()); auto ctx = &context; - ::diopiConstTensorHandle_t srcDiopiTensorHandle = dipu::diopi_helper::toDiopiTensorHandle(src); - ::diopiTensorHandle_t selfDiopiTensorHandle = dipu::diopi_helper::toDiopiTensorHandle(self); - ::diopiError_t ret = ::diopiCopyInp(ctx, srcDiopiTensorHandle, selfDiopiTensorHandle); - TORCH_CHECK(ret == ::diopiSuccess, - __FILE__, - ":", - __LINE__, - R"(::diopiCopyInp(ctx, src, dst);)", - " error, error code is ", - ret, - "error message is ", - diopiGetLastErrorString()); + ::diopiConstTensorHandle_t 
srcDiopiTensorHandle = + dipu::diopi_helper::toDiopiTensorHandle(src); + ::diopiTensorHandle_t selfDiopiTensorHandle = + dipu::diopi_helper::toDiopiTensorHandle(self); + ::diopiError_t ret = + ::diopiCopyInp(ctx, srcDiopiTensorHandle, selfDiopiTensorHandle); + TORCH_CHECK(ret == ::diopiSuccess, __FILE__, ":", __LINE__, + R"(::diopiCopyInp(ctx, src, dst);)", " error, error code is ", + ret, "error message is ", diopiGetLastErrorString()); // TODO(caikun): remove syncStream when cache allocator is ready if (non_blocking) { dipu::devapis::syncStream(stream.rawstream()); @@ -35,21 +32,30 @@ at::Tensor& copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { } class SUPACopyInplace : public DIPUCopyInplace { -public: + public: SUPACopyInplace() = default; ~SUPACopyInplace() = default; - at::Tensor& run(at::Tensor& self, const at::Tensor& src, bool non_blocking) override { return copy_(self, src, non_blocking); } + at::Tensor &run(at::Tensor &self, const at::Tensor &src, + bool non_blocking) override { + return copy_(self, src, non_blocking); + } - at::Tensor& copy_between_devices(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking) override { + at::Tensor &copy_between_devices(at::TensorIterator &iter, at::Tensor &self, + const at::Tensor &src, + bool non_blocking) override { return copy_(self, src, non_blocking); } - at::Tensor& copy_contiguous(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking) override { + at::Tensor &copy_contiguous(at::TensorIterator &iter, at::Tensor &self, + const at::Tensor &src, + bool non_blocking) override { return copy_(self, src, non_blocking); } - at::Tensor& copy_uncontiguous(at::TensorIterator& iter, at::Tensor& self, const at::Tensor& src, bool non_blocking) override { + at::Tensor &copy_uncontiguous(at::TensorIterator &iter, at::Tensor &self, + const at::Tensor &src, + bool non_blocking) override { return copy_(self, src, non_blocking); } }; diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp index c28017940..a5ac8ab0b 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp @@ -1,16 +1,19 @@ -#include #include +#include + namespace dipu { -DIPU_API devapis::VendorDeviceType VENDOR_TYPE = devapis::VendorDeviceType::SUPA; +DIPU_API devapis::VendorDeviceType VENDOR_TYPE = + devapis::VendorDeviceType::SUPA; namespace devapis { -#define SUPA_CALL(Expr) \ - { \ - suError_t __ret = Expr; \ - if (__ret != suSuccess) { \ - printf("call a supa function (%s) failed. return code=%d", #Expr, __ret); \ - } \ +#define SUPA_CALL(Expr) \ + { \ + suError_t __ret = Expr; \ + if (__ret != suSuccess) { \ + printf("call a supa function (%s) failed. 
return code=%d", #Expr, \ + __ret); \ + } \ } void initializeVendor() {} @@ -18,7 +21,7 @@ void initializeVendor() {} void finalizeVendor() {} class DeviceGuard { -public: + public: DeviceGuard(int device) : device_bak(-1) { SUPA_CALL(suGetDevice(&device_bak)); if (device_bak != device) { @@ -33,7 +36,7 @@ class DeviceGuard { } } -private: + private: int device_bak; }; @@ -46,7 +49,8 @@ DIPU_API deviceId_t current_device() { DIPU_API DIPUDeviceProperties getDeviceProperties(int32_t device_index) { suDeviceProp prop; SUPA_CALL(suGetDeviceProperties(&prop, device_index)); - return {prop.name, prop.totalGlobalMem, prop.major, prop.minor, prop.multiProcessorCount}; + return {prop.name, prop.totalGlobalMem, prop.major, prop.minor, + prop.multiProcessorCount}; } // set current device given device according to id @@ -67,7 +71,9 @@ DIPU_API void syncDevice() { SUPA_CALL(suDeviceSynchronize()); } DIPU_API void checkLastError() { suError_t last_err = suGetLastError(); if (last_err != suSuccess) { - throw std::runtime_error("dipu device error, ret code:" + std::to_string(last_err) + ":" + suGetErrorString(last_err)); + throw std::runtime_error( + "dipu device error, ret code:" + std::to_string(last_err) + ":" + + suGetErrorString(last_err)); } } @@ -77,9 +83,13 @@ DIPU_API int getDeviceCount() { return count; } -DIPU_API void getDriverVersion(int *version) { SUPA_CALL(suDriverGetVersion(version)); } +DIPU_API void getDriverVersion(int *version) { + SUPA_CALL(suDriverGetVersion(version)); +} -DIPU_API void getRuntimeVersion(int *version) { SUPA_CALL(suRuntimeGetVersion(version)); } +DIPU_API void getRuntimeVersion(int *version) { + SUPA_CALL(suRuntimeGetVersion(version)); +} DIPU_API void createStream(deviceStream_t *stream, bool prior) { int flags = suStreamDefault; @@ -87,7 +97,9 @@ DIPU_API void createStream(deviceStream_t *stream, bool prior) { SUPA_CALL(suStreamCreateWithPriority(stream, flags, prior ? 
-1 : 0)); } -DIPU_API void destroyStream(deviceStream_t stream) { SUPA_CALL(suStreamDestroy(stream)); } +DIPU_API void destroyStream(deviceStream_t stream) { + SUPA_CALL(suStreamDestroy(stream)); +} DIPU_API void destroyStream(deviceStream_t stream, deviceId_t devId) { int device = static_cast<int>(devId); @@ -99,11 +111,15 @@ DIPU_API void releaseStream() { // throw std::runtime_error("release stream is not support."); } -DIPU_API void syncStream(deviceStream_t stream) { SUPA_CALL(suStreamSynchronize(stream)); } +DIPU_API void syncStream(deviceStream_t stream) { + SUPA_CALL(suStreamSynchronize(stream)); +} DIPU_API bool streamNotNull(deviceStream_t stream) { return true; } -DIPU_API void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) { SUPA_CALL(suStreamWaitEvent(stream, event)); } +DIPU_API void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) { + SUPA_CALL(suStreamWaitEvent(stream, event)); +} // same as query last event status in stream.(every op has a event) DIPU_API bool isStreamEmpty(deviceStream_t stream) { @@ -123,15 +139,24 @@ DIPU_API bool isStreamEmpty(deviceStream_t stream) { // device event related // ===================== -DIPU_API void createEvent(deviceEvent_t *event) { SUPA_CALL(suEventCreate(event)); } +DIPU_API void createEvent(deviceEvent_t *event) { + SUPA_CALL(suEventCreate(event)); +} -DIPU_API void destroyEvent(deviceEvent_t event) { SUPA_CALL(suEventDestroy(event)); } +DIPU_API void destroyEvent(deviceEvent_t event) { + SUPA_CALL(suEventDestroy(event)); +} -DIPU_API void waitEvent(deviceEvent_t event) { SUPA_CALL(suEventSynchronize(event)); } +DIPU_API void waitEvent(deviceEvent_t event) { + SUPA_CALL(suEventSynchronize(event)); +} -DIPU_API void recordEvent(deviceEvent_t event, deviceStream_t stream) { SUPA_CALL(suEventRecord(event, stream)); } +DIPU_API void recordEvent(deviceEvent_t event, deviceStream_t stream) { + SUPA_CALL(suEventRecord(event, stream)); +} -DIPU_API void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end) { +DIPU_API void eventElapsedTime(float *time, deviceEvent_t start, + deviceEvent_t end) { // unit of time is ms. 
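// A hypothetical timing sketch built from the event API above (stream stands for any valid deviceStream_t): // deviceEvent_t t0, t1; // createEvent(&t0); createEvent(&t1); // recordEvent(t0, stream); /* enqueue work */ recordEvent(t1, stream); // waitEvent(t1); // float ms = 0.0F; // eventElapsedTime(&ms, t0, t1); // destroyEvent(t0); destroyEvent(t1);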
SUPA_CALL(suEventElapsedTime(time, start, end)); } @@ -179,10 +204,14 @@ DIPU_API void freeDevice(void *p) { br_device_free(p); } DIPU_API bool isPinnedPtr(const void *p) { return false; } // (asynchronous) set val -DIPU_API void memSetAsync(const deviceStream_t stream, void *ptr, int val, size_t size) { SUPA_CALL(suMemsetAsync(ptr, val, size, stream)); } +DIPU_API void memSetAsync(const deviceStream_t stream, void *ptr, int val, + size_t size) { + SUPA_CALL(suMemsetAsync(ptr, val, size, stream)); +} // (synchronous) copy from device to a device -DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, deviceId_t srcDevId, const void *src) { +DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src) { // SUPA uses Unified Virtual Address SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToDevice)); } @@ -200,7 +229,9 @@ DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void *dst, } // (asynchronous) copy from device to a device -DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, deviceId_t dstDevId, void *dst, deviceId_t srcDevId, const void *src) { +DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, + deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src) { SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToDevice)); } diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/generatorimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/generatorimpl.cpp index f816cc0ec..53f525cca 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/supa/generatorimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/generatorimpl.cpp @@ -1,5 +1,6 @@ #include #include + #include #include #include @@ -11,25 +12,31 @@ static const size_t offset_size = sizeof(int64_t); static const size_t total_size = seed_size + offset_size; class SUPAGeneratorImpl : public dipu::DIPUGeneratorImpl { -protected: + protected: mutable std::once_flag init_state_flag; -public: - SUPAGeneratorImpl(at::DeviceIndex device_index) : dipu::DIPUGeneratorImpl(device_index) {} + public: + SUPAGeneratorImpl(at::DeviceIndex device_index) + : dipu::DIPUGeneratorImpl(device_index) {} - void set_state(const c10::TensorImpl& state) override { + void set_state(const c10::TensorImpl &state) override { at::detail::check_rng_state(state); auto state_size = state.numel(); - TORCH_CHECK(state_size == total_size || state_size == total_size - offset_size, "RNG state is wrong size"); + TORCH_CHECK( + state_size == total_size || state_size == total_size - offset_size, + "RNG state is wrong size"); - at::Tensor state_tmp(state.shallow_copy_and_detach(state.version_counter(), true)); + at::Tensor state_tmp( + state.shallow_copy_and_detach(state.version_counter(), true)); state_ = state_tmp; state_need_reset_ = false; } void update_state() const override { if (state_need_reset_) { - state_ = at::detail::empty_cpu({(int64_t)total_size}, c10::ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); + state_ = at::detail::empty_cpu({(int64_t)total_size}, + c10::ScalarType::Byte, c10::nullopt, + c10::nullopt, c10::nullopt, c10::nullopt); auto rng_state = state_.data_ptr<uint8_t>(); uint64_t seed = this->current_seed(); int64_t offset = 0; @@ -40,6 +47,8 @@ class SUPAGeneratorImpl : public dipu::DIPUGeneratorImpl { } }; -const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) { return at::make_generator<SUPAGeneratorImpl>(device_index); } +const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) { + return 
at::make_generator<SUPAGeneratorImpl>(device_index); +} } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/vendorapi.h b/dipu/torch_dipu/csrc_dipu/vendor/supa/vendorapi.h index f15623faf..a982b7abd 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/supa/vendorapi.h +++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/vendorapi.h @@ -1,15 +1,16 @@ // Copyright (c) 2023, DeepLink. #pragma once -#include #include #include +#include + namespace dipu { #define deviceDefaultStreamLiteral nullptr; using deviceStream_t = suStream_t; using deviceEvent_t = suEvent_t; -using deviceHandle_t = suContext*; +using deviceHandle_t = suContext *; using deviceId_t = int; using diclComm_t = succlComm_t; diff --git a/dipu/torch_dipu/csrc_dipu/vendor/topsrider/TopsGeneratorImpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/topsrider/TopsGeneratorImpl.cpp index 8791d32fe..cb75db888 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/topsrider/TopsGeneratorImpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/topsrider/TopsGeneratorImpl.cpp @@ -10,35 +10,43 @@ static const size_t offset_size = sizeof(int64_t); static const size_t total_size = seed_size + offset_size; class TopsGeneratorImpl : public dipu::DIPUGeneratorImpl { -protected: - mutable std::once_flag init_state_flag; - -public: - TopsGeneratorImpl(at::DeviceIndex device_index) : dipu::DIPUGeneratorImpl(device_index) {} - - void set_state(const c10::TensorImpl& new_state) override { - at::detail::check_rng_state(new_state); - auto new_state_size = new_state.numel(); - TORCH_CHECK(new_state_size == total_size || new_state_size == total_size - offset_size, "RNG state is wrong size"); - - at::Tensor state_tmp(new_state.shallow_copy_and_detach(new_state.version_counter(), true)); - state_ = state_tmp; - state_need_reset_ = false; - } - - void update_state() const override { - if (state_need_reset_) { - state_ = at::detail::empty_cpu({(int64_t)total_size}, c10::ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); - auto rng_state = state_.data_ptr<uint8_t>(); - uint64_t seed = this->current_seed(); - int64_t offset = 0; - std::memcpy(rng_state, &seed, seed_size); - std::memcpy(rng_state + seed_size, &offset, offset_size); - state_need_reset_ = false; - } + protected: + mutable std::once_flag init_state_flag; + + public: + TopsGeneratorImpl(at::DeviceIndex device_index) + : dipu::DIPUGeneratorImpl(device_index) {} + + void set_state(const c10::TensorImpl &new_state) override { + at::detail::check_rng_state(new_state); + auto new_state_size = new_state.numel(); + TORCH_CHECK(new_state_size == total_size || + new_state_size == total_size - offset_size, + "RNG state is wrong size"); + + at::Tensor state_tmp( + new_state.shallow_copy_and_detach(new_state.version_counter(), true)); + state_ = state_tmp; + state_need_reset_ = false; + } + + void update_state() const override { + if (state_need_reset_) { + state_ = at::detail::empty_cpu({(int64_t)total_size}, + c10::ScalarType::Byte, c10::nullopt, + c10::nullopt, c10::nullopt, c10::nullopt); + auto rng_state = state_.data_ptr<uint8_t>(); + uint64_t seed = this->current_seed(); + int64_t offset = 0; + std::memcpy(rng_state, &seed, seed_size); + std::memcpy(rng_state + seed_size, &offset, offset_size); + state_need_reset_ = false; } + } }; -const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) { return at::make_generator<TopsGeneratorImpl>(device_index); } +const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) { + return at::make_generator<TopsGeneratorImpl>(device_index); +} } // namespace dipu diff --git 
a/dipu/torch_dipu/csrc_dipu/vendor/topsrider/communiatorimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/topsrider/communiatorimpl.cpp index 8fd2b90ef..fdac40706 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/topsrider/communiatorimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/topsrider/communiatorimpl.cpp @@ -23,12 +23,12 @@ DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { return DICL_SUCCESS; } -DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId) { +DIPU_API diclResult_t diclGetUniqueId(commUniqueId *uniqueId) { ECCL_THROW(ecclGetUniqueId(uniqueId)); return DICL_SUCCESS; } -DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, +DIPU_API diclResult_t diclCommInitRank(diclComm_t *comm, int nranks, commUniqueId uniqueId, int rank, int localDeviceId) { ECCL_THROW(ecclCommInitRank(comm, nranks, uniqueId, rank)); @@ -47,16 +47,16 @@ DIPU_API diclResult_t diclCommDestroy(diclComm_t comm) { // DIPU_API diclResult_t diclCommAbort(diclComm_t comm); -DIPU_API diclResult_t diclAllReduce(const void* sendbuff, void* recvbuff, +DIPU_API diclResult_t diclAllReduce(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, diclComm_t comm, + const ReduceOp &reduceOp, diclComm_t comm, deviceStream_t stream) { ECCL_THROW(ecclAllReduce(sendbuff, recvbuff, count, eccl_data_type[datatype], eccl_op[reduceOp], comm, stream)); return DICL_SUCCESS; } -DIPU_API diclResult_t diclBroadcast(const void* sendbuff, void* recvbuff, +DIPU_API diclResult_t diclBroadcast(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, int root, diclComm_t comm, deviceStream_t stream) { @@ -65,7 +65,7 @@ DIPU_API diclResult_t diclBroadcast(const void* sendbuff, void* recvbuff, return DICL_SUCCESS; } -DIPU_API diclResult_t diclAllGather(const void* sendBuf, void* recvBuf, +DIPU_API diclResult_t diclAllGather(const void *sendBuf, void *recvBuf, size_t count, at::ScalarType datatype, diclComm_t comm, deviceStream_t stream) { ECCL_THROW(ecclAllGather(sendBuf, recvBuf, count, eccl_data_type[datatype], @@ -73,19 +73,19 @@ DIPU_API diclResult_t diclAllGather(const void* sendBuf, void* recvBuf, return DICL_SUCCESS; } -DIPU_API diclResult_t diclReduce(const void* sendbuff, void* recvbuff, +DIPU_API diclResult_t diclReduce(const void *sendbuff, void *recvbuff, size_t count, at::ScalarType datatype, - const ReduceOp& reduceOp, int root, + const ReduceOp &reduceOp, int root, diclComm_t comm, deviceStream_t stream) { ECCL_THROW(ecclReduce(sendbuff, recvbuff, count, eccl_data_type[datatype], eccl_op[reduceOp], root, comm, stream)); return DICL_SUCCESS; } -DIPU_API diclResult_t diclReduceScatter(void* sendBuf, void* recvBuf, +DIPU_API diclResult_t diclReduceScatter(void *sendBuf, void *recvBuf, size_t recvCount, at::ScalarType dataType, - const ReduceOp& op, diclComm_t comm, + const ReduceOp &op, diclComm_t comm, deviceStream_t stream) { ECCL_THROW(ecclReduceScatter(sendBuf, recvBuf, recvCount, eccl_data_type[dataType], eccl_op[op], comm, @@ -93,7 +93,7 @@ DIPU_API diclResult_t diclReduceScatter(void* sendBuf, void* recvBuf, return DICL_SUCCESS; } -DIPU_API diclResult_t diclSend(void* sendbuff, size_t count, +DIPU_API diclResult_t diclSend(void *sendbuff, size_t count, at::ScalarType datatype, int peer, diclComm_t comm, deviceStream_t stream) { ECCL_THROW( @@ -101,7 +101,7 @@ DIPU_API diclResult_t diclSend(void* sendbuff, size_t count, return DICL_SUCCESS; } -DIPU_API diclResult_t diclRecv(void* recvbuff, size_t count, +DIPU_API diclResult_t 
diclRecv(void *recvbuff, size_t count, at::ScalarType datatype, int peer, diclComm_t comm, deviceStream_t stream) { ECCL_THROW( diff --git a/dipu/torch_dipu/csrc_dipu/vendor/topsrider/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/topsrider/deviceimpl.cpp index f42ead52c..a27591ac9 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/topsrider/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/topsrider/deviceimpl.cpp @@ -1,9 +1,10 @@ // Copyright (c) 2023, DeepLink. -#include -#include #include #include +#include +#include + namespace dipu { DIPU_API devapis::VendorDeviceType VENDOR_TYPE = devapis::VendorDeviceType::GCU; @@ -14,13 +15,9 @@ using tops_deviceId = int; // Device class related // ===================== -void initializeVendor() { +void initializeVendor() {} -} - -void finalizeVendor() { - -} +void finalizeVendor() {} deviceId_t current_device() { tops_deviceId devId_; @@ -44,75 +41,68 @@ DIPUDeviceProperties getDeviceProperties(int32_t device_index) { // in tops_runtime_api.h // set current device given device according to id void setDevice(deviceId_t devId) { - tops_deviceId devId_ = static_cast<tops_deviceId>(devId); - DIPU_CALLTOPSRT(::topsSetDevice(devId_)) + tops_deviceId devId_ = static_cast<tops_deviceId>(devId); + DIPU_CALLTOPSRT(::topsSetDevice(devId_)) } -void resetDevice(deviceId_t devId) { - DIPU_CALLTOPSRT(::topsDeviceReset()) -} +void resetDevice(deviceId_t devId) { DIPU_CALLTOPSRT(::topsDeviceReset()) } -void syncDevice() { - DIPU_CALLTOPSRT(::topsDeviceSynchronize()) -} +void syncDevice() { DIPU_CALLTOPSRT(::topsDeviceSynchronize()) } // check last launch succ or not, throw if fail -void checkLastError() { - DIPU_CALLTOPSRT(::topsGetLastError()) -} +void checkLastError() { DIPU_CALLTOPSRT(::topsGetLastError()) } int getDeviceCount() { int num = -1; - DIPU_CALLTOPSRT(::topsGetDeviceCount(reinterpret_cast<int*>(&num))) + DIPU_CALLTOPSRT(::topsGetDeviceCount(reinterpret_cast<int *>(&num))) return num; } -void getDriverVersion(int* version) { - DIPU_CALLTOPSRT(::topsDriverGetVersion(version)) +void getDriverVersion(int *version) { + DIPU_CALLTOPSRT(::topsDriverGetVersion(version)) } -void getRuntimeVersion(int* version) { - DIPU_CALLTOPSRT(::topsRuntimeGetVersion(version)) +void getRuntimeVersion(int *version) { + DIPU_CALLTOPSRT(::topsRuntimeGetVersion(version)) } // ===================== // device stream related // ===================== -void createStream(deviceStream_t* stream, bool prior) { - if (prior) { - DIPU_LOGW( - "topsStreamCreateWithPriority is not ready, replace with " - "topsStreamCreate"); - DIPU_CALLTOPSRT(::topsStreamCreate(stream)) - // DIPU_CALLTOPSRT(::topsStreamCreateWithPriority(stream, topsStreamDefault, -1)) - } else { - DIPU_CALLTOPSRT(::topsStreamCreate(stream)) - } +void createStream(deviceStream_t *stream, bool prior) { + if (prior) { + DIPU_LOGW( + "topsStreamCreateWithPriority is not ready, replace with " + "topsStreamCreate"); + DIPU_CALLTOPSRT(::topsStreamCreate(stream)) + // DIPU_CALLTOPSRT(::topsStreamCreateWithPriority(stream, topsStreamDefault, + // -1)) + } else { + DIPU_CALLTOPSRT(::topsStreamCreate(stream)) + } } void destroyStream(deviceStream_t stream) { - DIPU_CALLTOPSRT(::topsStreamDestroy(stream)) + DIPU_CALLTOPSRT(::topsStreamDestroy(stream)) } void destroyStream(deviceStream_t stream, deviceId_t devId) { - setDevice(devId); - destroyStream(stream); + setDevice(devId); + destroyStream(stream); } -void releaseStream() { - return; -} +void releaseStream() { return; } bool streamNotNull(deviceStream_t stream) { - return (stream != nullptr && stream != 
topsStreamPerThread); + return (stream != nullptr && stream != topsStreamPerThread); } void syncStream(deviceStream_t stream) { - DIPU_CALLTOPSRT(::topsStreamSynchronize(stream)); + DIPU_CALLTOPSRT(::topsStreamSynchronize(stream)); } void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) { - DIPU_CALLTOPSRT(::topsStreamWaitEvent(stream, event, 0)) + DIPU_CALLTOPSRT(::topsStreamWaitEvent(stream, event, 0)) } bool isStreamEmpty(deviceStream_t stream) { @@ -123,137 +113,126 @@ bool isStreamEmpty(deviceStream_t stream) { return false; } - // ===================== // device event related // ===================== -void createEvent(deviceEvent_t* event) { - DIPU_CALLTOPSRT(::topsEventCreateWithFlags(event, topsEventDisableTiming)) +void createEvent(deviceEvent_t *event) { + DIPU_CALLTOPSRT(::topsEventCreateWithFlags(event, topsEventDisableTiming)) } void destroyEvent(deviceEvent_t event) { - DIPU_CALLTOPSRT(::topsEventDestroy(event)) + DIPU_CALLTOPSRT(::topsEventDestroy(event)) } void waitEvent(deviceEvent_t event) { - DIPU_CALLTOPSRT(::topsEventSynchronize(event)) + DIPU_CALLTOPSRT(::topsEventSynchronize(event)) } void recordEvent(deviceEvent_t event, deviceStream_t stream) { - DIPU_CALLTOPSRT(::topsEventRecord(event, stream)) + DIPU_CALLTOPSRT(::topsEventRecord(event, stream)) } -void eventElapsedTime(float* time, deviceEvent_t start, deviceEvent_t end) { - DIPU_CALLTOPSRT(topsEventElapsedTime(time, start, end)) -} +void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end){ + DIPU_CALLTOPSRT(topsEventElapsedTime(time, start, end))} EventStatus getEventStatus(deviceEvent_t event) { - ::topsError_t ret = ::topsEventQuery(event); - if (ret == ::topsSuccess) { - return devapis::EventStatus::READY; - } else if (ret == ::topsErrorNotReady) { - ::topsGetLastError(); /* reset internal error state*/ - return devapis::EventStatus::PENDING; - } else { - throw std::runtime_error("dipu device error, ret code:" + - std::to_string(ret)); - } + ::topsError_t ret = ::topsEventQuery(event); + if (ret == ::topsSuccess) { + return devapis::EventStatus::READY; + } else if (ret == ::topsErrorNotReady) { + ::topsGetLastError(); /* reset internal error state*/ + return devapis::EventStatus::PENDING; + } else { + throw std::runtime_error("dipu device error, ret code:" + + std::to_string(ret)); + } } // ===================== // mem related // ===================== -void mallocHost(void** p, size_t nbytes) { - if (nbytes != 0) DIPU_CALLTOPSRT(::topsHostMalloc(p, nbytes)) +void mallocHost(void **p, size_t nbytes) { + if (nbytes != 0) DIPU_CALLTOPSRT(::topsHostMalloc(p, nbytes)) } -void freeHost(void* p) { - if (!p) DIPU_CALLTOPSRT(::topsHostFree(p)) +void freeHost(void *p) { + if (p) DIPU_CALLTOPSRT(::topsHostFree(p)) } OpStatus mallocDevice(void **p, size_t nbytes, bool throwExcepion) { - ::topsError_t r = ::topsMalloc(p, nbytes); - if (r != ::topsSuccess) { - if(throwExcepion) { - ::topsGetLastError(); /* reset internal error state*/ - throw std::runtime_error("alloc failed in dipu"); - } - else if(r == ::topsErrorMemoryAllocation) { - return OpStatus::ERR_NOMEM; - } - else { - return OpStatus::ERR_UNKNOWN; - } + ::topsError_t r = ::topsMalloc(p, nbytes); + if (r != ::topsSuccess) { + if (throwExcepion) { + ::topsGetLastError(); /* reset internal error state*/ + throw std::runtime_error("alloc failed in dipu"); + } else if (r == ::topsErrorMemoryAllocation) { + return OpStatus::ERR_NOMEM; + } else { + return OpStatus::ERR_UNKNOWN; } - return OpStatus::SUCCESS; + } + return OpStatus::SUCCESS; 
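// Callers passing throwExcepion == false are expected to branch on the returned OpStatus themselves; a sketch of that pattern (the actual retry policy lives in the caller, presumably the caching allocators): // void *p = nullptr; // if (mallocDevice(&p, nbytes, false) == OpStatus::ERR_NOMEM) { /* release cached blocks and retry */ }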
} -void freeDevice(void* p) { - DIPU_CALLTOPSRT(::topsFree(p)) -} +void freeDevice(void *p) { DIPU_CALLTOPSRT(::topsFree(p)) } -bool isPinnedPtr(const void* p) { - ::topsPointerAttribute_t attr; - DIPU_CALLTOPSRT(::topsPointerGetAttributes(&attr, p)) - return attr.memoryType == topsMemoryTypeHost; +bool isPinnedPtr(const void *p) { + ::topsPointerAttribute_t attr; + DIPU_CALLTOPSRT(::topsPointerGetAttributes(&attr, p)) + return attr.memoryType == topsMemoryTypeHost; } -void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) { - DIPU_CALLTOPSRT(::topsMemsetAsync(ptr, val, size, stream)) +void memSetAsync(const deviceStream_t stream, void *ptr, int val, size_t size) { + DIPU_CALLTOPSRT(::topsMemsetAsync(ptr, val, size, stream)) } -void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { - if (dstDevId == srcDevId) { - DIPU_CALLTOPSRT(::topsMemcpy(dst, src, nbytes, ::topsMemcpyDeviceToDevice)) - } else { - DIPU_CALLTOPSRT(::topsMemcpyPeer(dst, dstDevId, src, srcDevId, nbytes)) - } +void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void *dst, + deviceId_t srcDevId, const void *src) { + if (dstDevId == srcDevId) { + DIPU_CALLTOPSRT(::topsMemcpy(dst, src, nbytes, ::topsMemcpyDeviceToDevice)) + } else { + DIPU_CALLTOPSRT(::topsMemcpyPeer(dst, dstDevId, src, srcDevId, nbytes)) + } } // (synchronous) copy from host to a tops device -void memCopyH2D(size_t nbytes, void* dst, const void* src) { - DIPU_CALLTOPSRT(::topsMemcpy(dst, src, nbytes, ::topsMemcpyHostToDevice)) +void memCopyH2D(size_t nbytes, void *dst, const void *src) { + DIPU_CALLTOPSRT(::topsMemcpy(dst, src, nbytes, ::topsMemcpyHostToDevice)) } // (synchronous) copy from a tops device to host -void memCopyD2H(size_t nbytes, void* dst, const void* src) { - DIPU_CALLTOPSRT(::topsMemcpy(dst, src, nbytes, ::topsMemcpyDeviceToHost)) +void memCopyD2H(size_t nbytes, void *dst, const void *src) { + DIPU_CALLTOPSRT(::topsMemcpy(dst, src, nbytes, ::topsMemcpyDeviceToHost)) } // (asynchronous) copy from device to a device void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, - deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { + deviceId_t dstDevId, void *dst, deviceId_t srcDevId, + const void *src) { if (dstDevId == srcDevId) { - DIPU_CALLTOPSRT(::topsMemcpyAsync( - dst, src, nbytes, topsMemcpyDeviceToDevice, stream)) + DIPU_CALLTOPSRT( + ::topsMemcpyAsync(dst, src, nbytes, topsMemcpyDeviceToDevice, stream)) } else { - DIPU_CALLTOPSRT(::topsMemcpyPeerAsync( - dst, dstDevId, src, srcDevId, nbytes, stream)) + DIPU_CALLTOPSRT( + ::topsMemcpyPeerAsync(dst, dstDevId, src, srcDevId, nbytes, stream)) } } // (asynchronous) copy from host to a device -void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void* dst, const void* src) { - DIPU_CALLTOPSRT(::topsMemcpyAsync( - dst, src, nbytes, topsMemcpyHostToDevice, stream)) +void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void *dst, + const void *src) { + DIPU_CALLTOPSRT( + ::topsMemcpyAsync(dst, src, nbytes, topsMemcpyHostToDevice, stream)) } // (asynchronous) copy from a device to host -void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void* dst, const void* src) { - DIPU_CALLTOPSRT(::topsMemcpyAsync( - dst, src, nbytes, topsMemcpyDeviceToHost, stream)); +void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void *dst, + const void *src) { + DIPU_CALLTOPSRT( + ::topsMemcpyAsync(dst, src, nbytes, topsMemcpyDeviceToHost, stream)); } -} // end namespace 
devapis - -} // namespace parrots - - - - - - - - +} // end namespace devapis +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/topsrider/vendorapi.h b/dipu/torch_dipu/csrc_dipu/vendor/topsrider/vendorapi.h index a7e043a02..b4e960044 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/topsrider/vendorapi.h +++ b/dipu/torch_dipu/csrc_dipu/vendor/topsrider/vendorapi.h @@ -1,17 +1,19 @@ // Copyright (c) 2023, DeepLink. #pragma once -#include #include #include +#include + namespace dipu { -#define DIPU_CALLTOPSRT(Expr) \ - { \ - ::topsError_t ret = Expr; \ - if (ret != ::topsSuccess) { \ - throw std::runtime_error("dipu device error, ret code:" + std::to_string(ret)); \ - } \ +#define DIPU_CALLTOPSRT(Expr) \ + { \ + ::topsError_t ret = Expr; \ + if (ret != ::topsSuccess) { \ + throw std::runtime_error("dipu device error, ret code:" + \ + std::to_string(ret)); \ + } \ } using deviceStream_t = topsStream_t; @@ -20,4 +22,4 @@ using deviceEvent_t = topsEvent_t; using diclComm_t = ecclComm_t; using commUniqueId = ecclUniqueId; -} +} // namespace dipu From 8e1ac3bd82bc01f993a29b08f61b1a7271e7ed87 Mon Sep 17 00:00:00 2001 From: wiryls <7984500+wiryls@users.noreply.github.com> Date: Tue, 14 Nov 2023 20:10:42 +0800 Subject: [PATCH 2/3] fix: update IncludeCategories --- dipu/.clang-format | 65 ++++++++++---------- dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp | 2 +- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/dipu/.clang-format b/dipu/.clang-format index bf44edbbc..340dd0bef 100644 --- a/dipu/.clang-format +++ b/dipu/.clang-format @@ -1,35 +1,36 @@ --- BasedOnStyle: InheritParentConfig IncludeCategories: - - Regex: '^("|<)csrc_dipu/.*' - Priority: 9 - SortPriority: 0 - CaseSensitive: false - - Regex: '^("|<)diopi/.*' - Priority: 8 - SortPriority: 0 - CaseSensitive: false - - Regex: '^("|<)(c10|aten|torch).*' - Priority: 4 - SortPriority: 0 - CaseSensitive: false - - Regex: '^("|<)(pybind11|Python\.h|frameobject.\h).*' - Priority: 5 - SortPriority: 0 - CaseSensitive: false - - Regex: '^<((ext/.*)|pthread)\.h.*' - Priority: 2 - SortPriority: 1 - CaseSensitive: false - - Regex: '^("|<)(cuda|su|cn|(..?ccl)|(.*_runtime)).*\.h.*' - Priority: 3 - SortPriority: 0 - CaseSensitive: false - - Regex: '^<.*' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' - Priority: 10 - SortPriority: 0 - CaseSensitive: false + - Regex: '^("|<)csrc_dipu/.*' + Priority: 90 + CaseSensitive: false + - Regex: '^("|<)diopi/.*' + Priority: 80 + CaseSensitive: false + - Regex: '^("|<)(c10|aten|torch).*' + Priority: 40 + CaseSensitive: false + - Regex: '^("|<)(Python\.h).*' + Priority: 50 + CaseSensitive: false + - Regex: '^("|<)(frameobject\.h).*' + Priority: 50 + SortPriority: 51 + CaseSensitive: false + - Regex: '^("|<)(pybind11).*' + Priority: 50 + SortPriority: 52 + CaseSensitive: false + - Regex: '^<((ext/.*)|pthread)\.h.*' + Priority: 20 + SortPriority: 21 + CaseSensitive: false + - Regex: '^("|<)(cuda|su|cn|(..?ccl)|(.*_runtime)).*\.h.*' + Priority: 30 + CaseSensitive: false + - Regex: '^<.*' + Priority: 20 + CaseSensitive: false + - Regex: '.*' + Priority: 100 + CaseSensitive: false diff --git a/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp b/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp index 578d88106..76fac9dbe 100644 --- a/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp +++ b/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp @@ -2,8 +2,8 @@ #include "./helpfunc.hpp" #ifndef WIN32 -#include #include +#include #endif namespace dipu { From baa137c1a2555c9e722b8ac58472779d3a978c40 Mon Sep 
17 00:00:00 2001 From: wiryls <7984500+wiryls@users.noreply.github.com> Date: Tue, 14 Nov 2023 20:43:28 +0800 Subject: [PATCH 3/3] fix: make sure structmember.h is after Python.h --- dipu/.clang-format | 16 ++++++++-------- .../csrc_dipu/binding/patchCsrcDevice.cpp | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/dipu/.clang-format b/dipu/.clang-format index 340dd0bef..61244b861 100644 --- a/dipu/.clang-format +++ b/dipu/.clang-format @@ -1,31 +1,31 @@ --- BasedOnStyle: InheritParentConfig IncludeCategories: - - Regex: '^("|<)csrc_dipu/.*' + - Regex: '^("|<)csrc_dipu/' Priority: 90 CaseSensitive: false - - Regex: '^("|<)diopi/.*' + - Regex: '^("|<)diopi/' Priority: 80 CaseSensitive: false - - Regex: '^("|<)(c10|aten|torch).*' + - Regex: '^("|<)(c10|aten|torch)/' Priority: 40 CaseSensitive: false - - Regex: '^("|<)(Python\.h).*' + - Regex: '^("|<)Python\.h' Priority: 50 CaseSensitive: false - - Regex: '^("|<)(frameobject\.h).*' + - Regex: '^("|<)(frameobject|structmember)\.h' Priority: 50 SortPriority: 51 CaseSensitive: false - - Regex: '^("|<)(pybind11).*' + - Regex: '^("|<)(pybind11)' Priority: 50 SortPriority: 52 CaseSensitive: false - - Regex: '^<((ext/.*)|pthread)\.h.*' + - Regex: '^<((ext/.*)|pthread)\.h' Priority: 20 SortPriority: 21 CaseSensitive: false - - Regex: '^("|<)(cuda|su|cn|(..?ccl)|(.*_runtime)).*\.h.*' + - Regex: '^("|<)(cuda|su|cn|(..?ccl)|(.*_runtime)).*\.h' Priority: 30 CaseSensitive: false - Regex: '^<.*' diff --git a/dipu/torch_dipu/csrc_dipu/binding/patchCsrcDevice.cpp b/dipu/torch_dipu/csrc_dipu/binding/patchCsrcDevice.cpp index 9cf9e8202..2576bfb19 100644 --- a/dipu/torch_dipu/csrc_dipu/binding/patchCsrcDevice.cpp +++ b/dipu/torch_dipu/csrc_dipu/binding/patchCsrcDevice.cpp @@ -3,7 +3,6 @@ #include #include #include -#include <structmember.h> #include #include @@ -17,6 +16,8 @@ #include #include +#include <structmember.h> + #include "exportapi.h" namespace dipu {
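// With the categories above, the includes in this file now sort by ascending priority: remaining system headers (20), c10/aten/torch (40), Python.h (50), frameobject.h and structmember.h (51), pybind11 (52), then csrc_dipu headers (90), so Python.h always comes before the CPython headers that depend on it.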