diff --git a/impl/ascend/aclnn/aclnn.cpp b/impl/ascend/aclnn/aclnn.cpp
index f18f6c5a3..7d1e69c67 100644
--- a/impl/ascend/aclnn/aclnn.cpp
+++ b/impl/ascend/aclnn/aclnn.cpp
@@ -106,9 +106,6 @@ int aclnnAddAdaptor(diopiContextHandle_t ctx, diopiConstTensorHandle_t self1, di
     // Invoke the second phase of the aclnnAdd API
     ret = aclnnAdd(workspaceAddr, workspaceSize, executor, stream);
     CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnAdd failed. ERROR: %d\n", ret); return ret);
-    // 3. (Fixed pattern) Synchronize and wait for the task to finish
-    ret = aclrtSynchronizeStream(stream);
-    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret);

     if (workspaceSize > 0) {
         aclrtFree(workspaceAddr);
@@ -149,9 +146,6 @@ int aclnnSinAdaptor(diopiContextHandle_t ctx, diopiConstTensorHandle_t self1, di
     // Invoke the second phase of the aclnnSin API
     ret = aclnnSin(workspaceAddr, workspaceSize, executor, stream);
     CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnSin failed. ERROR: %d\n", ret); return ret);
-    // 3. (Fixed pattern) Synchronize and wait for the task to finish
-    ret = aclrtSynchronizeStream(stream);
-    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret);

     if (workspaceSize > 0) {
         aclrtFree(workspaceAddr);
@@ -192,9 +186,6 @@ int aclnnCosAdaptor(diopiContextHandle_t ctx, diopiConstTensorHandle_t self1, di
     // Invoke the second phase of the aclnnCos API
     ret = aclnnCos(workspaceAddr, workspaceSize, executor, stream);
     CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnCos failed. ERROR: %d\n", ret); return ret);
-    // 3. (Fixed pattern) Synchronize and wait for the task to finish
-    ret = aclrtSynchronizeStream(stream);
-    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret);

     if (workspaceSize > 0) {
         aclrtFree(workspaceAddr);
diff --git a/impl/ascend/common/acloprunner.hpp b/impl/ascend/common/acloprunner.hpp
index 779d75020..f607aa7a4 100644
--- a/impl/ascend/common/acloprunner.hpp
+++ b/impl/ascend/common/acloprunner.hpp
@@ -683,7 +683,6 @@ class AclOpRunner {
             }
             *syncTensorPtr = syncTensorReal;
         }
-        CALL_ACLRT(aclrtSynchronizeStream(stream));
         // Get environment variables once when run is called for the first time
         if (isDebugAclOpRunnerOn()) {
             info(__FILE__, __LINE__, __FUNCTION__, "%s", dumpRunnerInfo().c_str());
diff --git a/impl/ascend/common/utils.cpp b/impl/ascend/common/utils.cpp
index 9dcc0eff0..28a0cf588 100644
--- a/impl/ascend/common/utils.cpp
+++ b/impl/ascend/common/utils.cpp
@@ -148,7 +148,6 @@ diopiError_t reshape(diopiContextHandle_t ctx, const AscendTensor& src, AscendTe
     diopiStreamHandle_t stream;
     diopiGetStream(ctx, &stream);
     aclrtMemcpyAsync(destPtr, dst.getAclMemBufferSize(), sourcePtr, src.getAclMemBufferSize(), ACL_MEMCPY_DEVICE_TO_DEVICE, stream);
-    aclrtSynchronizeStream(stream);

     return diopiSuccess;
 }
@@ -317,7 +316,6 @@ diopiError_t makeTensorFromScalar(diopiContextHandle_t ctx, const diopiScalar_t*
         diopiRequireTensor(ctx, out, &sSize, nullptr, dtype, diopi_device);
         diopiGetTensorData(outCopyDev, &dst);
         CALL_ACLRT(aclrtMemcpyAsync(dst, elemsize, src, elemsize, ACL_MEMCPY_HOST_TO_DEVICE, stream));
-        CALL_ACLRT(aclrtSynchronizeStream(stream));
         diopiCastDtype(ctx, *out, outCopyDev);
     } else {
         error(__FILE__, __LINE__, __FUNCTION__, "device(%s) not supported", deviceType2Str(device));
@@ -732,7 +730,6 @@ diopiTensorHandle_t hostToDevice(diopiContextHandle_t ctx, diopiConstTensorHandl
         diopiGetStream(ctx, &stream);
         int64_t elemsize = getBaseBufferSize(src);
         CALL_ACLRT(aclrtMemcpyAsync(dstPtr, elemsize, const_cast<void*>(srcPtr), elemsize, ACL_MEMCPY_HOST_TO_DEVICE, stream));
-        CALL_ACLRT(aclrtSynchronizeStream(stream));
         return dst;
     } else {
         return const_cast<diopiTensorHandle_t>(src);
diff --git a/impl/ascend/functions/loss.cpp b/impl/ascend/functions/loss.cpp
index 779b06bed..9a2c82553 100644
--- a/impl/ascend/functions/loss.cpp
+++ b/impl/ascend/functions/loss.cpp
@@ -77,11 +77,10 @@ diopiError_t nllLossOutWithTotalWeight(diopiContextHandle_t ctx, diopiTensorHand
     castTensor(ctx, weightAt, diopi_dtype_float32);
     if (0 <= ignoreIndex && ignoreIndex < inputAt.shape(-1)) {
         diopiStreamHandle_t stream;
-        void *ptr = reinterpret_cast<char*>(const_cast<void*>(weightAt.data())) + ignoreIndex * weightAt.elemsize();
+        void* ptr = reinterpret_cast<char*>(const_cast<void*>(weightAt.data())) + ignoreIndex * weightAt.elemsize();
         float val = 0.0f;
         diopiGetStream(ctx, &stream);
         aclrtMemcpyAsync(ptr, sizeof(float), &val, sizeof(float), ACL_MEMCPY_HOST_TO_DEVICE, stream);
-        aclrtSynchronizeStream(stream);
     }

     // ascend only supports input tensor with 2D dimension
@@ -254,7 +253,7 @@ diopiError_t diopiNLLLossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t
     runner.addInput(targetPtr, getBaseBufferSize(targetCopy), calTargetShapeVec, ACL_FORMAT_ND, diopi_dtype_int32).setAttr("ignore_index", ignoreIndex);

     if (inputShape.len > 2) {
-        void *gradInputPtr;
+        void* gradInputPtr;
         diopiGetTensorData(gradInputCopy, &gradInputPtr);
         runner.addOutput(gradInputPtr, getBaseBufferSize(gradInputCopy), calShapeVec, ACL_FORMAT_ND, gradDtype);
     } else {
diff --git a/impl/ascend_npu/torch_npu/csrc/CopyKernel.cpp b/impl/ascend_npu/torch_npu/csrc/CopyKernel.cpp
index bbf4d9618..8fe3db415 100755
--- a/impl/ascend_npu/torch_npu/csrc/CopyKernel.cpp
+++ b/impl/ascend_npu/torch_npu/csrc/CopyKernel.cpp
@@ -401,7 +401,6 @@ at::Tensor& NPUNativeFunctions::copy_(at::Tensor& self, const at::Tensor& src, b
     if (!non_blocking) {
         c10_npu::getCurrentNPUStream().synchronize();
     }
-    c10_npu::getCurrentNPUStream().synchronize();
     return self;
 }

diff --git a/impl/ascend_npu/torch_npu/csrc/DIOPIAdapter.cpp b/impl/ascend_npu/torch_npu/csrc/DIOPIAdapter.cpp
index fa836c3b3..31708adca 100755
--- a/impl/ascend_npu/torch_npu/csrc/DIOPIAdapter.cpp
+++ b/impl/ascend_npu/torch_npu/csrc/DIOPIAdapter.cpp
@@ -69,14 +69,11 @@ static std::map STRING_SCALAR_TYPE_TO_ACL_TYPE_

 aclError AclrtMemcpyAsyncParamCheck(void* dst, size_t destMax, const void* src, size_t count, aclrtMemcpyKind kind, aclrtStream stream) {
     auto ret = aclrtMemcpyAsync(dst, destMax, src, count, kind, stream);
-    NPU_CHECK_ERROR(aclrtSynchronizeStream(stream));
     return ret;
 }

 aclError AclrtMemcpyParamCheck(void* dst, size_t destMax, const void* src, size_t count, aclrtMemcpyKind kind) {
-    c10_npu::getCurrentNPUStream().synchronize();
     auto ret = aclrtMemcpy(dst, destMax, src, count, kind);
-    c10_npu::getCurrentNPUStream().synchronize();
     return ret;
 }

@@ -889,7 +886,6 @@ void copy_d2d_by_memcpy(at::Tensor& dst, const at::Tensor& src, int64_t exceptSi
     }
     c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream();
     NPU_CHECK_ERROR(aclrtMemcpyAsync(dst.data_ptr(), dst.nbytes(), src.data_ptr(), src.nbytes(), ACL_MEMCPY_DEVICE_TO_DEVICE, stream));
-    NPU_CHECK_ERROR(aclrtSynchronizeStream(stream));
 }

 float CalcuOpUtil::GetScalarFloatValue(const c10::Scalar& scalar) {
@@ -1027,7 +1023,6 @@ NPUStatus CalcuOpUtil::AclrtMemcpyAsync(const std::pair& ds
     void* src_ptr = reinterpret_cast<uint8_t*>(src.first.data_ptr()) + src.second * src.first.itemsize();
     c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream();
     NPU_CHECK_ERROR(aclrtMemcpyAsync(dst_ptr, dst_size, src_ptr, src_size, kind, stream));
-    NPU_CHECK_ERROR(aclrtSynchronizeStream(stream));

     return "SUCCESS";
 }
@@ -1051,7 +1046,6 @@ aclError CalcuOpUtil::AclrtMemcpyWithModeSwitch(void* dst, size_t dstMax, const
 aclError CalcuOpUtil::LaunchAsyncCopyTaskWithModeSwitch(const at::Tensor& dst, size_t dstMax, const at::Tensor& src, size_t count, aclrtMemcpyKind kind) {
     c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream();
     NPU_CHECK_ERROR(aclrtMemcpyAsync(dst.data_ptr(), dst.nbytes(), src.data_ptr(), src.nbytes(), kind, stream));
-    NPU_CHECK_ERROR(aclrtSynchronizeStream(stream));
 }

 void ContiguousTensorDesc::refresh_contiguous_using_size_and_stride() {
@@ -2743,9 +2737,7 @@ void NPUStream::synchronize() const {

 aclError queue::LaunchAsyncCopyTask(void* dst, size_t dstLen, void* src, size_t srcLen, aclrtMemcpyKind kind) {
     c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream();
-    NPU_CHECK_ERROR(aclrtSynchronizeStream(stream));
     auto ret = aclrtMemcpyAsync(dst, dstLen, src, srcLen, kind, stream);
-    NPU_CHECK_ERROR(aclrtSynchronizeStream(stream));
     return ret;
 }