From e562068736042eb6a1fa6fb2543c51c48910ee0a Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 23 Oct 2024 01:49:12 +0300 Subject: [PATCH] Fix suggestions --- .../clad/Differentiator/BuiltinDerivatives.h | 6 ++-- lib/Differentiator/ReverseModeVisitor.cpp | 11 ++++--- test/CUDA/GradientKernels.cu | 30 ++++++++++++++----- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index 17b974550..3467486cf 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -85,7 +85,7 @@ ValueAndPushforward cudaDeviceSynchronize_pushforward() template __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) atomicAdd(&destPtr[i], srcPtr[i]); } @@ -95,7 +95,7 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr, size_t* d_count, cudaMemcpyKind* d_kind) __attribute__((host)) { - T* aux_destPtr; + T* aux_destPtr = nullptr; if (kind == cudaMemcpyDeviceToHost) { *d_kind = cudaMemcpyHostToDevice; cudaMalloc(&aux_destPtr, count); @@ -125,7 +125,7 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, cudaFree(aux_destPtr); } else if (kind == cudaMemcpyHostToDevice) { // d_kind is device to host, so d_srcPtr is a host pointer - for (size_t i = 0; i < N; ++i) + for (size_t i = 0; i < N; i++) d_srcPtr[i] += aux_destPtr[i]; free(aux_destPtr); } diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 84b2079d3..4c7262f72 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -611,8 +611,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, if (!m_DiffReq.CUDAGlobalArgsIndexes.empty()) for (auto index : m_DiffReq.CUDAGlobalArgsIndexes) m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index)); - // If the function is a global kernel, all its parameters reside in the - // global memory of the GPU + // if the function is a global kernel, all the adjoint parameters reside in + // the global memory of the GPU. To facilitate the process, all the params + // of the kernel are added to the set. else if (m_DiffReq->hasAttr()) for (auto* param : params) m_CUDAGlobalArgs.emplace(param); @@ -3226,10 +3227,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, bool dxInForward = false; if (auto* callExpr = dyn_cast_or_null(stmtDx)) if (auto* FD = dyn_cast(callExpr->getCalleeDecl())) - if (utils::IsMemoryFunction(FD)) { - printf("%s\n", FD->getNameAsString().c_str()); + if (utils::IsMemoryFunction(FD)) dxInForward = true; - } if (stmtDx) { if (dxInForward) addToCurrentBlock(stmtDx, direction::forward); @@ -3257,7 +3256,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, if (size) { llvm::SmallVector args; - if (auto BinOp = dyn_cast(size)) { + if (auto* BinOp = dyn_cast(size)) { if (BinOp->getOpcode() == BO_Mul) { Expr* lhs = BinOp->getLHS(); Expr* rhs = BinOp->getRHS(); diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index f111d0d9b..5a5b33a07 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -427,9 +427,12 @@ void fn(double *out, double *in) { //CHECK-NEXT: } double fn_memory(double *out, double *in) { - kernel_call<<<1, 10>>>(out, in); + double *in_dev = nullptr; + cudaMalloc(&in_dev, 10 * sizeof(double)); + cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice); + kernel_call<<<1, 10>>>(out, in_dev); cudaDeviceSynchronize(); - double *out_host = (double*)malloc(10 * sizeof(double)); + double *out_host = (double *)malloc(10 * sizeof(double)); cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); double res = 0; for (int i=0; i < 10; ++i) { @@ -437,7 +440,7 @@ double fn_memory(double *out, double *in) { } free(out_host); cudaFree(out); - cudaFree(in); + cudaFree(in_dev); return res; } @@ -445,7 +448,12 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: int _d_i = 0; //CHECK-NEXT: int i = 0; //CHECK-NEXT: clad::tape _t1 = {}; -//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in); +//CHECK-NEXT: double *_d_in_dev = nullptr; +//CHECK-NEXT: double *in_dev = nullptr; +//CHECK-NEXT: cudaMalloc(&_d_in_dev, 10 * sizeof(double)); +//CHECK-NEXT: cudaMalloc(&in_dev, 10 * sizeof(double)); +//CHECK-NEXT: cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice); +//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in_dev); //CHECK-NEXT: cudaDeviceSynchronize(); //CHECK-NEXT: double *_d_out_host = (double *)calloc(10, sizeof(double)); //CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double)); @@ -481,10 +489,16 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1); //CHECK-NEXT: } //CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: { +//CHECK-NEXT: unsigned long _r0 = 0UL; +//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast(0U); +//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice, _d_in_dev, _d_in, &_r0, &_r1); +//CHECK-NEXT: } //CHECK-NEXT: free(out_host); //CHECK-NEXT: free(_d_out_host); //CHECK-NEXT: cudaFree(out); -//CHECK-NEXT: cudaFree(in); +//CHECK-NEXT: cudaFree(in_dev); +//CHECK-NEXT: cudaFree(_d_in_dev); //CHECK-NEXT:} // CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) { @@ -797,9 +811,8 @@ int main(void) { INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto test_memory = clad::gradient(fn_memory); - test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); - cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); - printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 60.00, 0.00, 0.00 + test_memory.execute(dummy_out_double, fives, d_out_double, zeros); + printf("%0.2f, %0.2f, %0.2f\n", zeros[0], zeros[1], zeros[2]); // CHECK-EXEC: 60.00, 0.00, 0.00 free(res); free(fives); @@ -808,6 +821,7 @@ int main(void) { cudaFree(d_in_double); cudaFree(val); cudaFree(d_val); + cudaFree(dummy_in_double); return 0; }