diff --git a/include/clad/Differentiator/DiffPlanner.h b/include/clad/Differentiator/DiffPlanner.h
index ef29b7246..a4b06a148 100644
--- a/include/clad/Differentiator/DiffPlanner.h
+++ b/include/clad/Differentiator/DiffPlanner.h
@@ -46,6 +46,8 @@ struct DiffRequest {
   clang::CallExpr* CallContext = nullptr;
   /// Args provided to the call to clad::gradient/differentiate.
   const clang::Expr* Args = nullptr;
+  /// Indexes of global GPU args of function as a subset of Args.
+  std::vector<size_t> CUDAGlobalArgsIndexes;
   /// Requested differentiation mode, forward or reverse.
   DiffMode Mode = DiffMode::unknown;
   /// If function appears in the call to clad::gradient/differentiate,
diff --git a/include/clad/Differentiator/ExternalRMVSource.h b/include/clad/Differentiator/ExternalRMVSource.h
index 4da9d09fc..72fc596b0 100644
--- a/include/clad/Differentiator/ExternalRMVSource.h
+++ b/include/clad/Differentiator/ExternalRMVSource.h
@@ -124,7 +124,7 @@ class ExternalRMVSource {
   /// This is called just before finalising `VisitReturnStmt`.
   virtual void ActBeforeFinalizingVisitReturnStmt(StmtDiff& retExprDiff) {}
 
-  /// This ic called just before finalising `VisitCallExpr`.
+  /// This is called just before finalising `VisitCallExpr`.
   ///
   /// \param CE call expression that is being visited.
   /// \param CallArgs
diff --git a/include/clad/Differentiator/ReverseModeVisitor.h b/include/clad/Differentiator/ReverseModeVisitor.h
index b044ee0ec..dabcfd256 100644
--- a/include/clad/Differentiator/ReverseModeVisitor.h
+++ b/include/clad/Differentiator/ReverseModeVisitor.h
@@ -38,6 +38,11 @@ namespace clad {
     // several private/protected members of the visitor classes.
     friend class ErrorEstimationHandler;
     llvm::SmallVector<const clang::ValueDecl*, 16> m_IndependentVars;
+    /// Set used to keep track of parameter variables w.r.t. which the
+    /// derivative (gradient) is being computed. This is separate from the
+    /// m_Variables map because all other intermediate variables will
+    /// not be stored here.
+    std::unordered_set<const clang::ValueDecl*> m_ParamVarsWithDiff;
     /// In addition to a sequence of forward-accumulated Stmts (m_Blocks), in
     /// the reverse mode we also accumulate Stmts for the reverse pass which
     /// will be executed on return.
@@ -51,6 +56,8 @@ namespace clad {
     /// that will be put immediately in the beginning of derivative function
     /// block.
     Stmts m_Globals;
+    /// Global GPU args of the function.
+    std::unordered_set<const clang::ParmVarDecl*> m_CUDAGlobalArgs;
     //// A reference to the output parameter of the gradient function.
     clang::Expr* m_Result;
     /// A flag indicating if the Stmt we are currently visiting is inside loop.
@@ -432,7 +439,7 @@ namespace clad {
 
     /// Helper function that checks whether the function to be derived
     /// is meant to be executed only by the GPU
-    bool shouldUseCudaAtomicOps();
+    bool shouldUseCudaAtomicOps(const clang::Expr* E);
 
     /// Add call to cuda::atomicAdd for the given LHS and RHS expressions.
     ///
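Note on the design (editor's illustration, not part of the patch): passing the target expression to shouldUseCudaAtomicOps reflects that atomicity is a property of where a derivative lives, not of the function being differentiated. When many GPU threads accumulate into the same adjoint slot in global memory, a plain `+=` is an unsynchronized read-modify-write; atomicAdd serializes it. A minimal sketch of the hazard, with illustrative names:

    // Illustrative only: both kernels add 1.0 from every thread to one slot.
    __global__ void racy_accumulate(double* d_val) {
      *d_val += 1.0; // data race: threads overwrite each other's updates
    }
    __global__ void safe_accumulate(double* d_val) {
      atomicAdd(d_val, 1.0); // serialized update, as the generated pullbacks below emit
    }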
diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp
index 2a7a16aa4..badbf2591 100644
--- a/lib/Differentiator/ReverseModeVisitor.cpp
+++ b/lib/Differentiator/ReverseModeVisitor.cpp
@@ -104,10 +104,15 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     return CladTapeResult{*this, PushExpr, PopExpr, TapeRef};
   }
 
-  bool ReverseModeVisitor::shouldUseCudaAtomicOps() {
-    return m_DiffReq->hasAttr<CUDAGlobalAttr>() ||
-           (m_DiffReq->hasAttr<CUDADeviceAttr>() &&
-            !m_DiffReq->hasAttr<CUDAHostAttr>());
+  bool ReverseModeVisitor::shouldUseCudaAtomicOps(const Expr* E) {
+    // Same as checking whether this is a function executed by the GPU
+    if (!m_CUDAGlobalArgs.empty())
+      if (const auto* DRE = dyn_cast<DeclRefExpr>(E))
+        if (const auto* PVD = dyn_cast<ParmVarDecl>(DRE->getDecl()))
+          // Check whether this param is in the global memory of the GPU
+          return m_CUDAGlobalArgs.find(PVD) != m_CUDAGlobalArgs.end();
+
+    return false;
   }
 
   clang::Expr* ReverseModeVisitor::BuildCallToCudaAtomicAdd(clang::Expr* LHS,
@@ -123,8 +128,13 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
         m_Sema.BuildDeclarationNameExpr(SS, lookupResult, /*ADL=*/true).get();
 
     Expr* finalLHS = LHS;
-    if (isa<ArraySubscriptExpr>(LHS))
+    if (auto* UO = dyn_cast<UnaryOperator>(LHS)) {
+      if (UO->getOpcode() == UnaryOperatorKind::UO_Deref)
+        finalLHS = UO->getSubExpr()->IgnoreImplicit();
+    } else if (!LHS->getType()->isPointerType() &&
+               !LHS->getType()->isReferenceType())
       finalLHS = BuildOp(UnaryOperatorKind::UO_AddrOf, LHS);
+
     llvm::SmallVector<Expr*, 2> atomicArgs = {finalLHS, RHS};
 
     assert(!m_Builder.noOverloadExists(UnresolvedLookup, atomicArgs) &&
@@ -440,6 +450,12 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     if (m_ExternalSource)
       m_ExternalSource->ActAfterCreatingDerivedFnParams(params);
 
+    // If the function is a global kernel, all its parameters reside in the
+    // global memory of the GPU.
+    if (m_DiffReq->hasAttr<CUDAGlobalAttr>())
+      for (auto* param : params)
+        m_CUDAGlobalArgs.emplace(param);
+
     llvm::ArrayRef<ParmVarDecl*> paramsRef =
         clad_compat::makeArrayRef(params.data(), params.size());
     gradientFD->setParams(paramsRef);
@@ -546,6 +562,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     auto derivativeName =
         utils::ComputeEffectiveFnName(m_DiffReq.Function) + "_pullback";
+    for (auto index : m_DiffReq.CUDAGlobalArgsIndexes)
+      derivativeName += "_" + std::to_string(index);
     auto DNI = utils::BuildDeclarationNameInfo(m_Sema, derivativeName);
 
     auto paramTypes = ComputeParamTypes(args);
@@ -587,6 +605,12 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       m_ExternalSource->ActAfterCreatingDerivedFnParams(params);
 
     m_Derivative->setParams(params);
+    // Match the global arguments of the call to the device function to the
+    // pullback function's parameters.
+    if (!m_DiffReq.CUDAGlobalArgsIndexes.empty())
+      for (auto index : m_DiffReq.CUDAGlobalArgsIndexes)
+        m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index));
+
     m_Derivative->setBody(nullptr);
 
     if (!m_DiffReq.DeclarationOnly) {
@@ -1519,7 +1543,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
           BuildArraySubscript(target, forwSweepDerivativeIndices);
       // Create the (target += dfdx) statement.
       if (dfdx()) {
-        if (shouldUseCudaAtomicOps()) {
+        if (shouldUseCudaAtomicOps(target)) {
           Expr* atomicCall = BuildCallToCudaAtomicAdd(result, dfdx());
           // Add it to the body statements.
           addToCurrentBlock(atomicCall, direction::reverse);
@@ -1583,9 +1607,17 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       // FIXME: not sure if this is generic.
       // Don't update derivatives of record types.
       if (!VD->getType()->isRecordType()) {
-        auto* add_assign = BuildOp(BO_AddAssign, it->second, dfdx());
-        // Add it to the body statements.
-        addToCurrentBlock(add_assign, direction::reverse);
+        Expr* base = it->second;
+        if (auto* UO = dyn_cast<UnaryOperator>(it->second))
+          base = UO->getSubExpr()->IgnoreImpCasts();
+        if (shouldUseCudaAtomicOps(base)) {
+          Expr* atomicCall = BuildCallToCudaAtomicAdd(it->second, dfdx());
+          // Add it to the body statements.
+          addToCurrentBlock(atomicCall, direction::reverse);
+        } else {
+          auto* add_assign = BuildOp(BO_AddAssign, it->second, dfdx());
+          addToCurrentBlock(add_assign, direction::reverse);
+        }
       }
     }
     return StmtDiff(clonedDRE, it->second, it->second);
@@ -1728,20 +1760,31 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     for (const Expr* Arg : CE->arguments()) {
       StmtDiff ArgDiff = Visit(Arg, dfdx());
       CallArgs.push_back(ArgDiff.getExpr());
-      DerivedCallArgs.push_back(ArgDiff.getExpr_dx());
+      if (auto* DRE = dyn_cast<DeclRefExpr>(ArgDiff.getExpr())) {
+        // If the arg is used for differentiation of the function, then we
+        // cannot free it in the end as it's the result to be returned to the
+        // user.
+        if (m_ParamVarsWithDiff.find(DRE->getDecl()) ==
+            m_ParamVarsWithDiff.end())
+          DerivedCallArgs.push_back(ArgDiff.getExpr_dx());
+      }
     }
     Expr* call =
         m_Sema
             .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc,
                            llvm::MutableArrayRef<Expr*>(CallArgs), Loc)
             .get();
-    Expr* call_dx =
-        m_Sema
-            .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc,
-                           llvm::MutableArrayRef<Expr*>(DerivedCallArgs), Loc)
-            .get();
     m_DeallocExprs.push_back(call);
-    m_DeallocExprs.push_back(call_dx);
+
+    if (!DerivedCallArgs.empty()) {
+      Expr* call_dx =
+          m_Sema
+              .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc,
+                             llvm::MutableArrayRef<Expr*>(DerivedCallArgs),
+                             Loc)
+              .get();
+      m_DeallocExprs.push_back(call_dx);
+    }
 
     return StmtDiff();
   }
@@ -1887,6 +1930,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     // If it has more args or f_darg0 was not found, we look for its pullback
     // function.
     const auto* MD = dyn_cast<CXXMethodDecl>(FD);
+    std::vector<size_t> globalCallArgs;
     if (!OverloadedDerivedFn) {
       size_t idx = 0;
@@ -1952,12 +1996,23 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
                                               pullback);
 
         // Try to find it in builtin derivatives
+        std::string customPullback =
+            clad::utils::ComputeEffectiveFnName(FD) + "_pullback";
+        // Add the indexes of the global args to the custom pullback name
+        if (!m_CUDAGlobalArgs.empty())
+          for (size_t i = 0; i < pullbackCallArgs.size(); i++)
+            if (auto* DRE = dyn_cast<DeclRefExpr>(pullbackCallArgs[i]))
+              if (auto* param = dyn_cast<ParmVarDecl>(DRE->getDecl()))
+                if (m_CUDAGlobalArgs.find(param) != m_CUDAGlobalArgs.end()) {
+                  customPullback += "_" + std::to_string(i);
+                  globalCallArgs.emplace_back(i);
+                }
+
         if (baseDiff.getExpr())
           pullbackCallArgs.insert(
               pullbackCallArgs.begin(),
               BuildOp(UnaryOperatorKind::UO_AddrOf, baseDiff.getExpr()));
-        std::string customPullback =
-            clad::utils::ComputeEffectiveFnName(FD) + "_pullback";
+
         OverloadedDerivedFn =
             m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
                 customPullback, pullbackCallArgs, getCurrentScope(),
@@ -1990,6 +2045,11 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
           // derive the called function.
           DiffRequest pullbackRequest{};
           pullbackRequest.Function = FD;
+
+          // Mark the indexes of the global args. Necessary if the argument of
+          // the call has a different name than the function's signature
+          // parameter.
+          pullbackRequest.CUDAGlobalArgsIndexes = globalCallArgs;
+
           pullbackRequest.BaseFunctionName =
               clad::utils::ComputeEffectiveFnName(FD);
           pullbackRequest.Mode = DiffMode::experimental_pullback;
@@ -2237,12 +2297,15 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
           ConstantFolder::synthesizeLiteral(m_Context.IntTy, m_Context, i);
       Expr* gradElem = BuildArraySubscript(gradRef, {idx});
       Expr* gradExpr = BuildOp(BO_Mul, dfdx, gradElem);
+      // Inputs were not pointers, so the output args are not in global GPU
+      // memory. Hence, no need to use atomic ops.
       PostCallStmts.push_back(BuildOp(BO_AddAssign, outputArgs[i], gradExpr));
       NumDiffArgs.push_back(args[i]);
     }
     std::string Name = "central_difference";
     return m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
-        Name, NumDiffArgs, getCurrentScope(), /*OriginalFnDC=*/nullptr,
+        Name, NumDiffArgs, getCurrentScope(),
+        /*OriginalFnDC=*/nullptr,
         /*forCustomDerv=*/false, /*namespaceShouldExist=*/false);
   }
@@ -2343,8 +2406,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     else {
       derivedE = BuildOp(UnaryOperatorKind::UO_Deref, diff_dx);
       // Create the (target += dfdx) statement.
-      if (dfdx()) {
-        if (shouldUseCudaAtomicOps()) {
+      if (dfdx() && derivedE) {
+        if (shouldUseCudaAtomicOps(diff_dx)) {
           Expr* atomicCall = BuildCallToCudaAtomicAdd(diff_dx, dfdx());
           // Add it to the body statements.
           addToCurrentBlock(atomicCall, direction::reverse);
@@ -4556,6 +4619,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
           m_Variables[*it] =
              utils::BuildParenExpr(m_Sema, m_Variables[*it]);
         }
+        m_ParamVarsWithDiff.emplace(*it);
       }
     }
   }
diff --git a/lib/Differentiator/VisitorBase.cpp b/lib/Differentiator/VisitorBase.cpp
index b6156397c..63ae3c369 100644
--- a/lib/Differentiator/VisitorBase.cpp
+++ b/lib/Differentiator/VisitorBase.cpp
@@ -783,7 +783,8 @@ namespace clad {
     // Return the found overload.
     std::string Name = "forward_central_difference";
     return m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
-        Name, NumDiffArgs, getCurrentScope(), /*OriginalFnDC=*/nullptr,
+        Name, NumDiffArgs, getCurrentScope(),
+        /*OriginalFnDC=*/nullptr,
         /*forCustomDerv=*/false, /*namespaceShouldExist=*/false);
   }
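A note on the naming scheme introduced above (inferred from the code, for the reader's orientation): the suffix appended to `<fn>_pullback` lists the positions, within the pullback call's argument list, of arguments that refer to parameters residing in GPU global memory. Since a kernel's own parameters and their adjoints are all global, the same device function can be specialized per differentiation request, as in the tests below:

    // device_fn_2(in, val) called from a kernel differentiated w.r.t. 'out, in':
    //   pullback call args:      (in, val, _r_d0, _d_in, &_r0)
    //   global-memory positions:   0   1          3
    //   generated pullback name: device_fn_2_pullback_0_1_3

Encoding the indexes in the name keeps distinct specializations apart while letting a second kernel that makes the same call (dup_kernel_with_device_call_2 below) reuse an already generated pullback instead of regenerating it.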
diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index 171341a7e..a60604fa6 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -288,6 +288,186 @@ __global__ void add_kernel_7(double *a, double *b) {
 //CHECK-NEXT: }
 //CHECK-NEXT:}
 
+__device__ double device_fn(double in, double val) {
+  return in + val;
+}
+
+__global__ void kernel_with_device_call(double *out, double *in, double val) {
+  int index = threadIdx.x;
+  out[index] = device_fn(in[index], val);
+}
+
+// CHECK: void kernel_with_device_call_grad_0_2(double *out, double *in, double val, double *_d_out, double *_d_val) {
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x;
+//CHECK-NEXT:    double _t0 = out[index0];
+//CHECK-NEXT:    out[index0] = device_fn(in[index0], val);
+//CHECK-NEXT:    {
+//CHECK-NEXT:        out[index0] = _t0;
+//CHECK-NEXT:        double _r_d0 = _d_out[index0];
+//CHECK-NEXT:        _d_out[index0] = 0.;
+//CHECK-NEXT:        double _r0 = 0.;
+//CHECK-NEXT:        double _r1 = 0.;
+//CHECK-NEXT:        device_fn_pullback_1(in[index0], val, _r_d0, &_r0, &_r1);
+//CHECK-NEXT:        atomicAdd(_d_val, _r1);
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+__device__ double device_fn_2(double *in, double val) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  return in[index] + val;
+}
+
+__global__ void kernel_with_device_call_2(double *out, double *in, double val) {
+  int index = threadIdx.x;
+  out[index] = device_fn_2(in, val);
+}
+
+__global__ void dup_kernel_with_device_call_2(double *out, double *in, double val) {
+  int index = threadIdx.x;
+  out[index] = device_fn_2(in, val);
+}
+
+// CHECK: void kernel_with_device_call_2_grad_0_2(double *out, double *in, double val, double *_d_out, double *_d_val) {
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x;
+//CHECK-NEXT:    double _t0 = out[index0];
+//CHECK-NEXT:    out[index0] = device_fn_2(in, val);
+//CHECK-NEXT:    {
+//CHECK-NEXT:        out[index0] = _t0;
+//CHECK-NEXT:        double _r_d0 = _d_out[index0];
+//CHECK-NEXT:        _d_out[index0] = 0.;
+//CHECK-NEXT:        double _r0 = 0.;
+//CHECK-NEXT:        device_fn_2_pullback_0_1(in, val, _r_d0, &_r0);
+//CHECK-NEXT:        atomicAdd(_d_val, _r0);
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: void kernel_with_device_call_2_grad_0_1(double *out, double *in, double val, double *_d_out, double *_d_in) {
+//CHECK-NEXT:    double _d_val = 0.;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x;
+//CHECK-NEXT:    double _t0 = out[index0];
+//CHECK-NEXT:    out[index0] = device_fn_2(in, val);
+//CHECK-NEXT:    {
+//CHECK-NEXT:        out[index0] = _t0;
+//CHECK-NEXT:        double _r_d0 = _d_out[index0];
+//CHECK-NEXT:        _d_out[index0] = 0.;
+//CHECK-NEXT:        double _r0 = 0.;
+//CHECK-NEXT:        device_fn_2_pullback_0_1_3(in, val, _r_d0, _d_in, &_r0);
+//CHECK-NEXT:        _d_val += _r0;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+__device__ double device_fn_3(double *in, double *val) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  return in[index] + *val;
+}
+
+__global__ void kernel_with_device_call_3(double *out, double *in, double *val) {
+  int index = threadIdx.x;
+  out[index] = device_fn_3(in, val);
+}
+
+// CHECK: void kernel_with_device_call_3_grad(double *out, double *in, double *val, double *_d_out, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x;
+//CHECK-NEXT:    double _t0 = out[index0];
+//CHECK-NEXT:    out[index0] = device_fn_3(in, val);
+//CHECK-NEXT:    {
+//CHECK-NEXT:        out[index0] = _t0;
+//CHECK-NEXT:        double _r_d0 = _d_out[index0];
+//CHECK-NEXT:        _d_out[index0] = 0.;
+//CHECK-NEXT:        device_fn_3_pullback_0_1_3_4(in, val, _r_d0, _d_in, _d_val);
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+__device__ double device_fn_4(double *in, double val) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  return in[index] + val;
+}
+
+__device__ double device_with_device_call(double *in, double val) {
+  return device_fn_4(in, val);
+}
+
+__global__ void kernel_with_nested_device_call(double *out, double *in, double val) {
+  int index = threadIdx.x;
+  out[index] = device_with_device_call(in, val);
+}
+
+// CHECK: void kernel_with_nested_device_call_grad_0_1(double *out, double *in, double val, double *_d_out, double *_d_in) {
+//CHECK-NEXT:    double _d_val = 0.;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x;
+//CHECK-NEXT:    double _t0 = out[index0];
+//CHECK-NEXT:    out[index0] = device_with_device_call(in, val);
+//CHECK-NEXT:    {
+//CHECK-NEXT:        out[index0] = _t0;
+//CHECK-NEXT:        double _r_d0 = _d_out[index0];
+//CHECK-NEXT:        _d_out[index0] = 0.;
+//CHECK-NEXT:        double _r0 = 0.;
+//CHECK-NEXT:        device_with_device_call_pullback_0_1_3(in, val, _r_d0, _d_in, &_r0);
+//CHECK-NEXT:        _d_val += _r0;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    {
+//CHECK-NEXT:        *_d_in += _d_y;
+//CHECK-NEXT:        *_d_val += _d_y;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_2_pullback_0_1(double *in, double val, double _d_y, double *_d_val) {
+//CHECK-NEXT:    unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT:    unsigned int _t0 = blockDim.x;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT:    *_d_val += _d_y;
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_2_pullback_0_1_3(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT:    unsigned int _t0 = blockDim.x;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT:    {
+//CHECK-NEXT:        atomicAdd(&_d_in[index0], _d_y);
+//CHECK-NEXT:        *_d_val += _d_y;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_3_pullback_0_1_3_4(double *in, double *val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT:    unsigned int _t0 = blockDim.x;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT:    {
+//CHECK-NEXT:        atomicAdd(&_d_in[index0], _d_y);
+//CHECK-NEXT:        atomicAdd(_d_val, _d_y);
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_with_device_call_pullback_0_1_3(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    {
+//CHECK-NEXT:        double _r0 = 0.;
+//CHECK-NEXT:        device_fn_4_pullback_0_1_3(in, val, _d_y, _d_in, &_r0);
+//CHECK-NEXT:        *_d_val += _r0;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_4_pullback_0_1_3(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT:    unsigned int _t0 = blockDim.x;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT:    {
+//CHECK-NEXT:        atomicAdd(&_d_in[index0], _d_y);
+//CHECK-NEXT:        *_d_val += _d_y;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
 #define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \
 { \
   int *fives = (int*)malloc(N * sizeof(int)); \
@@ -345,9 +525,9 @@ __global__ void add_kernel_7(double *a, double *b) {
   else { \
     test.execute_kernel(grid, block, y, x, dy, dx); \
   } \
-  cudaDeviceSynchronize(); \
   int *res = (int*)malloc(N * sizeof(int)); \
   cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
+  cudaDeviceSynchronize(); \
   for (int i = 0; i < (N - 1); i++) { \
     printf("%d, ", res[i]); \
   } \
@@ -380,9 +560,9 @@ __global__ void add_kernel_7(double *a, double *b) {
   else { \
     test.execute_kernel(grid, block, y, x, N, dy, dx); \
   } \
-  cudaDeviceSynchronize(); \
   int *res = (int*)malloc(N * sizeof(int)); \
   cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
+  cudaDeviceSynchronize(); \
   for (int i = 0; i < (N - 1); i++) { \
     printf("%d, ", res[i]); \
   } \
@@ -415,9 +595,9 @@ __global__ void add_kernel_7(double *a, double *b) {
   else { \
     test.execute_kernel(grid, block, y, x, dy, dx); \
   } \
-  cudaDeviceSynchronize(); \
   double *res = (double*)malloc(N * sizeof(double)); \
   cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \
+  cudaDeviceSynchronize(); \
   for (int i = 0; i < (N - 1); i++) { \
     printf("%0.2f, ", res[i]); \
   } \
@@ -427,6 +607,25 @@ __global__ void add_kernel_7(double *a, double *b) {
   free(res); \
 }
 
+#define INIT(x, y, val, dx, dy, d_val) \
+{ \
+  double *fives = (double*)malloc(10 * sizeof(double)); \
+  for(int i = 0; i < 10; i++) { \
+    fives[i] = 5; \
+  } \
+  double *zeros = (double*)malloc(10 * sizeof(double)); \
+  for(int i = 0; i < 10; i++) { \
+    zeros[i] = 0; \
+  } \
+  cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(val, fives, sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); \
+  free(fives); \
+  free(zeros); \
+}
 
 int main(void) {
   int *a, *d_a;
@@ -472,11 +671,71 @@ int main(void) {
   TEST_2_D(add_kernel_7, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out_double, dummy_in_double, d_out_double, d_in_double, 10); // CHECK-EXEC: 50.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00
 
+  double *val;
+  cudaMalloc(&val, sizeof(double));
+  double *d_val;
+  cudaMalloc(&d_val, sizeof(double));
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto test_device = clad::gradient(kernel_with_device_call, "out, val");
+  test_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
+  double *res = (double*)malloc(10 * sizeof(double));
+  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val");
+  test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
+  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto check_dup = clad::gradient(dup_kernel_with_device_call_2, "out, val"); // check that the pullback function is not regenerated
+  check_dup.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
+  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error
+  printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in");
+  test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
+  cudaDeviceSynchronize();
+  cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
+  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto test_device_4 = clad::gradient(kernel_with_device_call_3);
+  test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, val, d_out_double, d_in_double, d_val);
+  cudaDeviceSynchronize();
+  cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
+  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
+  printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in");
+  nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
+  cudaDeviceSynchronize();
+  cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
+  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
+
+  free(res);
   cudaFree(dummy_in_double);
   cudaFree(dummy_out_double);
   cudaFree(d_out_double);
   cudaFree(d_in_double);
-
+  cudaFree(val);
+  cudaFree(d_val);
   return 0;
 }
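Condensed usage sketch (mirrors the tests above, not additional API surface): differentiate a kernel that calls a device function, then launch the generated gradient kernel.

    __device__ double device_fn(double in, double val) { return in + val; }

    __global__ void kernel(double *out, double *in, double val) {
      out[threadIdx.x] = device_fn(in[threadIdx.x], val);
    }

    // Gradient w.r.t. 'out' and 'val'; execute_kernel takes the launch
    // configuration first, then the original and adjoint arguments.
    auto grad = clad::gradient(kernel, "out, val");
    grad.execute_kernel(dim3(1), dim3(10, 1, 1), out, in, 5.0, d_out, d_val);
    cudaDeviceSynchronize();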