diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index 3467486cf..f3ade48e7 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -91,7 +91,7 @@ __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) { } template -void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, +void cudaMemcpy_pullback(T* destPtr, const T* srcPtr, const size_t count, cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr, size_t* d_count, cudaMemcpyKind* d_kind) __attribute__((host)) { diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 7acccd469..e05e28a74 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -1823,7 +1823,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, // If all arguments are constant literals, then this does not contribute to // the gradient. // FIXME: revert this when this is integrated in the activity analysis pass. - if (!isa(CE) && !isa(CE)) { + if (!isa(CE) && !isa(CE) && + CE->getCallReturnType(m_Context).getAsString() != "cudaError_t") { bool allArgsAreConstantLiterals = true; for (const Expr* arg : CE->arguments()) { // if it's of type MaterializeTemporaryExpr, then check its @@ -1847,7 +1848,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, // derived function. In the case of member functions, `implicit` // this object is always passed by reference. if (!dfdx() && !utils::HasAnyReferenceOrPointerArgument(FD) && - !isa(CE) && !isa(CE)) { + !isa(CE) && !isa(CE) && + CE->getCallReturnType(m_Context).getAsString() != "cudaError_t") { for (const Expr* Arg : CE->arguments()) { StmtDiff ArgDiff = Visit(Arg, dfdx()); CallArgs.push_back(ArgDiff.getExpr()); diff --git a/test/CUDA/tensor_demo.cu b/test/CUDA/tensor_demo.cu index 341b61a8b..c75de4942 100644 --- a/test/CUDA/tensor_demo.cu +++ b/test/CUDA/tensor_demo.cu @@ -1,11 +1,14 @@ +// RUN: %cladclang_cuda -I%S/../../include --cuda-path=%cudapath \ +// RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -otensor_demo.out %s +// RUN: ./tensor_demo.out | %filecheck_exec %s + + #include "clad/Differentiator/Differentiator.h" typedef unsigned long long int size_type; -__device__ void computeStartStep(size_type& A_start, size_type& A_step, size_type& B_start, size_type& B_step, const int idx, const size_type A_dim[3], const size_type B_dim[3], const int contractDims[2]) { +__device__ void computeStartStep(size_type& A_start, size_type& A_step, size_type& B_start, size_type& B_step, const int idx, const size_type A_dim[3], const size_type B_dim[3], const int contractDimA, const int contractDimB) { size_type A_a, A_b, A_c, B_d, B_e, B_f; - int contractDimA = contractDims[0]; - int contractDimB = contractDims[1]; switch (contractDimA) { case 0: @@ -50,10 +53,8 @@ __device__ void computeStartStep(size_type& A_start, size_type& A_step, size_typ } } -__global__ void tensorContraction3D(float* C, const float *A, const float *B, const size_type *A_dim, const size_type *B_dim, const int contractDims[2]) { +__global__ void tensorContraction3D(float* C, const float *A, const float *B, const size_type *A_dim, const size_type *B_dim, const int contractDimA, const int contractDimB) { int idx = blockIdx.x * blockDim.x + threadIdx.x; - int contractDimA = contractDims[0]; - int contractDimB = contractDims[1]; // Each thread computes one element of the output tensor int totalElements = A_dim[(contractDimA + 1) % 3] * A_dim[(contractDimA + 2) % 3] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3]; @@ -61,7 +62,7 @@ __global__ void tensorContraction3D(float* C, const float *A, const float *B, co size_type A_start, B_start, A_step, B_step; size_type A_a, A_b, A_c, B_d, B_e, B_f; - computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, contractDims); + computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, contractDimA, contractDimB); float sum = 0.0f; for (int i = 0; i < A_dim[contractDimA]; i++) { // A_dim[contractDimA] == B_dim[contractDimB] @@ -72,12 +73,12 @@ __global__ void tensorContraction3D(float* C, const float *A, const float *B, co } } -void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1, const size_type D2, const size_type D3, const size_type D4, const size_type D5) { +void launchTensorContraction3D(float* C, const float* A, const float* B, const size_type D1, const size_type D2, const size_type D3, const size_type D4, const size_type D5) { float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr; - size_type A_size = D1 * D2 * D3 * sizeof(float); - size_type B_size = D3 * D4 * D5 * sizeof(float); - size_type C_size = D1 * D2 * D4 * D5 * sizeof(float); + const size_type A_size = D1 * D2 * D3 * sizeof(float); + const size_type B_size = D3 * D4 * D5 * sizeof(float); + const size_type C_size = D1 * D2 * D4 * D5 * sizeof(float); // Allocate device memory and copy data from host to device cudaMalloc(&d_A, A_size); @@ -86,8 +87,8 @@ void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1, cudaMemcpy(d_A, A, A_size, cudaMemcpyHostToDevice); cudaMemcpy(d_B, B, B_size, cudaMemcpyHostToDevice); - size_type A_dim[3] = {D1, D2, D3}; - size_type B_dim[3] = {D3, D4, D5}; + const size_type A_dim[3] = {D1, D2, D3}; + const size_type B_dim[3] = {D3, D4, D5}; size_type *d_A_dim = nullptr, *d_B_dim = nullptr; cudaMalloc(&d_A_dim, 3 * sizeof(size_type)); @@ -95,13 +96,8 @@ void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1, cudaMemcpy(d_A_dim, A_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice); cudaMemcpy(d_B_dim, B_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice); - int contractDims[2] = {2, 0}; - int *d_contractDims = nullptr; - cudaMalloc(&d_contractDims, 2 * sizeof(int)); - cudaMemcpy(d_contractDims, contractDims, 2 * sizeof(int), cudaMemcpyHostToDevice); - // Launch the kernel - tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, d_contractDims); + tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, /*contractDimA=*/2, /*contractDimB=*/0); // Copy the result from device to host cudaMemcpy(C, d_C, C_size, cudaMemcpyDeviceToHost); @@ -112,7 +108,6 @@ void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1, cudaFree(d_C); cudaFree(d_A_dim); cudaFree(d_B_dim); - cudaFree(d_contractDims); } int main() { @@ -195,3 +190,55 @@ int main() { return 0; } + +// CHECK-EXEC: Result C: +// CHECK-NEXT: 130 140 +// CHECK-NEXT: 150 160 +// CHECK-NEXT: 170 180 +// CHECK-NEXT: +// CHECK-NEXT: 290 316 +// CHECK-NEXT: 342 368 +// CHECK-NEXT: 394 420 +// CHECK-NEXT: +// CHECK-NEXT: 450 492 +// CHECK-NEXT: 534 576 +// CHECK-NEXT: 618 660 +// CHECK-NEXT: +// CHECK-NEXT: +// CHECK-NEXT: 610 668 +// CHECK-NEXT: 726 784 +// CHECK-NEXT: 842 900 +// CHECK-NEXT: +// CHECK-NEXT: 770 844 +// CHECK-NEXT: 918 992 +// CHECK-NEXT: 1066 1140 +// CHECK-NEXT: +// CHECK-NEXT: 930 1020 +// CHECK-NEXT: 1110 1200 +// CHECK-NEXT: 1290 1380 + +// CHECK-EXEC: Result C_grad w.r.t. A: +// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: +// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: +// CHECK-EXEC: Result C_grad w.r.t. B: +// CHECK-NEXT: 66 66 +// CHECK-NEXT: 66 66 +// CHECK-NEXT: 66 66 +// CHECK-NEXT: +// CHECK-NEXT: 72 72 +// CHECK-NEXT: 72 72 +// CHECK-NEXT: 72 72 +// CHECK-NEXT: +// CHECK-NEXT: 78 78 +// CHECK-NEXT: 78 78 +// CHECK-NEXT: 78 78 +// CHECK-NEXT: +// CHECK-NEXT: 84 84 +// CHECK-NEXT: 84 84 +// CHECK-NEXT: 84 84 \ No newline at end of file