Don't skip cuda host functions when they have const args
kchristin22 committed Nov 3, 2024
1 parent d236dcf commit bf26f58
Showing 3 changed files with 72 additions and 23 deletions.
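
The scenario this commit targets, as a minimal illustrative sketch (the names and sizes below are assumptions, not code from the repository): reverse-mode differentiation of a host-side wrapper whose CUDA runtime calls take const-qualified or literal arguments. Such calls must still be visited, because the data they move contributes to the gradient.

#include "clad/Differentiator/Differentiator.h"

__global__ void square(double* p) { *p *= *p; }

// Host wrapper: its cudaMemcpy calls have a const source pointer and a
// literal byte count, yet they clearly matter for the derivative.
void hostSquare(const double* x, double* out) {
  double* d = nullptr;
  cudaMalloc(&d, sizeof(double));
  cudaMemcpy(d, x, sizeof(double), cudaMemcpyHostToDevice);
  square<<<1, 1>>>(d);
  cudaMemcpy(out, d, sizeof(double), cudaMemcpyDeviceToHost);
  cudaFree(d);
}

int main() {
  double x = 3.0, dx = 0.0, out = 0.0, dout = 1.0;
  auto grad = clad::gradient(hostSquare);  // must not drop the cudaMemcpy calls above
  grad.execute(&x, &out, &dx, &dout);      // usual original-args-then-adjoints convention
  return 0;
}
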
2 changes: 1 addition & 1 deletion include/clad/Differentiator/BuiltinDerivatives.h
@@ -91,7 +91,7 @@ __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) {
}

template <typename T>
-void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
+void cudaMemcpy_pullback(T* destPtr, const T* srcPtr, const size_t count,
cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr,
size_t* d_count, cudaMemcpyKind* d_kind)
__attribute__((host)) {
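
For reference (not part of the diff): the runtime API itself declares the source as const — cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) — and the updated test below passes const-qualified pointers and sizes. A minimal sketch of the kind of call site the relaxed pullback parameters are meant to match:

#include <cuda_runtime.h>

// Call site with a const-qualified source pointer and byte count,
// mirroring what the updated tensor_demo.cu test exercises.
void copyToDevice(float* d_dst, const float* h_src, const size_t bytes) {
  cudaMemcpy(d_dst, h_src, bytes, cudaMemcpyHostToDevice);
}
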
6 changes: 4 additions & 2 deletions lib/Differentiator/ReverseModeVisitor.cpp
@@ -1823,7 +1823,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
// If all arguments are constant literals, then this does not contribute to
// the gradient.
// FIXME: revert this when this is integrated in the activity analysis pass.
-if (!isa<CXXMemberCallExpr>(CE) && !isa<CXXOperatorCallExpr>(CE)) {
+if (!isa<CXXMemberCallExpr>(CE) && !isa<CXXOperatorCallExpr>(CE) &&
+    CE->getCallReturnType(m_Context).getAsString() != "cudaError_t") {
bool allArgsAreConstantLiterals = true;
for (const Expr* arg : CE->arguments()) {
// if it's of type MaterializeTemporaryExpr, then check its
@@ -1847,7 +1848,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
// derived function. In the case of member functions, `implicit`
// this object is always passed by reference.
if (!dfdx() && !utils::HasAnyReferenceOrPointerArgument(FD) &&
-    !isa<CXXMemberCallExpr>(CE) && !isa<CXXOperatorCallExpr>(CE)) {
+    !isa<CXXMemberCallExpr>(CE) && !isa<CXXOperatorCallExpr>(CE) &&
+    CE->getCallReturnType(m_Context).getAsString() != "cudaError_t") {
for (const Expr* Arg : CE->arguments()) {
StmtDiff ArgDiff = Visit(Arg, dfdx());
CallArgs.push_back(ArgDiff.getExpr());
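
A condensed sketch of the added guard (illustrative only; isCudaRuntimeCall is a hypothetical helper, not part of this commit): any call whose return type prints as cudaError_t — cudaMemcpy, cudaMalloc, cudaFree, and the rest of the runtime API — is exempted from both skip heuristics, regardless of how constant its arguments look.

#include "clang/AST/ASTContext.h"
#include "clang/AST/Expr.h"

// Hypothetical helper mirroring the new condition in ReverseModeVisitor.
static bool isCudaRuntimeCall(const clang::CallExpr* CE,
                              clang::ASTContext& Ctx) {
  return CE->getCallReturnType(Ctx).getAsString() == "cudaError_t";
}
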
87 changes: 67 additions & 20 deletions test/CUDA/tensor_demo.cu
@@ -1,11 +1,14 @@
// RUN: %cladclang_cuda -I%S/../../include --cuda-path=%cudapath \
// RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -otensor_demo.out %s
// RUN: ./tensor_demo.out | %filecheck_exec %s


#include "clad/Differentiator/Differentiator.h"

typedef unsigned long long int size_type;

-__device__ void computeStartStep(size_type& A_start, size_type& A_step, size_type& B_start, size_type& B_step, const int idx, const size_type A_dim[3], const size_type B_dim[3], const int contractDims[2]) {
+__device__ void computeStartStep(size_type& A_start, size_type& A_step, size_type& B_start, size_type& B_step, const int idx, const size_type A_dim[3], const size_type B_dim[3], const int contractDimA, const int contractDimB) {
size_type A_a, A_b, A_c, B_d, B_e, B_f;
-int contractDimA = contractDims[0];
-int contractDimB = contractDims[1];

switch (contractDimA) {
case 0:
@@ -50,18 +53,16 @@ __device__ void computeStartStep(size_type& A_start, size_type& A_step, size_typ
}
}

-__global__ void tensorContraction3D(float* C, const float *A, const float *B, const size_type *A_dim, const size_type *B_dim, const int contractDims[2]) {
+__global__ void tensorContraction3D(float* C, const float *A, const float *B, const size_type *A_dim, const size_type *B_dim, const int contractDimA, const int contractDimB) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
-int contractDimA = contractDims[0];
-int contractDimB = contractDims[1];

// Each thread computes one element of the output tensor
int totalElements = A_dim[(contractDimA + 1) % 3] * A_dim[(contractDimA + 2) % 3] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3];
if (idx < totalElements) {
size_type A_start, B_start, A_step, B_step;
size_type A_a, A_b, A_c, B_d, B_e, B_f;

-computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, contractDims);
+computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, contractDimA, contractDimB);

float sum = 0.0f;
for (int i = 0; i < A_dim[contractDimA]; i++) { // A_dim[contractDimA] == B_dim[contractDimB]
@@ -72,12 +73,12 @@ __global__ void tensorContraction3D(float* C, const float *A, const float *B, co
}
}

-void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1, const size_type D2, const size_type D3, const size_type D4, const size_type D5) {
+void launchTensorContraction3D(float* C, const float* A, const float* B, const size_type D1, const size_type D2, const size_type D3, const size_type D4, const size_type D5) {
float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

-size_type A_size = D1 * D2 * D3 * sizeof(float);
-size_type B_size = D3 * D4 * D5 * sizeof(float);
-size_type C_size = D1 * D2 * D4 * D5 * sizeof(float);
+const size_type A_size = D1 * D2 * D3 * sizeof(float);
+const size_type B_size = D3 * D4 * D5 * sizeof(float);
+const size_type C_size = D1 * D2 * D4 * D5 * sizeof(float);

// Allocate device memory and copy data from host to device
cudaMalloc(&d_A, A_size);
@@ -86,22 +87,17 @@ void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1,
cudaMemcpy(d_A, A, A_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, B_size, cudaMemcpyHostToDevice);

-size_type A_dim[3] = {D1, D2, D3};
-size_type B_dim[3] = {D3, D4, D5};
+const size_type A_dim[3] = {D1, D2, D3};
+const size_type B_dim[3] = {D3, D4, D5};

size_type *d_A_dim = nullptr, *d_B_dim = nullptr;
cudaMalloc(&d_A_dim, 3 * sizeof(size_type));
cudaMalloc(&d_B_dim, 3 * sizeof(size_type));
cudaMemcpy(d_A_dim, A_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_dim, B_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice);

-int contractDims[2] = {2, 0};
-int *d_contractDims = nullptr;
-cudaMalloc(&d_contractDims, 2 * sizeof(int));
-cudaMemcpy(d_contractDims, contractDims, 2 * sizeof(int), cudaMemcpyHostToDevice);

// Launch the kernel
-tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, d_contractDims);
+tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, /*contractDimA=*/2, /*contractDimB=*/0);

// Copy the result from device to host
cudaMemcpy(C, d_C, C_size, cudaMemcpyDeviceToHost);
@@ -112,7 +108,6 @@ void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1,
cudaFree(d_C);
cudaFree(d_A_dim);
cudaFree(d_B_dim);
-cudaFree(d_contractDims);
}

int main() {
@@ -195,3 +190,55 @@ int main() {

return 0;
}

// CHECK-EXEC: Result C:
// CHECK-NEXT: 130 140
// CHECK-NEXT: 150 160
// CHECK-NEXT: 170 180
// CHECK-NEXT:
// CHECK-NEXT: 290 316
// CHECK-NEXT: 342 368
// CHECK-NEXT: 394 420
// CHECK-NEXT:
// CHECK-NEXT: 450 492
// CHECK-NEXT: 534 576
// CHECK-NEXT: 618 660
// CHECK-NEXT:
// CHECK-NEXT:
// CHECK-NEXT: 610 668
// CHECK-NEXT: 726 784
// CHECK-NEXT: 842 900
// CHECK-NEXT:
// CHECK-NEXT: 770 844
// CHECK-NEXT: 918 992
// CHECK-NEXT: 1066 1140
// CHECK-NEXT:
// CHECK-NEXT: 930 1020
// CHECK-NEXT: 1110 1200
// CHECK-NEXT: 1290 1380

// CHECK-EXEC: Result C_grad w.r.t. A:
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT:
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT:
// CHECK-EXEC: Result C_grad w.r.t. B:
// CHECK-NEXT: 66 66
// CHECK-NEXT: 66 66
// CHECK-NEXT: 66 66
// CHECK-NEXT:
// CHECK-NEXT: 72 72
// CHECK-NEXT: 72 72
// CHECK-NEXT: 72 72
// CHECK-NEXT:
// CHECK-NEXT: 78 78
// CHECK-NEXT: 78 78
// CHECK-NEXT: 78 78
// CHECK-NEXT:
// CHECK-NEXT: 84 84
// CHECK-NEXT: 84 84
// CHECK-NEXT: 84 84
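
The body of main() is elided from the hunk above; as a rough sketch of the kind of driver that exercises this change (extents, initialization, and the gradient request are assumptions, not the committed test code):

#include <vector>

int main() {
  const size_type D1 = 2, D2 = 3, D3 = 3, D4 = 2, D5 = 2;   // assumed extents
  std::vector<float> A(D1 * D2 * D3), B(D3 * D4 * D5), C(D1 * D2 * D4 * D5);
  // ... fill A and B with test data ...

  launchTensorContraction3D(C.data(), A.data(), B.data(), D1, D2, D3, D4, D5);

  // Differentiating the host wrapper is what this commit enables: its
  // cudaMemcpy/cudaMalloc calls are kept even though most of its parameters
  // are now const-qualified.
  auto grad = clad::gradient(launchTensorContraction3D, "C, A, B");
  return 0;
}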
