diff --git a/include/clad/Differentiator/DiffPlanner.h b/include/clad/Differentiator/DiffPlanner.h
index ef29b7246..a4b06a148 100644
--- a/include/clad/Differentiator/DiffPlanner.h
+++ b/include/clad/Differentiator/DiffPlanner.h
@@ -46,6 +46,8 @@ struct DiffRequest {
   clang::CallExpr* CallContext = nullptr;
   /// Args provided to the call to clad::gradient/differentiate.
   const clang::Expr* Args = nullptr;
+  /// Indexes of global GPU args of function as a subset of Args.
+  std::vector<size_t> CUDAGlobalArgsIndexes;
   /// Requested differentiation mode, forward or reverse.
   DiffMode Mode = DiffMode::unknown;
   /// If function appears in the call to clad::gradient/differentiate,
diff --git a/include/clad/Differentiator/ExternalRMVSource.h b/include/clad/Differentiator/ExternalRMVSource.h
index 4da9d09fc..72fc596b0 100644
--- a/include/clad/Differentiator/ExternalRMVSource.h
+++ b/include/clad/Differentiator/ExternalRMVSource.h
@@ -124,7 +124,7 @@ class ExternalRMVSource {
   /// This is called just before finalising `VisitReturnStmt`.
   virtual void ActBeforeFinalizingVisitReturnStmt(StmtDiff& retExprDiff) {}
 
-  /// This ic called just before finalising `VisitCallExpr`.
+  /// This is called just before finalising `VisitCallExpr`.
   ///
   /// \param CE call expression that is being visited.
   /// \param CallArgs
diff --git a/include/clad/Differentiator/ReverseModeVisitor.h b/include/clad/Differentiator/ReverseModeVisitor.h
index b044ee0ec..dabcfd256 100644
--- a/include/clad/Differentiator/ReverseModeVisitor.h
+++ b/include/clad/Differentiator/ReverseModeVisitor.h
@@ -38,6 +38,11 @@ namespace clad {
     // several private/protected members of the visitor classes.
     friend class ErrorEstimationHandler;
     llvm::SmallVector<const clang::ValueDecl*, 16> m_IndependentVars;
+    /// Set used to keep track of parameter variables w.r.t. which the
+    /// derivative (gradient) is being computed. This is separate from the
+    /// m_Variables map because all other intermediate variables will
+    /// not be stored here.
+    std::unordered_set<const clang::ValueDecl*> m_ParamVarsWithDiff;
     /// In addition to a sequence of forward-accumulated Stmts (m_Blocks), in
     /// the reverse mode we also accumulate Stmts for the reverse pass which
     /// will be executed on return.
@@ -51,6 +56,8 @@ namespace clad {
     /// that will be put immediately in the beginning of derivative function
     /// block.
     Stmts m_Globals;
+    /// Global GPU args of the function.
+    std::unordered_set<const clang::ParmVarDecl*> m_CUDAGlobalArgs;
     //// A reference to the output parameter of the gradient function.
     clang::Expr* m_Result;
     /// A flag indicating if the Stmt we are currently visiting is inside loop.
@@ -432,7 +439,7 @@ namespace clad {
 
     /// Helper function that checks whether the function to be derived
     /// is meant to be executed only by the GPU
-    bool shouldUseCudaAtomicOps();
+    bool shouldUseCudaAtomicOps(const clang::Expr* E);
 
     /// Add call to cuda::atomicAdd for the given LHS and RHS expressions.
     ///
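Note on the design (editor's illustration, not part of the patch): passing the target expression to shouldUseCudaAtomicOps reflects that atomicity is a property of where a derivative lives, not of the function being differentiated. When many GPU threads accumulate into the same adjoint slot in global memory, a plain `+=` is an unsynchronized read-modify-write; atomicAdd serializes it. A minimal sketch of the hazard, with illustrative names:

    // Illustrative only: both kernels add 1.0 from every thread to one slot.
    __global__ void racy_accumulate(double* d_val) {
      *d_val += 1.0; // data race: threads overwrite each other's updates
    }
    __global__ void safe_accumulate(double* d_val) {
      atomicAdd(d_val, 1.0); // serialized update, as the generated pullbacks below emit
    }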
diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp
index 2a7a16aa4..badbf2591 100644
--- a/lib/Differentiator/ReverseModeVisitor.cpp
+++ b/lib/Differentiator/ReverseModeVisitor.cpp
@@ -104,10 +104,15 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     return CladTapeResult{*this, PushExpr, PopExpr, TapeRef};
   }
 
-  bool ReverseModeVisitor::shouldUseCudaAtomicOps() {
-    return m_DiffReq->hasAttr<CUDAGlobalAttr>() ||
-           (m_DiffReq->hasAttr<CUDADeviceAttr>() &&
-            !m_DiffReq->hasAttr<CUDAHostAttr>());
+  bool ReverseModeVisitor::shouldUseCudaAtomicOps(const Expr* E) {
+    // Same as checking whether this is a function executed by the GPU
+    if (!m_CUDAGlobalArgs.empty())
+      if (const auto* DRE = dyn_cast<DeclRefExpr>(E))
+        if (const auto* PVD = dyn_cast<ParmVarDecl>(DRE->getDecl()))
+          // Check whether this param is in the global memory of the GPU
+          return m_CUDAGlobalArgs.find(PVD) != m_CUDAGlobalArgs.end();
+
+    return false;
   }
 
   clang::Expr* ReverseModeVisitor::BuildCallToCudaAtomicAdd(clang::Expr* LHS,
@@ -123,8 +128,13 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
         m_Sema.BuildDeclarationNameExpr(SS, lookupResult, /*ADL=*/true).get();
 
     Expr* finalLHS = LHS;
-    if (isa<ArraySubscriptExpr>(LHS))
+    if (auto* UO = dyn_cast<UnaryOperator>(LHS)) {
+      if (UO->getOpcode() == UnaryOperatorKind::UO_Deref)
+        finalLHS = UO->getSubExpr()->IgnoreImplicit();
+    } else if (!LHS->getType()->isPointerType() &&
+               !LHS->getType()->isReferenceType())
       finalLHS = BuildOp(UnaryOperatorKind::UO_AddrOf, LHS);
+
     llvm::SmallVector<Expr*, 2> atomicArgs = {finalLHS, RHS};
 
     assert(!m_Builder.noOverloadExists(UnresolvedLookup, atomicArgs) &&
@@ -440,6 +450,12 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     if (m_ExternalSource)
       m_ExternalSource->ActAfterCreatingDerivedFnParams(params);
 
+    // If the function is a global kernel, all its parameters reside in the
+    // global memory of the GPU.
+    if (m_DiffReq->hasAttr<CUDAGlobalAttr>())
+      for (auto* param : params)
+        m_CUDAGlobalArgs.emplace(param);
+
     llvm::ArrayRef<ParmVarDecl*> paramsRef =
         clad_compat::makeArrayRef(params.data(), params.size());
     gradientFD->setParams(paramsRef);
@@ -546,6 +562,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     auto derivativeName =
         utils::ComputeEffectiveFnName(m_DiffReq.Function) + "_pullback";
+    for (auto index : m_DiffReq.CUDAGlobalArgsIndexes)
+      derivativeName += "_" + std::to_string(index);
     auto DNI = utils::BuildDeclarationNameInfo(m_Sema, derivativeName);
 
     auto paramTypes = ComputeParamTypes(args);
@@ -587,6 +605,12 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       m_ExternalSource->ActAfterCreatingDerivedFnParams(params);
 
     m_Derivative->setParams(params);
+    // Match the global arguments of the call to the device function to the
+    // pullback function's parameters.
+    if (!m_DiffReq.CUDAGlobalArgsIndexes.empty())
+      for (auto index : m_DiffReq.CUDAGlobalArgsIndexes)
+        m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index));
+
     m_Derivative->setBody(nullptr);
 
     if (!m_DiffReq.DeclarationOnly) {
@@ -1519,7 +1543,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
           BuildArraySubscript(target, forwSweepDerivativeIndices);
       // Create the (target += dfdx) statement.
       if (dfdx()) {
-        if (shouldUseCudaAtomicOps()) {
+        if (shouldUseCudaAtomicOps(target)) {
           Expr* atomicCall = BuildCallToCudaAtomicAdd(result, dfdx());
           // Add it to the body statements.
           addToCurrentBlock(atomicCall, direction::reverse);
@@ -1583,9 +1607,17 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       // FIXME: not sure if this is generic.
       // Don't update derivatives of record types.
       if (!VD->getType()->isRecordType()) {
-        auto* add_assign = BuildOp(BO_AddAssign, it->second, dfdx());
-        // Add it to the body statements.
-        addToCurrentBlock(add_assign, direction::reverse);
+        Expr* base = it->second;
+        if (auto* UO = dyn_cast<UnaryOperator>(it->second))
+          base = UO->getSubExpr()->IgnoreImpCasts();
+        if (shouldUseCudaAtomicOps(base)) {
+          Expr* atomicCall = BuildCallToCudaAtomicAdd(it->second, dfdx());
+          // Add it to the body statements.
+          addToCurrentBlock(atomicCall, direction::reverse);
+        } else {
+          auto* add_assign = BuildOp(BO_AddAssign, it->second, dfdx());
+          addToCurrentBlock(add_assign, direction::reverse);
+        }
       }
     }
     return StmtDiff(clonedDRE, it->second, it->second);
@@ -1728,20 +1760,31 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     for (const Expr* Arg : CE->arguments()) {
       StmtDiff ArgDiff = Visit(Arg, dfdx());
       CallArgs.push_back(ArgDiff.getExpr());
-      DerivedCallArgs.push_back(ArgDiff.getExpr_dx());
+      if (auto* DRE = dyn_cast<DeclRefExpr>(ArgDiff.getExpr())) {
+        // If the arg is used for differentiation of the function, then we
+        // cannot free it in the end as it's the result to be returned to the
+        // user.
+        if (m_ParamVarsWithDiff.find(DRE->getDecl()) ==
+            m_ParamVarsWithDiff.end())
+          DerivedCallArgs.push_back(ArgDiff.getExpr_dx());
+      }
     }
     Expr* call =
         m_Sema
             .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc,
                            llvm::MutableArrayRef<Expr*>(CallArgs), Loc)
             .get();
-    Expr* call_dx =
-        m_Sema
-            .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc,
-                           llvm::MutableArrayRef<Expr*>(DerivedCallArgs), Loc)
-            .get();
     m_DeallocExprs.push_back(call);
-    m_DeallocExprs.push_back(call_dx);
+
+    if (!DerivedCallArgs.empty()) {
+      Expr* call_dx =
+          m_Sema
+              .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc,
+                             llvm::MutableArrayRef<Expr*>(DerivedCallArgs),
+                             Loc)
+              .get();
+      m_DeallocExprs.push_back(call_dx);
+    }
 
     return StmtDiff();
   }
@@ -1887,6 +1930,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     // If it has more args or f_darg0 was not found, we look for its pullback
     // function.
     const auto* MD = dyn_cast<CXXMethodDecl>(FD);
+    std::vector<size_t> globalCallArgs;
     if (!OverloadedDerivedFn) {
       size_t idx = 0;
@@ -1952,12 +1996,23 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
                                               pullback);
 
         // Try to find it in builtin derivatives
+        std::string customPullback =
+            clad::utils::ComputeEffectiveFnName(FD) + "_pullback";
+        // Add the indexes of the global args to the custom pullback name
+        if (!m_CUDAGlobalArgs.empty())
+          for (size_t i = 0; i < pullbackCallArgs.size(); i++)
+            if (auto* DRE = dyn_cast<DeclRefExpr>(pullbackCallArgs[i]))
+              if (auto* param = dyn_cast<ParmVarDecl>(DRE->getDecl()))
+                if (m_CUDAGlobalArgs.find(param) != m_CUDAGlobalArgs.end()) {
+                  customPullback += "_" + std::to_string(i);
+                  globalCallArgs.emplace_back(i);
+                }
+
         if (baseDiff.getExpr())
           pullbackCallArgs.insert(
               pullbackCallArgs.begin(),
               BuildOp(UnaryOperatorKind::UO_AddrOf, baseDiff.getExpr()));
-        std::string customPullback =
-            clad::utils::ComputeEffectiveFnName(FD) + "_pullback";
+
         OverloadedDerivedFn =
             m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
                 customPullback, pullbackCallArgs, getCurrentScope(),
@@ -1990,6 +2045,11 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
           // derive the called function.
           DiffRequest pullbackRequest{};
           pullbackRequest.Function = FD;
+
+          // Mark the indexes of the global args. Necessary if the argument of
+          // the call has a different name than the function's signature
+          // parameter.
+          pullbackRequest.CUDAGlobalArgsIndexes = globalCallArgs;
+
           pullbackRequest.BaseFunctionName =
               clad::utils::ComputeEffectiveFnName(FD);
           pullbackRequest.Mode = DiffMode::experimental_pullback;
@@ -2237,12 +2297,15 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
           ConstantFolder::synthesizeLiteral(m_Context.IntTy, m_Context, i);
       Expr* gradElem = BuildArraySubscript(gradRef, {idx});
       Expr* gradExpr = BuildOp(BO_Mul, dfdx, gradElem);
+      // Inputs were not pointers, so the output args are not in global GPU
+      // memory. Hence, no need to use atomic ops.
       PostCallStmts.push_back(BuildOp(BO_AddAssign, outputArgs[i], gradExpr));
       NumDiffArgs.push_back(args[i]);
     }
     std::string Name = "central_difference";
     return m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
-        Name, NumDiffArgs, getCurrentScope(), /*OriginalFnDC=*/nullptr,
+        Name, NumDiffArgs, getCurrentScope(),
+        /*OriginalFnDC=*/nullptr,
         /*forCustomDerv=*/false, /*namespaceShouldExist=*/false);
   }
@@ -2343,8 +2406,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     else {
       derivedE = BuildOp(UnaryOperatorKind::UO_Deref, diff_dx);
       // Create the (target += dfdx) statement.
-      if (dfdx()) {
-        if (shouldUseCudaAtomicOps()) {
+      if (dfdx() && derivedE) {
+        if (shouldUseCudaAtomicOps(diff_dx)) {
           Expr* atomicCall = BuildCallToCudaAtomicAdd(diff_dx, dfdx());
           // Add it to the body statements.
           addToCurrentBlock(atomicCall, direction::reverse);
@@ -4556,6 +4619,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
           m_Variables[*it] =
              utils::BuildParenExpr(m_Sema, m_Variables[*it]);
         }
+        m_ParamVarsWithDiff.emplace(*it);
       }
     }
   }
diff --git a/lib/Differentiator/VisitorBase.cpp b/lib/Differentiator/VisitorBase.cpp
index b6156397c..63ae3c369 100644
--- a/lib/Differentiator/VisitorBase.cpp
+++ b/lib/Differentiator/VisitorBase.cpp
@@ -783,7 +783,8 @@ namespace clad {
     // Return the found overload.
     std::string Name = "forward_central_difference";
     return m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
-        Name, NumDiffArgs, getCurrentScope(), /*OriginalFnDC=*/nullptr,
+        Name, NumDiffArgs, getCurrentScope(),
+        /*OriginalFnDC=*/nullptr,
         /*forCustomDerv=*/false, /*namespaceShouldExist=*/false);
   }
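A note on the naming scheme introduced above (inferred from the code, for the reader's orientation): the suffix appended to `<fn>_pullback` lists the positions, within the pullback call's argument list, of arguments that refer to parameters residing in GPU global memory. Since a kernel's own parameters and their adjoints are all global, the same device function can be specialized per differentiation request, as in the tests below:

    // device_fn_2(in, val) called from a kernel differentiated w.r.t. 'out, in':
    //   pullback call args:      (in, val, _r_d0, _d_in, &_r0)
    //   global-memory positions:   0   1          3
    //   generated pullback name: device_fn_2_pullback_0_1_3

Encoding the indexes in the name keeps distinct specializations apart while letting a second kernel that makes the same call (dup_kernel_with_device_call_2 below) reuse an already generated pullback instead of regenerating it.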
diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index 171341a7e..a60604fa6 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -288,6 +288,186 @@ __global__ void add_kernel_7(double *a, double *b) {
 //CHECK-NEXT: }
 //CHECK-NEXT:}
 
+__device__ double device_fn(double in, double val) {
+  return in + val;
+}
+
+__global__ void kernel_with_device_call(double *out, double *in, double val) {
+  int index = threadIdx.x;
+  out[index] = device_fn(in[index], val);
+}
+
+// CHECK: void kernel_with_device_call_grad_0_2(double *out, double *in, double val, double *_d_out, double *_d_val) {
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x;
+//CHECK-NEXT:    double _t0 = out[index0];
+//CHECK-NEXT:    out[index0] = device_fn(in[index0], val);
+//CHECK-NEXT:    {
+//CHECK-NEXT:        out[index0] = _t0;
+//CHECK-NEXT:        double _r_d0 = _d_out[index0];
+//CHECK-NEXT:        _d_out[index0] = 0.;
+//CHECK-NEXT:        double _r0 = 0.;
+//CHECK-NEXT:        double _r1 = 0.;
+//CHECK-NEXT:        device_fn_pullback_1(in[index0], val, _r_d0, &_r0, &_r1);
+//CHECK-NEXT:        atomicAdd(_d_val, _r1);
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+__device__ double device_fn_2(double *in, double val) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  return in[index] + val;
+}
+
+__global__ void kernel_with_device_call_2(double *out, double *in, double val) {
+  int index = threadIdx.x;
+  out[index] = device_fn_2(in, val);
+}
+
+__global__ void dup_kernel_with_device_call_2(double *out, double *in, double val) {
+  int index = threadIdx.x;
+  out[index] = device_fn_2(in, val);
+}
+
+// CHECK: void kernel_with_device_call_2_grad_0_2(double *out, double *in, double val, double *_d_out, double *_d_val) {
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x;
+//CHECK-NEXT:    double _t0 = out[index0];
+//CHECK-NEXT:    out[index0] = device_fn_2(in, val);
+//CHECK-NEXT:    {
+//CHECK-NEXT:        out[index0] = _t0;
+//CHECK-NEXT:        double _r_d0 = _d_out[index0];
+//CHECK-NEXT:        _d_out[index0] = 0.;
+//CHECK-NEXT:        double _r0 = 0.;
+//CHECK-NEXT:        device_fn_2_pullback_0_1(in, val, _r_d0, &_r0);
+//CHECK-NEXT:        atomicAdd(_d_val, _r0);
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: void kernel_with_device_call_2_grad_0_1(double *out, double *in, double val, double *_d_out, double *_d_in) {
+//CHECK-NEXT:    double _d_val = 0.;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x;
+//CHECK-NEXT:    double _t0 = out[index0];
+//CHECK-NEXT:    out[index0] = device_fn_2(in, val);
+//CHECK-NEXT:    {
+//CHECK-NEXT:        out[index0] = _t0;
+//CHECK-NEXT:        double _r_d0 = _d_out[index0];
+//CHECK-NEXT:        _d_out[index0] = 0.;
+//CHECK-NEXT:        double _r0 = 0.;
+//CHECK-NEXT:        device_fn_2_pullback_0_1_3(in, val, _r_d0, _d_in, &_r0);
+//CHECK-NEXT:        _d_val += _r0;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+__device__ double device_fn_3(double *in, double *val) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  return in[index] + *val;
+}
+
+__global__ void kernel_with_device_call_3(double *out, double *in, double *val) {
+  int index = threadIdx.x;
+  out[index] = device_fn_3(in, val);
+}
+
+// CHECK: void kernel_with_device_call_3_grad(double *out, double *in, double *val, double *_d_out, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x;
+//CHECK-NEXT:    double _t0 = out[index0];
+//CHECK-NEXT:    out[index0] = device_fn_3(in, val);
+//CHECK-NEXT:    {
+//CHECK-NEXT:        out[index0] = _t0;
+//CHECK-NEXT:        double _r_d0 = _d_out[index0];
+//CHECK-NEXT:        _d_out[index0] = 0.;
+//CHECK-NEXT:        device_fn_3_pullback_0_1_3_4(in, val, _r_d0, _d_in, _d_val);
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+__device__ double device_fn_4(double *in, double val) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  return in[index] + val;
+}
+
+__device__ double device_with_device_call(double *in, double val) {
+  return device_fn_4(in, val);
+}
+
+__global__ void kernel_with_nested_device_call(double *out, double *in, double val) {
+  int index = threadIdx.x;
+  out[index] = device_with_device_call(in, val);
+}
+
+// CHECK: void kernel_with_nested_device_call_grad_0_1(double *out, double *in, double val, double *_d_out, double *_d_in) {
+//CHECK-NEXT:    double _d_val = 0.;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x;
+//CHECK-NEXT:    double _t0 = out[index0];
+//CHECK-NEXT:    out[index0] = device_with_device_call(in, val);
+//CHECK-NEXT:    {
+//CHECK-NEXT:        out[index0] = _t0;
+//CHECK-NEXT:        double _r_d0 = _d_out[index0];
+//CHECK-NEXT:        _d_out[index0] = 0.;
+//CHECK-NEXT:        double _r0 = 0.;
+//CHECK-NEXT:        device_with_device_call_pullback_0_1_3(in, val, _r_d0, _d_in, &_r0);
+//CHECK-NEXT:        _d_val += _r0;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    {
+//CHECK-NEXT:        *_d_in += _d_y;
+//CHECK-NEXT:        *_d_val += _d_y;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_2_pullback_0_1(double *in, double val, double _d_y, double *_d_val) {
+//CHECK-NEXT:    unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT:    unsigned int _t0 = blockDim.x;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT:    *_d_val += _d_y;
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_2_pullback_0_1_3(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT:    unsigned int _t0 = blockDim.x;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT:    {
+//CHECK-NEXT:        atomicAdd(&_d_in[index0], _d_y);
+//CHECK-NEXT:        *_d_val += _d_y;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_3_pullback_0_1_3_4(double *in, double *val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT:    unsigned int _t0 = blockDim.x;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT:    {
+//CHECK-NEXT:        atomicAdd(&_d_in[index0], _d_y);
+//CHECK-NEXT:        atomicAdd(_d_val, _d_y);
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_with_device_call_pullback_0_1_3(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    {
+//CHECK-NEXT:        double _r0 = 0.;
+//CHECK-NEXT:        device_fn_4_pullback_0_1_3(in, val, _d_y, _d_in, &_r0);
+//CHECK-NEXT:        *_d_val += _r0;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_4_pullback_0_1_3(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT:    unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT:    unsigned int _t0 = blockDim.x;
+//CHECK-NEXT:    int _d_index = 0;
+//CHECK-NEXT:    int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT:    {
+//CHECK-NEXT:        atomicAdd(&_d_in[index0], _d_y);
+//CHECK-NEXT:        *_d_val += _d_y;
+//CHECK-NEXT:    }
+//CHECK-NEXT:}
+
 #define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \
 { \
   int *fives = (int*)malloc(N * sizeof(int)); \
@@ -345,9 +525,9 @@ __global__ void add_kernel_7(double *a, double *b) {
   else { \
     test.execute_kernel(grid, block, y, x, dy, dx); \
   } \
-  cudaDeviceSynchronize(); \
   int *res = (int*)malloc(N * sizeof(int)); \
   cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
+  cudaDeviceSynchronize(); \
   for (int i = 0; i < (N - 1); i++) { \
     printf("%d, ", res[i]); \
   } \
@@ -380,9 +560,9 @@ __global__ void add_kernel_7(double *a, double *b) {
   else { \
     test.execute_kernel(grid, block, y, x, N, dy, dx); \
   } \
-  cudaDeviceSynchronize(); \
   int *res = (int*)malloc(N * sizeof(int)); \
   cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
+  cudaDeviceSynchronize(); \
   for (int i = 0; i < (N - 1); i++) { \
     printf("%d, ", res[i]); \
   } \
@@ -415,9 +595,9 @@ __global__ void add_kernel_7(double *a, double *b) {
   else { \
     test.execute_kernel(grid, block, y, x, dy, dx); \
   } \
-  cudaDeviceSynchronize(); \
   double *res = (double*)malloc(N * sizeof(double)); \
   cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \
+  cudaDeviceSynchronize(); \
   for (int i = 0; i < (N - 1); i++) { \
     printf("%0.2f, ", res[i]); \
   } \
@@ -427,6 +607,25 @@ __global__ void add_kernel_7(double *a, double *b) {
   free(res); \
 }
 
+#define INIT(x, y, val, dx, dy, d_val) \
+{ \
+  double *fives = (double*)malloc(10 * sizeof(double)); \
+  for(int i = 0; i < 10; i++) { \
+    fives[i] = 5; \
+  } \
+  double *zeros = (double*)malloc(10 * sizeof(double)); \
+  for(int i = 0; i < 10; i++) { \
+    zeros[i] = 0; \
+  } \
+  cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(val, fives, sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); \
+  free(fives); \
+  free(zeros); \
+}
 
 int main(void) {
   int *a, *d_a;
@@ -472,11 +671,71 @@ int main(void) {
   TEST_2_D(add_kernel_7, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out_double, dummy_in_double, d_out_double, d_in_double, 10); // CHECK-EXEC: 50.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00
 
+  double *val;
+  cudaMalloc(&val, sizeof(double));
+  double *d_val;
+  cudaMalloc(&d_val, sizeof(double));
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto test_device = clad::gradient(kernel_with_device_call, "out, val");
+  test_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
+  double *res = (double*)malloc(10 * sizeof(double));
+  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val");
+  test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
+  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto check_dup = clad::gradient(dup_kernel_with_device_call_2, "out, val"); // check that the pullback function is not regenerated
+  check_dup.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
+  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error
+  printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in");
+  test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
+  cudaDeviceSynchronize();
+  cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
+  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto test_device_4 = clad::gradient(kernel_with_device_call_3);
+  test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, val, d_out_double, d_in_double, d_val);
+  cudaDeviceSynchronize();
+  cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
+  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
+  printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
+
+  INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
+
+  auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in");
+  nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
+  cudaDeviceSynchronize();
+  cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
+  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
+
+  free(res);
   cudaFree(dummy_in_double);
   cudaFree(dummy_out_double);
   cudaFree(d_out_double);
   cudaFree(d_in_double);
-
+  cudaFree(val);
+  cudaFree(d_val);
   return 0;
 }
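Condensed usage sketch (mirrors the tests above, not additional API surface): differentiate a kernel that calls a device function, then launch the generated gradient kernel.

    __device__ double device_fn(double in, double val) { return in + val; }

    __global__ void kernel(double *out, double *in, double val) {
      out[threadIdx.x] = device_fn(in[threadIdx.x], val);
    }

    // Gradient w.r.t. 'out' and 'val'; execute_kernel takes the launch
    // configuration first, then the original and adjoint arguments.
    auto grad = clad::gradient(kernel, "out, val");
    grad.execute_kernel(dim3(1), dim3(10, 1, 1), out, in, 5.0, d_out, d_val);
    cudaDeviceSynchronize();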