diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu
new file mode 100644
index 000000000..3c3617b44
--- /dev/null
+++ b/demos/CUDA/BlackScholes/BlackScholes.cu
@@ -0,0 +1,405 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This sample evaluates fair call and put prices for a
+ * given set of European options by Black-Scholes formula.
+ * See supplied whitepaper for more explanations.
+ */
+
+/*
+ * DISCLAIMER: The following file has been modified slightly to make it
+ * compatible with Clad. The original file can be found at NVIDIA's cuda-samples
+ * repository at GitHub.
+ *
+ * Relevant documentation regarding the problem at hand can be found at NVIDIA's
+ * cuda-samples repository. With the use of Clad, we compute some of the Greeks
+ * (sensitivities) for Black-Scholes and verify them using the
+ * theoretical values as denoted in Wikipedia
+ * (https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model).
+ *
+ * To build and run the demo, run the following command: make run
+ */
+
+#include "clad/Differentiator/Differentiator.h"
+
+#include <helper_cuda.h> // helper functions CUDA error checking and initialization
+#include <helper_functions.h> // helper functions for string parsing
+
+////////////////////////////////////////////////////////////////////////////////
+// Process an array of optN options on CPU
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void BlackScholesCPU(float* h_CallResult, float* h_PutResult,
+                                float* h_StockPrice, float* h_OptionStrike,
+                                float* h_OptionYears, float Riskfree,
+                                float Volatility, int optN);
+extern "C" double CND(double d);
+
+////////////////////////////////////////////////////////////////////////////////
+// Process an array of OptN options on GPU
+////////////////////////////////////////////////////////////////////////////////
+#include "BlackScholes_kernel.cuh"
+
+////////////////////////////////////////////////////////////////////////////////
+// Helper function, returning uniformly distributed
+// random float in [low, high] range
+////////////////////////////////////////////////////////////////////////////////
+float RandFloat(float low, float high) {
+  float t = (float)rand() / (float)RAND_MAX;
+  return (1.0f - t) * low + t * high;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Data configuration
+////////////////////////////////////////////////////////////////////////////////
+const int OPT_N = 4000000;
+const int NUM_ITERATIONS = 512;
+
+const int OPT_SZ = OPT_N * sizeof(float);
+const float RISKFREE = 0.02f;
+const float VOLATILITY = 0.30f;
+
+#define DIV_UP(a, b) (((a) + (b) - 1) / (b))
+
+////////////////////////////////////////////////////////////////////////////////
+// Main program
+////////////////////////////////////////////////////////////////////////////////
+
+void launch(float* h_CallResultCPU, float* h_CallResultGPU,
+            float* h_PutResultCPU, float* h_PutResultGPU, float* h_StockPrice,
+            float* h_OptionStrike, float* h_OptionYears) {
+
+  //'d_' prefix - GPU (device) memory space
+  float
+      // Results calculated by GPU
+      *d_CallResult = nullptr,
+      *d_PutResult = nullptr,
+      // GPU instance of input data
+          *d_StockPrice = nullptr, *d_OptionStrike = nullptr,
+      *d_OptionYears = nullptr;
+
+  cudaMalloc((void**)&d_CallResult, OPT_SZ);
+  cudaMalloc((void**)&d_PutResult, OPT_SZ);
+  cudaMalloc((void**)&d_StockPrice, OPT_SZ);
+  cudaMalloc((void**)&d_OptionStrike, OPT_SZ);
+  cudaMalloc((void**)&d_OptionYears, OPT_SZ);
+
+  // Copy options data to GPU memory for further processing
+  cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice);
+
+  BlackScholesGPU<<<DIV_UP((OPT_N / 2), 128), 128 /*480, 128*/>>>(
+      (float2*)d_CallResult, (float2*)d_PutResult, (float2*)d_StockPrice,
+      (float2*)d_OptionStrike, (float2*)d_OptionYears, RISKFREE, VOLATILITY,
+      OPT_N);
+
+  // Both call and put is calculated
+
+  // Read back GPU results to compare them to CPU results
+  cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost);
+  cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost);
+
+  // Calculate options values on CPU
+  BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike,
+                  h_OptionYears, RISKFREE, VOLATILITY, OPT_N);
+
+  cudaFree(d_OptionYears);
+  cudaFree(d_OptionStrike);
+  cudaFree(d_StockPrice);
+  cudaFree(d_PutResult);
+  cudaFree(d_CallResult);
+}
+
+double d1(double S, double X, double T) {
+  return (log(S / X) + (RISKFREE + 0.5 * VOLATILITY * VOLATILITY) * T) /
+         (VOLATILITY * sqrt(T));
+}
+
+double N_prime(double d) {
+  const double RSQRT2PI =
+      0.39894228040143267793994605993438; // 1 / sqrt(2 * PI)
+  return RSQRT2PI * exp(-0.5 * d * d);
+}
+
+enum Greek { Delta, dX, Theta };
+
+double computeL1norm_Call(float* S, float* X, float* T, float* d, Greek greek) {
+  double delta, ref, sum_delta, sum_ref;
+  sum_delta = 0;
+  sum_ref = 0;
+  switch (greek) {
+  case Delta:
+    for (int i = 0; i < OPT_N; i++) {
+      double d1_val = d1(S[i], X[i], T[i]);
+      ref = CND(d1_val);
+      delta = fabs(d[i] - ref);
+      sum_delta += delta;
+      sum_ref += fabs(ref);
+    }
+    break;
+  case dX:
+    for (int i = 0; i < OPT_N; i++) {
+      double T_val = T[i];
+      double d1_val = d1(S[i], X[i], T_val);
+      double d2_val = d1_val - VOLATILITY * sqrt(T_val);
+      double expRT = exp(-RISKFREE * T_val);
+      ref = -expRT * CND(d2_val);
+      delta = fabs(d[i] - ref);
+      sum_delta += delta;
+      sum_ref += fabs(ref);
+    }
+    break;
+  case Theta:
+    for (int i = 0; i < OPT_N; i++) {
+      double S_val = S[i], X_val = X[i], T_val = T[i];
+      double d1_val = d1(S_val, X_val, T_val);
+      double d2_val = d1_val - VOLATILITY * sqrt(T_val);
+      double expRT = exp(-RISKFREE * T_val);
+      ref =
+          (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) +
+          RISKFREE * X_val * expRT *
+              CND(d2_val); // theta is with respect to t, so -theta is the
+                           // approximation of the derivative with respect to T
+      delta = fabs(d[i] - ref);
+      sum_delta += delta;
+      sum_ref += fabs(ref);
+    }
+  }
+
+  return sum_delta / sum_ref;
+}
+
+double computeL1norm_Put(float* S, float* X, float* T, float* d, Greek greek) {
+  double delta, ref, sum_delta, sum_ref;
+  sum_delta = 0;
+  sum_ref = 0;
+  switch (greek) {
+  case Delta:
+    for (int i = 0; i < OPT_N; i++) {
+      double d1_val = d1(S[i], X[i], T[i]);
+      ref = CND(d1_val) - 1.0;
+      delta = fabs(d[i] - ref);
+      sum_delta += delta;
+      sum_ref += fabs(ref);
+    }
+    break;
+  case dX:
+    for (int i = 0; i < OPT_N; i++) {
+      double T_val = T[i];
+      double d1_val = d1(S[i], X[i], T_val);
+      double d2_val = d1_val - VOLATILITY * sqrt(T_val);
+      double expRT = exp(-RISKFREE * T_val);
+      ref = expRT * CND(-d2_val);
+      delta = fabs(d[i] - ref);
+      sum_delta += delta;
+      sum_ref += fabs(ref);
+    }
+    break;
+  case Theta:
+    for (int i = 0; i < OPT_N; i++) {
+      double S_val = S[i], X_val = X[i], T_val = T[i];
+      double d1_val = d1(S_val, X_val, T_val);
+      double d2_val = d1_val - VOLATILITY * sqrt(T_val);
+      double expRT = exp(-RISKFREE * T_val);
+      ref = (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) -
+            RISKFREE * X_val * expRT * CND(-d2_val);
+      delta = fabs(d[i] - ref);
+      sum_delta += delta;
+      sum_ref += fabs(ref);
+    }
+  }
+
+  return sum_delta / sum_ref;
+}
+
+int main(int argc, char** argv) {
+  float* h_CallResultCPU = (float*)malloc(OPT_SZ);
+  float* h_PutResultCPU = (float*)malloc(OPT_SZ);
+  float* h_CallResultGPU = (float*)malloc(OPT_SZ);
+  float* h_PutResultGPU = (float*)malloc(OPT_SZ);
+  float* h_StockPrice = (float*)malloc(OPT_SZ);
+  float* h_OptionStrike = (float*)malloc(OPT_SZ);
+  float* h_OptionYears = (float*)malloc(OPT_SZ);
+
+  srand(5347);
+
+  // Generate options set
+  for (int i = 0; i < OPT_N; i++) {
+    h_CallResultCPU[i] = 0.0f;
+    h_PutResultCPU[i] = -1.0f;
+    h_StockPrice[i] = RandFloat(5.0f, 30.0f);
+    h_OptionStrike[i] = RandFloat(1.0f, 100.0f);
+    h_OptionYears[i] = RandFloat(0.25f, 10.0f);
+  }
+
+  // Compute gradients
+  auto callGrad = clad::gradient(
+      launch, "h_CallResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears");
+  auto putGrad = clad::gradient(
+      launch, "h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears");
+
+  // Declare and initialize the derivatives
+  float* d_CallResultGPU = (float*)malloc(OPT_SZ);
+  float* d_PutResultGPU = (float*)malloc(OPT_SZ);
+  float* d_StockPrice = (float*)calloc(OPT_N, sizeof(float));
+  float* d_OptionStrike = (float*)calloc(OPT_N, sizeof(float));
+  float* d_OptionYears = (float*)calloc(OPT_N, sizeof(float));
+
+  for (int i = 0; i < OPT_N; i++) {
+    d_CallResultGPU[i] = 1.0f;
+    d_PutResultGPU[i] = 1.0f;
+  }
+
+  // Launch the kernel and the gradient
+
+  // Compute the derivatives of the price of the call options
+  callGrad.execute(h_CallResultCPU, h_CallResultGPU, h_PutResultCPU,
+                   h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears,
+                   d_CallResultGPU, d_StockPrice, d_OptionStrike,
+                   d_OptionYears);
+
+  // Calculate max absolute difference and L1 distance
+  // between CPU and GPU results
+  double delta, ref, sum_delta, sum_ref, L1norm;
+  sum_delta = 0;
+  sum_ref = 0;
+
+  for (int i = 0; i < OPT_N; i++) {
+    ref = h_CallResultCPU[i];
+    delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);
+    sum_delta += delta;
+    sum_ref += fabs(ref);
+  }
+
+  L1norm = sum_delta / sum_ref;
+  printf("L1norm = %E\n", L1norm);
+  if (L1norm > 1e-6) {
+    printf("Original test failed\n");
+    return EXIT_FAILURE;
+  }
+
+  // Verify delta
+  L1norm = computeL1norm_Call(h_StockPrice, h_OptionStrike, h_OptionYears,
+                              d_StockPrice, Delta);
+  printf("L1norm of delta for Call option = %E\n", L1norm);
+  if (L1norm > 1e-5) {
+    printf("Gradient test failed: the difference between the computed and the "
+           "approximated theoretical delta for Call option is larger than "
+           "expected\n");
+    return EXIT_FAILURE;
+  }
+
+  // Verify derivatives with respect to the Strike price
+  L1norm = computeL1norm_Call(h_StockPrice, h_OptionStrike, h_OptionYears,
+                              d_OptionStrike, dX);
+  printf("L1norm of derivative of Call w.r.t. the strike price = %E\n", L1norm);
+  if (L1norm > 1e-5) {
+    printf(
+        "Gradient test failed: the difference between the computed and the "
+        "approximated theoretical derivative of Call w.r.t. the strike price "
+        "is larger than expected\n");
+    return EXIT_FAILURE;
+  }
+
+  // Verify theta
+  L1norm = computeL1norm_Call(h_StockPrice, h_OptionStrike, h_OptionYears,
+                              d_OptionYears, Theta);
+  printf("L1norm of theta for Call option = %E\n", L1norm);
+  if (L1norm > 1e-5) {
+    printf("Gradient test failed: the difference between the computed and the "
+           "approximated theoretical theta for Call option is larger than "
+           "expected\n");
+    return EXIT_FAILURE;
+  }
+
+  // Compute the derivatives of the price of the Put options
+  for (int i = 0; i < OPT_N; i++) {
+    h_CallResultCPU[i] = 0.0f;
+    h_PutResultCPU[i] = -1.0f;
+    d_CallResultGPU[i] = 1.0f;
+    d_PutResultGPU[i] = 1.0f;
+  }
+
+  for (int i = 0; i < OPT_N; i++) {
+    d_StockPrice[i] = 0.f;
+    d_OptionStrike[i] = 0.f;
+    d_OptionYears[i] = 0.f;
+  }
+
+  putGrad.execute(h_CallResultCPU, h_CallResultGPU, h_PutResultCPU,
+                  h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears,
+                  d_PutResultGPU, d_StockPrice, d_OptionStrike, d_OptionYears);
+
+  // Verify delta
+  L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears,
+                             d_StockPrice, Delta);
+  printf("L1norm of delta for Put option = %E\n", L1norm);
+  if (L1norm > 1e-5) {
+    printf("Gradient test failed: the difference between the computed and "
+           "the approximated theoretical delta for Put option is larger than "
+           "expected\n");
+    return EXIT_FAILURE;
+  }
+
+  // Verify derivatives with respect to the Strike price
+  L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears,
+                             d_OptionStrike, dX);
+  printf("L1norm of derivative of Put w.r.t. the strike price = %E\n", L1norm);
+  if (L1norm > 1e-6) {
+    printf("Gradient test failed: the difference between the computed and the "
+           "approximated theoretcial derivative of "
+           "Put w.r.t. the strike price is larger than expected\n");
+    return EXIT_FAILURE;
+  }
+
+  // Verify theta
+  L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears,
+                             d_OptionYears, Theta);
+  printf("L1norm of theta for Put option = %E\n", L1norm);
+  if (L1norm > 1e-5) {
+    printf("Gradient test failed: the difference between the computed and the "
+           "approximated theoretical theta for Put option is larger than "
+           "expected\n");
+    return EXIT_FAILURE;
+  }
+
+  free(h_OptionYears);
+  free(h_OptionStrike);
+  free(h_StockPrice);
+  free(h_PutResultGPU);
+  free(h_CallResultGPU);
+  free(h_PutResultCPU);
+  free(h_CallResultCPU);
+  free(d_OptionYears);
+  free(d_OptionStrike);
+  free(d_StockPrice);
+  free(d_PutResultGPU);
+  free(d_CallResultGPU);
+
+  return EXIT_SUCCESS;
+}
diff --git a/demos/CUDA/BlackScholes/BlackScholes_gold.cpp b/demos/CUDA/BlackScholes/BlackScholes_gold.cpp
new file mode 100644
index 000000000..6b0d9c8ef
--- /dev/null
+++ b/demos/CUDA/BlackScholes/BlackScholes_gold.cpp
@@ -0,0 +1,93 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * DISCLAIMER: The following file has been modified slightly to make it
+ * compatible with Clad. The original file can be found at NVIDIA's cuda-samples
+ * repository at GitHub.
+ */
+
+#include <math.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// Polynomial approximation of cumulative normal distribution function
+////////////////////////////////////////////////////////////////////////////////
+extern "C" double CND(double d) {
+  const double A1 = 0.31938153;
+  const double A2 = -0.356563782;
+  const double A3 = 1.781477937;
+  const double A4 = -1.821255978;
+  const double A5 = 1.330274429;
+  const double RSQRT2PI = 0.39894228040143267793994605993438;
+
+  double K = 1.0 / (1.0 + 0.2316419 * fabs(d));
+
+  double cnd = RSQRT2PI * exp(-0.5 * d * d) *
+               (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
+
+  if (d > 0)
+    cnd = 1.0 - cnd;
+
+  return cnd;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Black-Scholes formula for both call and put
+////////////////////////////////////////////////////////////////////////////////
+static void BlackScholesBodyCPU(float& callResult, float& putResult,
+                                float Sf, // Stock price
+                                float Xf, // Option strike
+                                float Tf, // Option years
+                                float Rf, // Riskless rate
+                                float Vf  // Volatility rate
+) {
+  double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf;
+
+  double sqrtT = sqrt(T);
+  double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
+  double d2 = d1 - V * sqrtT;
+  double CNDD1 = CND(d1);
+  double CNDD2 = CND(d2);
+
+  // Calculate Call and Put simultaneously
+  double expRT = exp(-R * T);
+  callResult = (float)(S * CNDD1 - X * expRT * CNDD2);
+  putResult = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Process an array of optN options
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void BlackScholesCPU(float* h_CallResult, float* h_PutResult,
+                                float* h_StockPrice, float* h_OptionStrike,
+                                float* h_OptionYears, float Riskfree,
+                                float Volatility, int optN) {
+  for (int opt = 0; opt < optN; opt++)
+    BlackScholesBodyCPU(h_CallResult[opt], h_PutResult[opt], h_StockPrice[opt],
+                        h_OptionStrike[opt], h_OptionYears[opt], Riskfree,
+                        Volatility);
+}
diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh
new file mode 100644
index 000000000..26497b8ac
--- /dev/null
+++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh
@@ -0,0 +1,108 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// Polynomial approximation of cumulative normal distribution function
+////////////////////////////////////////////////////////////////////////////////
+__device__ inline float cndGPU(float d) {
+  const float A1 = 0.31938153f;
+  const float A2 = -0.356563782f;
+  const float A3 = 1.781477937f;
+  const float A4 = -1.821255978f;
+  const float A5 = 1.330274429f;
+  const float RSQRT2PI = 0.39894228040143267793994605993438f;
+
+  float K = fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
+
+  float cnd = RSQRT2PI * expf(-0.5f * d * d) *
+              (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
+
+  if (d > 0)
+    cnd = 1.0f - cnd;
+
+  return cnd;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Black-Scholes formula for both call and put
+////////////////////////////////////////////////////////////////////////////////
+__device__ inline void BlackScholesBodyGPU(float& CallResult, float& PutResult,
+                                           float S, // Stock price
+                                           float X, // Option strike
+                                           float T, // Option years
+                                           float R, // Riskless rate
+                                           float V  // Volatility rate
+) {
+  float sqrtT, expRT;
+  float d1, d2, CNDD1, CNDD2;
+
+  sqrtT = fdividef(1.0F, 1.0 / sqrtf(T));
+  d1 = fdividef(logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
+  d2 = d1 - V * sqrtT;
+
+  CNDD1 = cndGPU(d1);
+  CNDD2 = cndGPU(d2);
+
+  // Calculate Call and Put simultaneously
+  expRT = expf(-R * T);
+  CallResult = S * CNDD1 - X * expRT * CNDD2;
+  PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Process an array of optN options on GPU
+////////////////////////////////////////////////////////////////////////////////
+__global__ void BlackScholesGPU(float2* __restrict d_CallResult,
+                                float2* __restrict d_PutResult,
+                                float2* __restrict d_StockPrice,
+                                float2* __restrict d_OptionStrike,
+                                float2* __restrict d_OptionYears,
+                                float Riskfree, float Volatility, int optN) {
+  ////Thread index
+  // const int      tid = blockDim.x * blockIdx.x + threadIdx.x;
+  ////Total number of threads in execution grid
+  // const int THREAD_N = blockDim.x * gridDim.x;
+
+  const int opt = blockDim.x * blockIdx.x + threadIdx.x;
+
+  // Calculating 2 options per thread to increase ILP (instruction level
+  // parallelism)
+  if (opt < (optN / 2)) {
+    float callResult1, callResult2;
+    float putResult1, putResult2;
+    BlackScholesBodyGPU(callResult1, putResult1, d_StockPrice[opt].x,
+                        d_OptionStrike[opt].x, d_OptionYears[opt].x, Riskfree,
+                        Volatility);
+    BlackScholesBodyGPU(callResult2, putResult2, d_StockPrice[opt].y,
+                        d_OptionStrike[opt].y, d_OptionYears[opt].y, Riskfree,
+                        Volatility);
+    d_CallResult[opt].x = callResult1;
+    d_CallResult[opt].y = callResult2;
+    d_PutResult[opt].x = putResult1;
+    d_PutResult[opt].y = putResult2;
+  }
+}
diff --git a/demos/CUDA/BlackScholes/Makefile b/demos/CUDA/BlackScholes/Makefile
new file mode 100644
index 000000000..665939732
--- /dev/null
+++ b/demos/CUDA/BlackScholes/Makefile
@@ -0,0 +1,41 @@
+# Paths and Compiler Settings
+LLVM_PATH = /usr/lib/llvm-17
+CLANG = $(LLVM_PATH)/bin/clang
+CUDA_PATH ?= /usr/local/cuda-11.8
+CLAD_PATH = $(CURDIR)/../../..
+CLAD_PLUGIN = $(CLAD_PATH)/build/./lib/clad.so
+
+# Compiler flags
+CXXFLAGS = -std=c++17 -Xclang -add-plugin -Xclang clad
+CXXFLAGS += -Xclang -load -Xclang $(CLAD_PLUGIN)
+CXXFLAGS += -I$(CLAD_PATH)/include -I$(CURDIR)/helper -I$(CUDA_PATH)/include 
+
+CUDA_FLAGS = --cuda-path=$(CUDA_PATH) --cuda-gpu-arch=sm_60
+
+# Linker flags
+LDFLAGS = -L$(CUDA_PATH)/lib64 -lcudart_static -ldl -lrt -pthread -lm -lstdc++
+
+all: build
+
+.SILENT: build run clean clobber BlackScholes BlackScholes.o BlackScholes_gold.o
+
+build: BlackScholes
+
+BlackScholes.o:BlackScholes.cu
+	$(CLANG) $(CXXFLAGS) -o $@ -c $< $(CUDA_FLAGS)
+
+BlackScholes_gold.o:BlackScholes_gold.cpp
+	$(CLANG) $(CXXFLAGS) -o $@ -c $<
+
+BlackScholes: BlackScholes.o BlackScholes_gold.o
+	$(CLANG) $(CXXFLAGS) -o $@ BlackScholes.o BlackScholes_gold.o $(LDFLAGS)
+
+run: build
+	./BlackScholes
+
+testrun: build
+
+clean:
+	rm -f BlackScholes BlackScholes.o BlackScholes_gold.o
+
+clobber: clean
\ No newline at end of file
diff --git a/demos/CUDA/BlackScholes/helper/helper_cuda.h b/demos/CUDA/BlackScholes/helper/helper_cuda.h
new file mode 100644
index 000000000..6666e8208
--- /dev/null
+++ b/demos/CUDA/BlackScholes/helper/helper_cuda.h
@@ -0,0 +1,951 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <helper_string.h>
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note, it is required that your SDK sample to include the proper header
+// files, please refer the CUDA examples for examples of the needed CUDA
+// headers, which may change depending on which CUDA functions are used.
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char* _cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors
+static const char* _cudaGetErrorEnum(CUresult error) {
+  static char unknown[] = "<unknown>";
+  const char* ret = NULL;
+  cuGetErrorName(error, &ret);
+  return ret ? ret : unknown;
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char* _cudaGetErrorEnum(cublasStatus_t error) {
+  switch (error) {
+  case CUBLAS_STATUS_SUCCESS:
+    return "CUBLAS_STATUS_SUCCESS";
+
+  case CUBLAS_STATUS_NOT_INITIALIZED:
+    return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+  case CUBLAS_STATUS_ALLOC_FAILED:
+    return "CUBLAS_STATUS_ALLOC_FAILED";
+
+  case CUBLAS_STATUS_INVALID_VALUE:
+    return "CUBLAS_STATUS_INVALID_VALUE";
+
+  case CUBLAS_STATUS_ARCH_MISMATCH:
+    return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+  case CUBLAS_STATUS_MAPPING_ERROR:
+    return "CUBLAS_STATUS_MAPPING_ERROR";
+
+  case CUBLAS_STATUS_EXECUTION_FAILED:
+    return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+  case CUBLAS_STATUS_INTERNAL_ERROR:
+    return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+  case CUBLAS_STATUS_NOT_SUPPORTED:
+    return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+  case CUBLAS_STATUS_LICENSE_ERROR:
+    return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char* _cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+  case CUFFT_SUCCESS:
+    return "CUFFT_SUCCESS";
+
+  case CUFFT_INVALID_PLAN:
+    return "CUFFT_INVALID_PLAN";
+
+  case CUFFT_ALLOC_FAILED:
+    return "CUFFT_ALLOC_FAILED";
+
+  case CUFFT_INVALID_TYPE:
+    return "CUFFT_INVALID_TYPE";
+
+  case CUFFT_INVALID_VALUE:
+    return "CUFFT_INVALID_VALUE";
+
+  case CUFFT_INTERNAL_ERROR:
+    return "CUFFT_INTERNAL_ERROR";
+
+  case CUFFT_EXEC_FAILED:
+    return "CUFFT_EXEC_FAILED";
+
+  case CUFFT_SETUP_FAILED:
+    return "CUFFT_SETUP_FAILED";
+
+  case CUFFT_INVALID_SIZE:
+    return "CUFFT_INVALID_SIZE";
+
+  case CUFFT_UNALIGNED_DATA:
+    return "CUFFT_UNALIGNED_DATA";
+
+  case CUFFT_INCOMPLETE_PARAMETER_LIST:
+    return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+  case CUFFT_INVALID_DEVICE:
+    return "CUFFT_INVALID_DEVICE";
+
+  case CUFFT_PARSE_ERROR:
+    return "CUFFT_PARSE_ERROR";
+
+  case CUFFT_NO_WORKSPACE:
+    return "CUFFT_NO_WORKSPACE";
+
+  case CUFFT_NOT_IMPLEMENTED:
+    return "CUFFT_NOT_IMPLEMENTED";
+
+  case CUFFT_LICENSE_ERROR:
+    return "CUFFT_LICENSE_ERROR";
+
+  case CUFFT_NOT_SUPPORTED:
+    return "CUFFT_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char* _cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+  case CUSPARSE_STATUS_SUCCESS:
+    return "CUSPARSE_STATUS_SUCCESS";
+
+  case CUSPARSE_STATUS_NOT_INITIALIZED:
+    return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+  case CUSPARSE_STATUS_ALLOC_FAILED:
+    return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+  case CUSPARSE_STATUS_INVALID_VALUE:
+    return "CUSPARSE_STATUS_INVALID_VALUE";
+
+  case CUSPARSE_STATUS_ARCH_MISMATCH:
+    return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+  case CUSPARSE_STATUS_MAPPING_ERROR:
+    return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+  case CUSPARSE_STATUS_EXECUTION_FAILED:
+    return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+  case CUSPARSE_STATUS_INTERNAL_ERROR:
+    return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+  case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+    return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors
+static const char* _cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+  case CUSOLVER_STATUS_SUCCESS:
+    return "CUSOLVER_STATUS_SUCCESS";
+  case CUSOLVER_STATUS_NOT_INITIALIZED:
+    return "CUSOLVER_STATUS_NOT_INITIALIZED";
+  case CUSOLVER_STATUS_ALLOC_FAILED:
+    return "CUSOLVER_STATUS_ALLOC_FAILED";
+  case CUSOLVER_STATUS_INVALID_VALUE:
+    return "CUSOLVER_STATUS_INVALID_VALUE";
+  case CUSOLVER_STATUS_ARCH_MISMATCH:
+    return "CUSOLVER_STATUS_ARCH_MISMATCH";
+  case CUSOLVER_STATUS_MAPPING_ERROR:
+    return "CUSOLVER_STATUS_MAPPING_ERROR";
+  case CUSOLVER_STATUS_EXECUTION_FAILED:
+    return "CUSOLVER_STATUS_EXECUTION_FAILED";
+  case CUSOLVER_STATUS_INTERNAL_ERROR:
+    return "CUSOLVER_STATUS_INTERNAL_ERROR";
+  case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+    return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  case CUSOLVER_STATUS_NOT_SUPPORTED:
+    return "CUSOLVER_STATUS_NOT_SUPPORTED ";
+  case CUSOLVER_STATUS_ZERO_PIVOT:
+    return "CUSOLVER_STATUS_ZERO_PIVOT";
+  case CUSOLVER_STATUS_INVALID_LICENSE:
+    return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char* _cudaGetErrorEnum(curandStatus_t error) {
+  switch (error) {
+  case CURAND_STATUS_SUCCESS:
+    return "CURAND_STATUS_SUCCESS";
+
+  case CURAND_STATUS_VERSION_MISMATCH:
+    return "CURAND_STATUS_VERSION_MISMATCH";
+
+  case CURAND_STATUS_NOT_INITIALIZED:
+    return "CURAND_STATUS_NOT_INITIALIZED";
+
+  case CURAND_STATUS_ALLOCATION_FAILED:
+    return "CURAND_STATUS_ALLOCATION_FAILED";
+
+  case CURAND_STATUS_TYPE_ERROR:
+    return "CURAND_STATUS_TYPE_ERROR";
+
+  case CURAND_STATUS_OUT_OF_RANGE:
+    return "CURAND_STATUS_OUT_OF_RANGE";
+
+  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+    return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+    return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+  case CURAND_STATUS_LAUNCH_FAILURE:
+    return "CURAND_STATUS_LAUNCH_FAILURE";
+
+  case CURAND_STATUS_PREEXISTING_FAILURE:
+    return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+  case CURAND_STATUS_INITIALIZATION_FAILED:
+    return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+  case CURAND_STATUS_ARCH_MISMATCH:
+    return "CURAND_STATUS_ARCH_MISMATCH";
+
+  case CURAND_STATUS_INTERNAL_ERROR:
+    return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors
+static const char* _cudaGetErrorEnum(nvjpegStatus_t error) {
+  switch (error) {
+  case NVJPEG_STATUS_SUCCESS:
+    return "NVJPEG_STATUS_SUCCESS";
+
+  case NVJPEG_STATUS_NOT_INITIALIZED:
+    return "NVJPEG_STATUS_NOT_INITIALIZED";
+
+  case NVJPEG_STATUS_INVALID_PARAMETER:
+    return "NVJPEG_STATUS_INVALID_PARAMETER";
+
+  case NVJPEG_STATUS_BAD_JPEG:
+    return "NVJPEG_STATUS_BAD_JPEG";
+
+  case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+    return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+
+  case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+    return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+
+  case NVJPEG_STATUS_EXECUTION_FAILED:
+    return "NVJPEG_STATUS_EXECUTION_FAILED";
+
+  case NVJPEG_STATUS_ARCH_MISMATCH:
+    return "NVJPEG_STATUS_ARCH_MISMATCH";
+
+  case NVJPEG_STATUS_INTERNAL_ERROR:
+    return "NVJPEG_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char* _cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+  case NPP_NOT_SUPPORTED_MODE_ERROR:
+    return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+  case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+    return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+  case NPP_RESIZE_NO_OPERATION_ERROR:
+    return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+  case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+    return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+  case NPP_BAD_ARG_ERROR:
+    return "NPP_BAD_ARGUMENT_ERROR";
+
+  case NPP_COEFF_ERROR:
+    return "NPP_COEFFICIENT_ERROR";
+
+  case NPP_RECT_ERROR:
+    return "NPP_RECTANGLE_ERROR";
+
+  case NPP_QUAD_ERROR:
+    return "NPP_QUADRANGLE_ERROR";
+
+  case NPP_MEM_ALLOC_ERR:
+    return "NPP_MEMORY_ALLOCATION_ERROR";
+
+  case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+    return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+  case NPP_INVALID_INPUT:
+    return "NPP_INVALID_INPUT";
+
+  case NPP_POINTER_ERROR:
+    return "NPP_POINTER_ERROR";
+
+  case NPP_WARNING:
+    return "NPP_WARNING";
+
+  case NPP_ODD_ROI_WARNING:
+    return "NPP_ODD_ROI_WARNING";
+#else
+
+  // These are for CUDA 5.5 or higher
+  case NPP_BAD_ARGUMENT_ERROR:
+    return "NPP_BAD_ARGUMENT_ERROR";
+
+  case NPP_COEFFICIENT_ERROR:
+    return "NPP_COEFFICIENT_ERROR";
+
+  case NPP_RECTANGLE_ERROR:
+    return "NPP_RECTANGLE_ERROR";
+
+  case NPP_QUADRANGLE_ERROR:
+    return "NPP_QUADRANGLE_ERROR";
+
+  case NPP_MEMORY_ALLOCATION_ERR:
+    return "NPP_MEMORY_ALLOCATION_ERROR";
+
+  case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+    return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+  case NPP_INVALID_HOST_POINTER_ERROR:
+    return "NPP_INVALID_HOST_POINTER_ERROR";
+
+  case NPP_INVALID_DEVICE_POINTER_ERROR:
+    return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+  case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+    return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+  case NPP_TEXTURE_BIND_ERROR:
+    return "NPP_TEXTURE_BIND_ERROR";
+
+  case NPP_WRONG_INTERSECTION_ROI_ERROR:
+    return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+  case NPP_NOT_EVEN_STEP_ERROR:
+    return "NPP_NOT_EVEN_STEP_ERROR";
+
+  case NPP_INTERPOLATION_ERROR:
+    return "NPP_INTERPOLATION_ERROR";
+
+  case NPP_RESIZE_FACTOR_ERROR:
+    return "NPP_RESIZE_FACTOR_ERROR";
+
+  case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+    return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+  case NPP_MEMFREE_ERR:
+    return "NPP_MEMFREE_ERR";
+
+  case NPP_MEMSET_ERR:
+    return "NPP_MEMSET_ERR";
+
+  case NPP_MEMCPY_ERR:
+    return "NPP_MEMCPY_ERROR";
+
+  case NPP_MIRROR_FLIP_ERR:
+    return "NPP_MIRROR_FLIP_ERR";
+#else
+
+  case NPP_MEMFREE_ERROR:
+    return "NPP_MEMFREE_ERROR";
+
+  case NPP_MEMSET_ERROR:
+    return "NPP_MEMSET_ERROR";
+
+  case NPP_MEMCPY_ERROR:
+    return "NPP_MEMCPY_ERROR";
+
+  case NPP_MIRROR_FLIP_ERROR:
+    return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+  case NPP_ALIGNMENT_ERROR:
+    return "NPP_ALIGNMENT_ERROR";
+
+  case NPP_STEP_ERROR:
+    return "NPP_STEP_ERROR";
+
+  case NPP_SIZE_ERROR:
+    return "NPP_SIZE_ERROR";
+
+  case NPP_NULL_POINTER_ERROR:
+    return "NPP_NULL_POINTER_ERROR";
+
+  case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+    return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+  case NPP_NOT_IMPLEMENTED_ERROR:
+    return "NPP_NOT_IMPLEMENTED_ERROR";
+
+  case NPP_ERROR:
+    return "NPP_ERROR";
+
+  case NPP_SUCCESS:
+    return "NPP_SUCCESS";
+
+  case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+    return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+  case NPP_MISALIGNED_DST_ROI_WARNING:
+    return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+  case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+    return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+  case NPP_DOUBLE_SIZE_WARNING:
+    return "NPP_DOUBLE_SIZE_WARNING";
+
+  case NPP_WRONG_INTERSECTION_ROI_WARNING:
+    return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+  /* These are 6.0 or higher */
+  case NPP_LUT_PALETTE_BITSIZE_ERROR:
+    return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+  case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+    return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+  case NPP_QUALITY_INDEX_ERROR:
+    return "NPP_QUALITY_INDEX_ERROR";
+
+  case NPP_CHANNEL_ORDER_ERROR:
+    return "NPP_CHANNEL_ORDER_ERROR";
+
+  case NPP_ZERO_MASK_VALUE_ERROR:
+    return "NPP_ZERO_MASK_VALUE_ERROR";
+
+  case NPP_NUMBER_OF_CHANNELS_ERROR:
+    return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+  case NPP_COI_ERROR:
+    return "NPP_COI_ERROR";
+
+  case NPP_DIVISOR_ERROR:
+    return "NPP_DIVISOR_ERROR";
+
+  case NPP_CHANNEL_ERROR:
+    return "NPP_CHANNEL_ERROR";
+
+  case NPP_STRIDE_ERROR:
+    return "NPP_STRIDE_ERROR";
+
+  case NPP_ANCHOR_ERROR:
+    return "NPP_ANCHOR_ERROR";
+
+  case NPP_MASK_SIZE_ERROR:
+    return "NPP_MASK_SIZE_ERROR";
+
+  case NPP_MOMENT_00_ZERO_ERROR:
+    return "NPP_MOMENT_00_ZERO_ERROR";
+
+  case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+    return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+  case NPP_THRESHOLD_ERROR:
+    return "NPP_THRESHOLD_ERROR";
+
+  case NPP_CONTEXT_MATCH_ERROR:
+    return "NPP_CONTEXT_MATCH_ERROR";
+
+  case NPP_FFT_FLAG_ERROR:
+    return "NPP_FFT_FLAG_ERROR";
+
+  case NPP_FFT_ORDER_ERROR:
+    return "NPP_FFT_ORDER_ERROR";
+
+  case NPP_SCALE_RANGE_ERROR:
+    return "NPP_SCALE_RANGE_ERROR";
+
+  case NPP_DATA_TYPE_ERROR:
+    return "NPP_DATA_TYPE_ERROR";
+
+  case NPP_OUT_OFF_RANGE_ERROR:
+    return "NPP_OUT_OFF_RANGE_ERROR";
+
+  case NPP_DIVIDE_BY_ZERO_ERROR:
+    return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+  case NPP_RANGE_ERROR:
+    return "NPP_RANGE_ERROR";
+
+  case NPP_NO_MEMORY_ERROR:
+    return "NPP_NO_MEMORY_ERROR";
+
+  case NPP_ERROR_RESERVED:
+    return "NPP_ERROR_RESERVED";
+
+  case NPP_NO_OPERATION_WARNING:
+    return "NPP_NO_OPERATION_WARNING";
+
+  case NPP_DIVIDE_BY_ZERO_WARNING:
+    return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+  /* These are 7.0 or higher */
+  case NPP_OVERFLOW_ERROR:
+    return "NPP_OVERFLOW_ERROR";
+
+  case NPP_CORRUPTED_DATA_ERROR:
+    return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  return "<unknown>";
+}
+#endif
+
+template <typename T>
+void check(T result, const char* const func, const char* const file,
+           const int line) {
+  if (result) {
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char* errorMessage, const char* file,
+                               const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// This will only print the proper error string when calling cudaGetLastError
+// but not exit program incase error detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char* errorMessage, const char* file,
+                                 const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+  }
+}
+#endif
+
+#ifndef MAX
+#define MAX(a, b) (a > b ? a : b)
+#endif
+
+// Float To Int conversion
+inline int ftoi(float value) {
+  return (value >= 0 ? static_cast<int>(value + 0.5)
+                     : static_cast<int>(value - 0.5));
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM
+  typedef struct {
+    int SM; // 0xMm (hexidecimal notation), M = SM Major version,
+    // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128},
+      {0x52, 128}, {0x53, 128}, {0x60, 64},  {0x61, 128}, {0x62, 128},
+      {0x70, 64},  {0x72, 64},  {0x75, 64},  {0x80, 64},  {0x86, 128},
+      {0x87, 128}, {0x89, 128}, {0x90, 128}, {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1) {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+      return nGpuArchCoresPerSM[index].Cores;
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf("MapSMtoCores for SM %d.%d is undefined."
+         "  Default to use %d Cores/SM\n",
+         major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the GPU Arch name)
+  typedef struct {
+    int SM; // 0xMm (hexidecimal notation), M = SM Major version,
+    // and m = SM minor version
+    const char* name;
+  } sSMtoArchName;
+
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},       {0x32, "Kepler"},  {0x35, "Kepler"},
+      {0x37, "Kepler"},       {0x50, "Maxwell"}, {0x52, "Maxwell"},
+      {0x53, "Maxwell"},      {0x60, "Pascal"},  {0x61, "Pascal"},
+      {0x62, "Pascal"},       {0x70, "Volta"},   {0x72, "Xavier"},
+      {0x75, "Turing"},       {0x80, "Ampere"},  {0x86, "Ampere"},
+      {0x87, "Ampere"},       {0x89, "Ada"},     {0x90, "Hopper"},
+      {-1, "Graphics Device"}};
+
+  int index = 0;
+
+  while (nGpuArchNameSM[index].SM != -1) {
+    if (nGpuArchNameSM[index].SM == ((major << 4) + minor))
+      return nGpuArchNameSM[index].name;
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf("MapSMtoArchName for SM %d.%d is undefined."
+         "  Default to use %s\n",
+         major, minor, nGpuArchNameSM[index - 1].name);
+  return nGpuArchNameSM[index - 1].name;
+}
+// end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInit(int devID) {
+  int device_count;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "gpuDeviceInit() CUDA error: "
+                    "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (devID < 0)
+    devID = 0;
+
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    return -devID;
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+  if (computeMode == cudaComputeModeProhibited) {
+    fprintf(stderr, "Error: device is running in <Compute Mode "
+                    "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  checkCudaErrors(cudaSetDevice(devID));
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID,
+         _ConvertSMVer2ArchName(major, minor));
+
+  return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS)
+inline int gpuGetMaxGflopsDeviceId() {
+  int current_device = 0, sm_per_multiproc = 0;
+  int max_perf_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  uint64_t max_compute_perf = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:"
+                    " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the best CUDA capable GPU device
+  current_device = 0;
+
+  while (current_device < device_count) {
+    int computeMode = -1, major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode,
+                                           current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(
+        &major, cudaDevAttrComputeCapabilityMajor, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(
+        &minor, cudaDevAttrComputeCapabilityMinor, current_device));
+
+    // If this GPU is not running on Compute Mode prohibited,
+    // then we can add it to the list
+    if (computeMode != cudaComputeModeProhibited) {
+      if (major == 9999 && minor == 9999)
+        sm_per_multiproc = 1;
+      else
+        sm_per_multiproc = _ConvertSMVer2Cores(major, minor);
+      int multiProcessorCount = 0, clockRate = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount,
+                                             cudaDevAttrMultiProcessorCount,
+                                             current_device));
+      cudaError_t result = cudaDeviceGetAttribute(
+          &clockRate, cudaDevAttrClockRate, current_device);
+      if (result != cudaSuccess) {
+        // If cudaDevAttrClockRate attribute is not supported we
+        // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
+        if (result == cudaErrorInvalidValue) {
+          clockRate = 1;
+        } else {
+          fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__,
+                  __LINE__, static_cast<unsigned int>(result),
+                  _cudaGetErrorEnum(result));
+          exit(EXIT_FAILURE);
+        }
+      }
+      uint64_t compute_perf =
+          (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
+
+      if (compute_perf > max_compute_perf) {
+        max_compute_perf = compute_perf;
+        max_perf_device = current_device;
+      }
+    } else {
+      devices_prohibited++;
+    }
+
+    ++current_device;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:"
+                    " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return max_perf_device;
+}
+
+// Initialization code to find the best CUDA Device
+inline int findCudaDevice(int argc, const char** argv) {
+  int devID = 0;
+
+  // If the command-line has a device number specified, use it
+  if (checkCmdLineFlag(argc, argv, "device")) {
+    devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+    if (devID < 0) {
+      printf("Invalid command line parameter\n ");
+      exit(EXIT_FAILURE);
+    } else {
+      devID = gpuDeviceInit(devID);
+
+      if (devID < 0) {
+        printf("exiting...\n");
+        exit(EXIT_FAILURE);
+      }
+    }
+  } else {
+    // Otherwise pick the device with highest Gflops/s
+    devID = gpuGetMaxGflopsDeviceId();
+    checkCudaErrors(cudaSetDevice(devID));
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(
+        &major, cudaDevAttrComputeCapabilityMajor, devID));
+    checkCudaErrors(cudaDeviceGetAttribute(
+        &minor, cudaDevAttrComputeCapabilityMinor, devID));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID,
+           _ConvertSMVer2ArchName(major, minor), major, minor);
+  }
+
+  return devID;
+}
+
+inline int findIntegratedGPU() {
+  int current_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the integrated GPU which is compute capable
+  while (current_device < device_count) {
+    int computeMode = -1, integrated = -1;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode,
+                                           current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated,
+                                           current_device));
+    // If GPU is integrated and is not running on Compute Mode prohibited,
+    // then cuda can map to GLES resource
+    if (integrated && (computeMode != cudaComputeModeProhibited)) {
+      checkCudaErrors(cudaSetDevice(current_device));
+
+      int major = 0, minor = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(
+          &major, cudaDevAttrComputeCapabilityMajor, current_device));
+      checkCudaErrors(cudaDeviceGetAttribute(
+          &minor, cudaDevAttrComputeCapabilityMinor, current_device));
+      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+             current_device, _ConvertSMVer2ArchName(major, minor), major,
+             minor);
+
+      return current_device;
+    } else {
+      devices_prohibited++;
+    }
+
+    current_device++;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr, "CUDA error:"
+                    " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  int major = 0, minor = 0;
+
+  checkCudaErrors(cudaGetDevice(&dev));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+  if ((major > major_version) ||
+      (major == major_version && minor >= minor_version)) {
+    printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+           _ConvertSMVer2ArchName(major, minor), major, minor);
+    return true;
+  } else {
+    printf("  No GPU device was found that can support "
+           "CUDA compute capability %d.%d.\n",
+           major_version, minor_version);
+    return false;
+  }
+}
+#endif
+
+// end of CUDA Helper Functions
+
+#endif // COMMON_HELPER_CUDA_H_
diff --git a/demos/CUDA/BlackScholes/helper/helper_functions.h b/demos/CUDA/BlackScholes/helper/helper_functions.h
new file mode 100644
index 000000000..bd40ba43e
--- /dev/null
+++ b/demos/CUDA/BlackScholes/helper/helper_functions.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing,
+// timers, image helpers, etc)
+#ifndef COMMON_HELPER_FUNCTIONS_H_
+#define COMMON_HELPER_FUNCTIONS_H_
+
+#ifdef WIN32
+#pragma warning(disable : 4996)
+#endif
+
+// includes, project
+#include <assert.h>
+#include <exception>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// includes, timer, string parsing, image helpers
+#include <helper_image.h> // helper functions for image compare, dump, data comparisons
+#include <helper_string.h> // helper functions for string parsing
+#include <helper_timer.h>  // helper functions for timers
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif // COMMON_HELPER_FUNCTIONS_H_
diff --git a/demos/CUDA/BlackScholes/helper/helper_image.h b/demos/CUDA/BlackScholes/helper/helper_image.h
new file mode 100644
index 000000000..33fbf1b62
--- /dev/null
+++ b/demos/CUDA/BlackScholes/helper/helper_image.h
@@ -0,0 +1,961 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (image,bitmap)
+#ifndef COMMON_HELPER_IMAGE_H_
+#define COMMON_HELPER_IMAGE_H_
+
+#include <assert.h>
+#include <exception>
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#ifndef MIN
+#define MIN(a, b) ((a < b) ? a : b)
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a > b) ? a : b)
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#include <helper_string.h>
+
+// namespace unnamed (internal)
+namespace helper_image_internal {
+//! size of PGM file header
+const unsigned int PGMHeaderSize = 0x40;
+
+// types
+
+//! Data converter from unsigned char / unsigned byte to type T
+template <class T> struct ConverterFromUByte;
+
+//! Data converter from unsigned char / unsigned byte
+template <> struct ConverterFromUByte<unsigned char> {
+  //! Conversion operator
+  //! @return converted value
+  //! @param  val  value to convert
+  float operator()(const unsigned char& val) {
+    return static_cast<unsigned char>(val);
+  }
+};
+
+//! Data converter from unsigned char / unsigned byte to float
+template <> struct ConverterFromUByte<float> {
+  //! Conversion operator
+  //! @return converted value
+  //! @param  val  value to convert
+  float operator()(const unsigned char& val) {
+    return static_cast<float>(val) / 255.0f;
+  }
+};
+
+//! Data converter from unsigned char / unsigned byte to type T
+template <class T> struct ConverterToUByte;
+
+//! Data converter from unsigned char / unsigned byte to unsigned int
+template <> struct ConverterToUByte<unsigned char> {
+  //! Conversion operator (essentially a passthru
+  //! @return converted value
+  //! @param  val  value to convert
+  unsigned char operator()(const unsigned char& val) { return val; }
+};
+
+//! Data converter from unsigned char / unsigned byte to unsigned int
+template <> struct ConverterToUByte<float> {
+  //! Conversion operator
+  //! @return converted value
+  //! @param  val  value to convert
+  unsigned char operator()(const float& val) {
+    return static_cast<unsigned char>(val * 255.0f);
+  }
+};
+} // namespace helper_image_internal
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#else
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#endif
+
+inline bool __loadPPM(const char* file, unsigned char** data, unsigned int* w,
+                      unsigned int* h, unsigned int* channels) {
+  FILE* fp = NULL;
+
+  if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) {
+    std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
+    return false;
+  }
+
+  // check header
+  char header[helper_image_internal::PGMHeaderSize];
+
+  if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+    std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
+    return false;
+  }
+
+  if (strncmp(header, "P5", 2) == 0) {
+    *channels = 1;
+  } else if (strncmp(header, "P6", 2) == 0) {
+    *channels = 3;
+  } else {
+    std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
+    *channels = 0;
+    return false;
+  }
+
+  // parse header, read maxval, width and height
+  unsigned int width = 0;
+  unsigned int height = 0;
+  unsigned int maxval = 0;
+  unsigned int i = 0;
+
+  while (i < 3) {
+    if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+      std::cerr << "__LoadPPM() : reading PGM header returned NULL"
+                << std::endl;
+      return false;
+    }
+
+    if (header[0] == '#')
+      continue;
+
+    if (i == 0)
+      i += SSCANF(header, "%u %u %u", &width, &height, &maxval);
+    else if (i == 1)
+      i += SSCANF(header, "%u %u", &height, &maxval);
+    else if (i == 2)
+      i += SSCANF(header, "%u", &maxval);
+  }
+
+  // check if given handle for the data is initialized
+  if (NULL != *data) {
+    if (*w != width || *h != height)
+      std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
+  } else {
+    *data = (unsigned char*)malloc(sizeof(unsigned char) * width * height *
+                                   *channels);
+    *w = width;
+    *h = height;
+  }
+
+  // read and close file
+  if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == 0)
+    std::cerr << "__LoadPPM() read data returned error." << std::endl;
+
+  fclose(fp);
+
+  return true;
+}
+
+template <class T>
+inline bool sdkLoadPGM(const char* file, T** data, unsigned int* w,
+                       unsigned int* h) {
+  unsigned char* idata = NULL;
+  unsigned int channels;
+
+  if (true != __loadPPM(file, &idata, w, h, &channels))
+    return false;
+
+  unsigned int size = *w * *h * channels;
+
+  // initialize mem if necessary
+  // the correct size is checked / set in loadPGMc()
+  if (NULL == *data)
+    *data = reinterpret_cast<T*>(malloc(sizeof(T) * size));
+
+  // copy and cast data
+  std::transform(idata, idata + size, *data,
+                 helper_image_internal::ConverterFromUByte<T>());
+
+  free(idata);
+
+  return true;
+}
+
+template <class T>
+inline bool sdkLoadPPM4(const char* file, T** data, unsigned int* w,
+                        unsigned int* h) {
+  unsigned char* idata = 0;
+  unsigned int channels;
+
+  if (__loadPPM(file, &idata, w, h, &channels)) {
+    // pad 4th component
+    int size = *w * *h;
+    // keep the original pointer
+    unsigned char* idata_orig = idata;
+    *data = reinterpret_cast<T*>(malloc(sizeof(T) * size * 4));
+    unsigned char* ptr = *data;
+
+    for (int i = 0; i < size; i++) {
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = 0;
+    }
+
+    free(idata_orig);
+    return true;
+  } else {
+    free(idata);
+    return false;
+  }
+}
+
+inline bool __savePPM(const char* file, unsigned char* data, unsigned int w,
+                      unsigned int h, unsigned int channels) {
+  assert(NULL != data);
+  assert(w > 0);
+  assert(h > 0);
+
+  std::fstream fh(file, std::fstream::out | std::fstream::binary);
+
+  if (fh.bad()) {
+    std::cerr << "__savePPM() : Opening file failed." << std::endl;
+    return false;
+  }
+
+  if (channels == 1) {
+    fh << "P5\n";
+  } else if (channels == 3) {
+    fh << "P6\n";
+  } else {
+    std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
+    return false;
+  }
+
+  fh << w << "\n" << h << "\n" << 0xff << std::endl;
+
+  for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i)
+    fh << data[i];
+
+  fh.flush();
+
+  if (fh.bad()) {
+    std::cerr << "__savePPM() : Writing data failed." << std::endl;
+    return false;
+  }
+
+  fh.close();
+
+  return true;
+}
+
+template <class T>
+inline bool sdkSavePGM(const char* file, T* data, unsigned int w,
+                       unsigned int h) {
+  unsigned int size = w * h;
+  unsigned char* idata = (unsigned char*)malloc(sizeof(unsigned char) * size);
+
+  std::transform(data, data + size, idata,
+                 helper_image_internal::ConverterToUByte<T>());
+
+  // write file
+  bool result = __savePPM(file, idata, w, h, 1);
+
+  // cleanup
+  free(idata);
+
+  return result;
+}
+
+inline bool sdkSavePPM4ub(const char* file, unsigned char* data, unsigned int w,
+                          unsigned int h) {
+  // strip 4th component
+  int size = w * h;
+  unsigned char* ndata =
+      (unsigned char*)malloc(sizeof(unsigned char) * size * 3);
+  unsigned char* ptr = ndata;
+
+  for (int i = 0; i < size; i++) {
+    *ptr++ = *data++;
+    *ptr++ = *data++;
+    *ptr++ = *data++;
+    data++;
+  }
+
+  bool result = __savePPM(file, ndata, w, h, 3);
+  free(ndata);
+  return result;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read file \filename and return the data
+//! @return bool if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFile(const char* filename, T** data, unsigned int* len,
+                        bool verbose) {
+  // check input arguments
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  // intermediate storage for the data read
+  std::vector<T> data_read;
+
+  // open file for reading
+  FILE* fh = NULL;
+
+  // check if filestream is valid
+  if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) {
+    printf("Unable to open input file: %s\n", filename);
+    return false;
+  }
+
+  // read all data elements
+  T token;
+
+  while (!feof(fh)) {
+    fscanf(fh, "%f", &token);
+    data_read.push_back(token);
+  }
+
+  // the last element is read twice
+  data_read.pop_back();
+  fclose(fh);
+
+  // check if the given handle is already initialized
+  if (NULL != *data) {
+    if (*len != data_read.size()) {
+      std::cerr << "sdkReadFile() : Initialized memory given but "
+                << "size  mismatch with signal read "
+                << "(data read / data init = " << (unsigned int)data_read.size()
+                << " / " << *len << ")" << std::endl;
+
+      return false;
+    }
+  } else {
+    // allocate storage for the data read
+    *data = reinterpret_cast<T*>(malloc(sizeof(T) * data_read.size()));
+    // store signal size
+    *len = static_cast<unsigned int>(data_read.size());
+  }
+
+  // copy data
+  memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read file \filename and return the data
+//! @return bool if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFileBlocks(const char* filename, T** data, unsigned int* len,
+                              unsigned int block_num, unsigned int block_size,
+                              bool verbose) {
+  // check input arguments
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  // open file for reading
+  FILE* fh = fopen(filename, "rb");
+
+  if (fh == NULL && verbose) {
+    std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
+    return false;
+  }
+
+  // check if the given handle is already initialized
+  // allocate storage for the data read
+  data[block_num] = reinterpret_cast<T*>(malloc(block_size));
+
+  // read all data elements
+  fseek(fh, block_num * block_size, SEEK_SET);
+  *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh);
+
+  fclose(fh);
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename
+//! @return true if writing the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool sdkWriteFile(const char* filename, const T* data, unsigned int len,
+                         const S epsilon, bool verbose, bool append = false) {
+  assert(NULL != filename);
+  assert(NULL != data);
+
+  // open file for writing
+  //    if (append) {
+  std::fstream fh(filename, std::fstream::out | std::fstream::ate);
+
+  if (verbose) {
+    std::cerr << "sdkWriteFile() : Open file " << filename
+              << " for write/append." << std::endl;
+  }
+
+  /*    } else {
+          std::fstream fh(filename, std::fstream::out);
+          if (verbose) {
+              std::cerr << "sdkWriteFile() : Open file " << filename << " for
+     write." << std::endl;
+          }
+      }
+  */
+
+  // check if filestream is valid
+  if (!fh.good()) {
+    if (verbose)
+      std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
+
+    return false;
+  }
+
+  // first write epsilon
+  fh << "# " << epsilon << "\n";
+
+  // write data
+  for (unsigned int i = 0; (i < len) && (fh.good()); ++i)
+    fh << data[i] << ' ';
+
+  // Check if writing succeeded
+  if (!fh.good()) {
+    if (verbose)
+      std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
+
+    return false;
+  }
+
+  // file ends with nl
+  fh << std::endl;
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! @return  true if \a reference and \a data are identical, otherwise false
+//! @param reference  timer_interface to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareData(const T* reference, const T* data,
+                        const unsigned int len, const S epsilon,
+                        const float threshold) {
+  assert(epsilon >= 0);
+
+  bool result = true;
+  unsigned int error_count = 0;
+
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff = static_cast<float>(reference[i]) - static_cast<float>(data[i]);
+    bool comp = (diff <= epsilon) && (diff >= -epsilon);
+    result &= comp;
+
+    error_count += !comp;
+
+#if 0
+
+    if (!comp) {
+      std::cerr << "ERROR, i = " << i << ",\t "
+                << reference[i] << " / "
+                << data[i]
+                << " (reference / data)\n";
+    }
+
+#endif
+  }
+
+  if (threshold == 0.0f) {
+    return (result) ? true : false;
+  } else {
+    if (error_count) {
+      printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+             static_cast<float>(error_count) * 100 / static_cast<float>(len),
+             error_count);
+    }
+
+    return (len * threshold > error_count) ? true : false;
+  }
+}
+
+#ifndef __MIN_EPSILON_ERROR
+#define __MIN_EPSILON_ERROR 1e-3f
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! @return  true if \a reference and \a data are identical, otherwise false
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param epsilon    threshold % of (# of bytes) for pass/fail
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareDataAsFloatThreshold(const T* reference, const T* data,
+                                        const unsigned int len, const S epsilon,
+                                        const float threshold) {
+  assert(epsilon >= 0);
+
+  // If we set epsilon to be 0, let's set a minimum threshold
+  float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR);
+  int error_count = 0;
+  bool result = true;
+
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff =
+        fabs(static_cast<float>(reference[i]) - static_cast<float>(data[i]));
+    bool comp = (diff < max_error);
+    result &= comp;
+
+    if (!comp)
+      error_count++;
+  }
+
+  if (threshold == 0.0f) {
+    if (error_count)
+      printf("total # of errors = %d\n", error_count);
+
+    return (error_count == 0) ? true : false;
+  } else {
+    if (error_count) {
+      printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+             static_cast<float>(error_count) * 100 / static_cast<float>(len),
+             error_count);
+    }
+
+    return ((len * threshold > error_count) ? true : false);
+  }
+}
+
+inline void sdkDumpBin(void* data, unsigned int bytes, const char* filename) {
+  printf("sdkDumpBin: <%s>\n", filename);
+  FILE* fp;
+  FOPEN(fp, filename, "wb");
+  fwrite(data, bytes, 1, fp);
+  fflush(fp);
+  fclose(fp);
+}
+
+inline bool sdkCompareBin2BinUint(const char* src_file, const char* ref_file,
+                                  unsigned int nelements, const float epsilon,
+                                  const float threshold, char* exec_path) {
+  unsigned int *src_buffer, *ref_buffer;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+
+  uint64_t error_count = 0;
+  size_t fsize = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n",
+           src_file);
+    error_count++;
+  }
+
+  char* ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <unsigned int>  unable to find <%s> in <%s>\n",
+           ref_file, exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           ref_file);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+
+    if (src_fp)
+      fclose(src_fp);
+
+    if (ref_fp)
+      fclose(ref_fp);
+  } else {
+    if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+      printf("compareBin2Bin <unsigned int>"
+             " unable to open ref_file: %s\n",
+             ref_file_path);
+      error_count++;
+    }
+
+    if (src_fp && ref_fp) {
+      src_buffer = (unsigned int*)malloc(nelements * sizeof(unsigned int));
+      ref_buffer = (unsigned int*)malloc(nelements * sizeof(unsigned int));
+
+      fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp);
+      fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp);
+
+      printf("> compareBin2Bin <unsigned int> nelements=%d,"
+             " epsilon=%4.2f, threshold=%4.2f\n",
+             nelements, epsilon, threshold);
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(fsize));
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(fsize));
+
+      if (!compareData<unsigned int, float>(ref_buffer, src_buffer, nelements,
+                                            epsilon, threshold))
+        error_count++;
+
+      fclose(src_fp);
+      fclose(ref_fp);
+
+      free(src_buffer);
+      free(ref_buffer);
+    } else {
+      if (src_fp)
+        fclose(src_fp);
+
+      if (ref_fp)
+        fclose(ref_fp);
+    }
+  }
+
+  if (error_count == 0)
+    printf("  OK\n");
+  else
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+
+  return (error_count == 0); // returns true if all pixels pass
+}
+
+inline bool sdkCompareBin2BinFloat(const char* src_file, const char* ref_file,
+                                   unsigned int nelements, const float epsilon,
+                                   const float threshold, char* exec_path) {
+  float *src_buffer = NULL, *ref_buffer = NULL;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+  size_t fsize = 0;
+
+  uint64_t error_count = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
+    error_count = 1;
+  }
+
+  char* ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file,
+           exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           exec_path);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+
+    if (src_fp)
+      fclose(src_fp);
+
+    if (ref_fp)
+      fclose(ref_fp);
+  } else {
+    if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+      printf("compareBin2Bin <float> unable to open ref_file: %s\n",
+             ref_file_path);
+      error_count = 1;
+    }
+
+    if (src_fp && ref_fp) {
+      src_buffer = reinterpret_cast<float*>(malloc(nelements * sizeof(float)));
+      ref_buffer = reinterpret_cast<float*>(malloc(nelements * sizeof(float)));
+
+      printf("> compareBin2Bin <float> nelements=%d, epsilon=%4.2f,"
+             " threshold=%4.2f\n",
+             nelements, epsilon, threshold);
+      fsize = fread(src_buffer, sizeof(float), nelements, src_fp);
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(fsize * sizeof(float)));
+      fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp);
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(fsize * sizeof(float)));
+
+      if (!compareDataAsFloatThreshold<float, float>(
+              ref_buffer, src_buffer, nelements, epsilon, threshold))
+        error_count++;
+
+      fclose(src_fp);
+      fclose(ref_fp);
+
+      free(src_buffer);
+      free(ref_buffer);
+    } else {
+      if (src_fp)
+        fclose(src_fp);
+
+      if (ref_fp)
+        fclose(ref_fp);
+    }
+  }
+
+  if (error_count == 0)
+    printf("  OK\n");
+  else
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+
+  return (error_count == 0); // returns true if all pixels pass
+}
+
+inline bool sdkCompareL2fe(const float* reference, const float* data,
+                           const unsigned int len, const float epsilon) {
+  assert(epsilon >= 0);
+
+  float error = 0;
+  float ref = 0;
+
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff = reference[i] - data[i];
+    error += diff * diff;
+    ref += reference[i] * reference[i];
+  }
+
+  float normRef = sqrtf(ref);
+
+  if (fabs(ref) < 1e-7) {
+#ifdef _DEBUG
+    std::cerr << "ERROR, reference l2-norm is 0\n";
+#endif
+    return false;
+  }
+
+  float normError = sqrtf(error);
+  error = normError / normRef;
+  bool result = error < epsilon;
+#ifdef _DEBUG
+
+  if (!result) {
+    std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon "
+              << epsilon << "\n";
+  }
+
+#endif
+
+  return result;
+}
+
+inline bool sdkLoadPPMub(const char* file, unsigned char** data,
+                         unsigned int* w, unsigned int* h) {
+  unsigned int channels;
+  return __loadPPM(file, data, w, h, &channels);
+}
+
+inline bool sdkLoadPPM4ub(const char* file, unsigned char** data,
+                          unsigned int* w, unsigned int* h) {
+  unsigned char* idata = 0;
+  unsigned int channels;
+
+  if (__loadPPM(file, &idata, w, h, &channels)) {
+    // pad 4th component
+    int size = *w * *h;
+    // keep the original pointer
+    unsigned char* idata_orig = idata;
+    *data = (unsigned char*)malloc(sizeof(unsigned char) * size * 4);
+    unsigned char* ptr = *data;
+
+    for (int i = 0; i < size; i++) {
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = 0;
+    }
+
+    free(idata_orig);
+    return true;
+  } else {
+    free(idata);
+    return false;
+  }
+}
+
+inline bool sdkComparePPM(const char* src_file, const char* ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data, *ref_data;
+  uint64_t error_count = 0;
+  unsigned int ref_width, ref_height;
+  unsigned int src_width, src_height;
+
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+
+  if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+
+    return false;
+  }
+
+  if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PPMvsPPM: unable to load src image file: " << src_file
+              << "\n";
+    return false;
+  }
+
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+  }
+
+  if (verboseErrors) {
+    std::cerr << "PPMvsPPM: comparing images size (" << src_width << ","
+              << src_height << ") epsilon(" << epsilon << "), threshold("
+              << threshold * 100 << "%)\n";
+  }
+
+  if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon,
+                  threshold) == false)
+    error_count = 1;
+
+  if (error_count == 0) {
+    if (verboseErrors)
+      std::cerr << "    OK\n\n";
+  } else {
+    if (verboseErrors)
+      std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+  }
+
+  // returns true if all pixels pass
+  return (error_count == 0) ? true : false;
+}
+
+inline bool sdkComparePGM(const char* src_file, const char* ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data = 0, *ref_data = 0;
+  uint64_t error_count = 0;
+  unsigned int ref_width, ref_height;
+  unsigned int src_width, src_height;
+
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+
+  if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+
+    return false;
+  }
+
+  if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PGMvsPGM: unable to load src image file: " << src_file
+              << "\n";
+    return false;
+  }
+
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+  }
+
+  if (verboseErrors)
+    std::cerr << "PGMvsPGM: comparing images size (" << src_width << ","
+              << src_height << ") epsilon(" << epsilon << "), threshold("
+              << threshold * 100 << "%)\n";
+
+  if (compareData(ref_data, src_data, src_width * src_height, epsilon,
+                  threshold) == false)
+    error_count = 1;
+
+  if (error_count == 0) {
+    if (verboseErrors)
+      std::cerr << "    OK\n\n";
+  } else {
+    if (verboseErrors)
+      std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+  }
+
+  // returns true if all pixels pass
+  return (error_count == 0) ? true : false;
+}
+
+#endif // COMMON_HELPER_IMAGE_H_
diff --git a/demos/CUDA/BlackScholes/helper/helper_string.h b/demos/CUDA/BlackScholes/helper/helper_string.h
new file mode 100644
index 000000000..f6c25f659
--- /dev/null
+++ b/demos/CUDA/BlackScholes/helper/helper_string.h
@@ -0,0 +1,441 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <fstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char* string) {
+  int string_start = 0;
+
+  while (string[string_start] == delimiter)
+    string_start++;
+
+  if (string_start >= static_cast<int>(strlen(string) - 1))
+    return 0;
+
+  return string_start;
+}
+
+inline int getFileExtension(char* filename, char** extension) {
+  int string_length = static_cast<int>(strlen(filename));
+
+  while (filename[string_length--] != '.')
+    if (string_length == 0)
+      break;
+
+  if (string_length > 0)
+    string_length += 2;
+
+  if (string_length == 0)
+    *extension = NULL;
+  else
+    *extension = &filename[string_length];
+
+  return string_length;
+}
+
+inline bool checkCmdLineFlag(const int argc, const char** argv,
+                             const char* string_ref) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char* string_argv = &argv[i][string_start];
+
+      const char* equal_pos = strchr(string_argv, '=');
+      int argv_length = static_cast<int>(
+          equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (length == argv_length &&
+          !STRNCASECMP(string_argv, string_ref, length)) {
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char** argv,
+                                    const char* string_ref, T* value) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char* string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          *value = (T)atoi(&string_argv[length + auto_inc]);
+        }
+
+        bFound = true;
+        i = argc;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+inline int getCmdLineArgumentInt(const int argc, const char** argv,
+                                 const char* string_ref) {
+  bool bFound = false;
+  int value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char* string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = atoi(&string_argv[length + auto_inc]);
+        } else {
+          value = 0;
+        }
+
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (bFound)
+    return value;
+  else
+    return 0;
+}
+
+inline float getCmdLineArgumentFloat(const int argc, const char** argv,
+                                     const char* string_ref) {
+  bool bFound = false;
+  float value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char* string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = static_cast<float>(atof(&string_argv[length + auto_inc]));
+        } else {
+          value = 0.f;
+        }
+
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (bFound)
+    return value;
+  else
+    return 0;
+}
+
+inline bool getCmdLineArgumentString(const int argc, const char** argv,
+                                     const char* string_ref,
+                                     char** string_retval) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      char* string_argv = const_cast<char*>(&argv[i][string_start]);
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        *string_retval = &string_argv[length + 1];
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (!bFound)
+    *string_retval = NULL;
+
+  return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char* sdkFindFilePath(const char* filename,
+                             const char* executable_path) {
+  // <executable_name> defines a variable that is replaced with the name of the
+  // executable
+
+  // Typical relative search paths to locate needed companion files (e.g. sample
+  // input data, or JIT source files) The origin for the relative search may be
+  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+  // .exe or .bat, etc
+  const char* searchPath[] = {
+      "./",      // same dir
+      "./data/", // same dir
+
+      "../../../../Samples/<executable_name>/", // up 4 in tree
+      "../../../Samples/<executable_name>/",    // up 3 in tree
+      "../../Samples/<executable_name>/",       // up 2 in tree
+
+      "../../../../Samples/<executable_name>/data/", // up 4 in tree
+      "../../../Samples/<executable_name>/data/",    // up 3 in tree
+      "../../Samples/<executable_name>/data/",       // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/", // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/",    // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/",       // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/", // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/",    // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/",       // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/", // up
+                                                                          // 4
+                                                                          // in
+                                                                          // tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/", // up 3
+                                                                       // in
+                                                                       // tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/", // up 2 in
+                                                                    // tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/", // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/",    // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/",       // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/", // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/",    // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/",       // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/", // up 4 in
+                                                                  // tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/", // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/",    // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/", // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/",    // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/",       // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/data/", // up 4 in
+                                                                    // tree
+      "../../../Samples/0_Introduction/<executable_name>/data/", // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/data/",    // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/data/", // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/data/",    // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/data/",       // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/", // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/", // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/data/", // up 2
+                                                                         // in
+                                                                         // tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/data/", // up 4 in
+                                                                     // tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/data/",    // up 3 in
+                                                                     // tree
+      "../../Samples/3_CUDA_Features/<executable_name>/data/", // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/data/", // up 4 in
+                                                                      // tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/data/",    // up 3 in
+                                                                      // tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/data/", // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/data/", // up 4
+                                                                       // in
+                                                                       // tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/data/", // up 3 in
+                                                                    // tree
+      "../../Samples/5_Domain_Specific/<executable_name>/data/", // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/data/", // up 4 in
+                                                                   // tree
+      "../../../Samples/6_Performance/<executable_name>/data/", // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/data/",    // up 2 in tree
+
+      "../../../../Common/data/", // up 4 in tree
+      "../../../Common/data/",    // up 3 in tree
+      "../../Common/data/"        // up 2 in tree
+  };
+
+  // Extract the executable name
+  std::string executable_name;
+
+  if (executable_path != 0) {
+    executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    if (executable_name.rfind(".exe") != std::string::npos) {
+      // we strip .exe, only if the .exe is found
+      executable_name.resize(executable_name.size() - 4);
+    }
+
+#else
+    // Linux & OSX path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('/');
+    executable_name.erase(0, delimiter_pos + 1);
+#endif
+  }
+
+  // Loop over all search paths and return the first hit
+  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char*); ++i) {
+    std::string path(searchPath[i]);
+    size_t executable_name_pos = path.find("<executable_name>");
+
+    // If there is executable_name variable in the searchPath
+    // replace it with the value
+    if (executable_name_pos != std::string::npos) {
+      if (executable_path != 0) {
+        path.replace(executable_name_pos, strlen("<executable_name>"),
+                     executable_name);
+      } else {
+        // Skip this path entry if no executable argument is given
+        continue;
+      }
+    }
+
+#ifdef _DEBUG
+    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+    // Test if the file exists
+    path.append(filename);
+    FILE* fp;
+    FOPEN(fp, path.c_str(), "rb");
+
+    if (fp != NULL) {
+      fclose(fp);
+      // File found
+      // returning an allocated array here for backwards compatibility reasons
+      char* file_path = reinterpret_cast<char*>(malloc(path.length() + 1));
+      STRCPY(file_path, path.length() + 1, path.c_str());
+      return file_path;
+    }
+
+    if (fp)
+      fclose(fp);
+  }
+
+  // File not found
+  printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
+  return 0;
+}
+
+#endif // COMMON_HELPER_STRING_H_
diff --git a/demos/CUDA/BlackScholes/helper/helper_timer.h b/demos/CUDA/BlackScholes/helper/helper_timer.h
new file mode 100644
index 000000000..3869bf8ca
--- /dev/null
+++ b/demos/CUDA/BlackScholes/helper/helper_timer.h
@@ -0,0 +1,448 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Helper Timing Functions
+#ifndef COMMON_HELPER_TIMER_H_
+#define COMMON_HELPER_TIMER_H_
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include <exception>
+
+// Definition of the StopWatch Interface, this is used if we don't want to use
+// the CUT functions But rather in a self contained class interface
+class StopWatchInterface {
+public:
+  StopWatchInterface() {}
+  virtual ~StopWatchInterface() {}
+
+public:
+  //! Start time measurement
+  virtual void start() = 0;
+
+  //! Stop time measurement
+  virtual void stop() = 0;
+
+  //! Reset time counters to zero
+  virtual void reset() = 0;
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  virtual float getTime() = 0;
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  virtual float getAverageTime() = 0;
+};
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// includes, system
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch
+class StopWatchWin : public StopWatchInterface {
+public:
+  //! Constructor, default
+  StopWatchWin()
+      : start_time(), end_time(), diff_time(0.0f), total_time(0.0f),
+        running(false), clock_sessions(0), freq(0), freq_set(false) {
+    if (!freq_set) {
+      // helper variable
+      LARGE_INTEGER temp;
+
+      // get the tick frequency from the OS
+      QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER*>(&temp));
+
+      // convert to type in which it is needed
+      freq = (static_cast<double>(temp.QuadPart)) / 1000.0;
+
+      // rememeber query
+      freq_set = true;
+    }
+  }
+
+  // Destructor
+  ~StopWatchWin() {}
+
+public:
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+private:
+  // member variables
+
+  //! Start of measurement
+  LARGE_INTEGER start_time;
+  //! End of measurement
+  LARGE_INTEGER end_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+
+  //! tick frequency
+  double freq;
+
+  //! flag if the frequency has been set
+  bool freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::start() {
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER*>(&start_time));
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::stop() {
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER*>(&end_time));
+  diff_time = static_cast<float>(((static_cast<double>(end_time.QuadPart) -
+                                   static_cast<double>(start_time.QuadPart)) /
+                                  freq));
+
+  total_time += diff_time;
+  clock_sessions++;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::reset() {
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+
+  if (running)
+    QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER*>(&start_time));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getTime() {
+  // Return the TOTAL time to date
+  float retval = total_time;
+
+  if (running) {
+    LARGE_INTEGER temp;
+    QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER*>(&temp));
+    retval += static_cast<float>(((static_cast<double>(temp.QuadPart) -
+                                   static_cast<double>(start_time.QuadPart)) /
+                                  freq));
+  }
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getAverageTime() {
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <ctime>
+#include <sys/time.h>
+
+//! Windows specific implementation of StopWatch
+class StopWatchLinux : public StopWatchInterface {
+public:
+  //! Constructor, default
+  StopWatchLinux()
+      : start_time(), diff_time(0.0), total_time(0.0), running(false),
+        clock_sessions(0) {}
+
+  // Destructor
+  virtual ~StopWatchLinux() {}
+
+public:
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+private:
+  // helper functions
+
+  //! Get difference between start time and current time
+  inline float getDiffTime();
+
+private:
+  // member variables
+
+  //! Start of measurement
+  struct timeval start_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::start() {
+  gettimeofday(&start_time, 0);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::stop() {
+  diff_time = getDiffTime();
+  total_time += diff_time;
+  running = false;
+  clock_sessions++;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::reset() {
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+
+  if (running)
+    gettimeofday(&start_time, 0);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getTime() {
+  // Return the TOTAL time to date
+  float retval = total_time;
+
+  if (running)
+    retval += getDiffTime();
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getAverageTime() {
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getDiffTime() {
+  struct timeval t_time;
+  gettimeofday(&t_time, 0);
+
+  // time difference in milli-seconds
+  return static_cast<float>(1000.0 * (t_time.tv_sec - start_time.tv_sec) +
+                            (0.001 * (t_time.tv_usec - start_time.tv_usec)));
+}
+#endif // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a time has been created, otherwise false
+//! @param  name of the new timer, 0 if the creation failed
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkCreateTimer(StopWatchInterface** timer_interface) {
+// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  *timer_interface = reinterpret_cast<StopWatchInterface*>(new StopWatchWin());
+#else
+  *timer_interface =
+      reinterpret_cast<StopWatchInterface*>(new StopWatchLinux());
+#endif
+  return (*timer_interface != NULL) ? true : false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer
+//! @return true if a time has been deleted, otherwise false
+//! @param  name of the timer to delete
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkDeleteTimer(StopWatchInterface** timer_interface) {
+  // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    delete *timer_interface;
+    *timer_interface = NULL;
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the time with name \a name
+//! @param name  name of the timer to start
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStartTimer(StopWatchInterface** timer_interface) {
+  // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface)
+    (*timer_interface)->start();
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the time with name \a name. Does not reset.
+//! @param name  name of the timer to stop
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStopTimer(StopWatchInterface** timer_interface) {
+  // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface)
+    (*timer_interface)->stop();
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the timer's counter.
+//! @param name  name of the timer to reset.
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkResetTimer(StopWatchInterface** timer_interface) {
+  // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface)
+    (*timer_interface)->reset();
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer dividied by the number of completed (stopped) runs the timer
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param name  name of the timer to return the time of
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetAverageTimerValue(StopWatchInterface** timer_interface) {
+  //  printf("sdkGetAverageTimerValue called object %08x\n", (void
+  //  *)*timer_interface);
+  if (*timer_interface)
+    return (*timer_interface)->getAverageTime();
+  else
+    return 0.0f;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @param name  name of the timer to obtain the value of.
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetTimerValue(StopWatchInterface** timer_interface) {
+  // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface)
+    return (*timer_interface)->getTime();
+  else
+    return 0.0f;
+}
+
+#endif // COMMON_HELPER_TIMER_H_
diff --git a/demos/CUDA/TensorContraction.cu b/demos/CUDA/TensorContraction.cu
new file mode 100644
index 000000000..9d324a065
--- /dev/null
+++ b/demos/CUDA/TensorContraction.cu
@@ -0,0 +1,266 @@
+//--------------------------------------------------------------------*- C++ -*-
+// clad - The C++ Clang-based Automatic Differentiator
+//
+// A demo, describing how to calculate a tensor contraction and its gradient.
+//
+// author:  Christina Koutsou <christinakoutsou22-at-gmail.com>
+//----------------------------------------------------------------------------//
+
+// RUN: /path/to/clang -std=c++17 -Xclang -add-plugin -Xclang clad -Xclang -load -Xclang   \
+//          /path/to/libclad.so -I/path/to/clad/include -I/path/to/cuda/include            \
+//          TensorContraction.cu -o TensorContraction                                      \
+//          --cuda-path=path/to/cuda  --cuda-gpu-arch=%cuda_arch                           \
+//          -L/path/to/cuda/lib64 -lcudart_static -ldl -lrt -pthread -lm -lstdc++
+// RUN: ./TensorContraction
+
+#include "clad/Differentiator/Differentiator.h"
+
+typedef unsigned long long int size_type;
+
+__device__ void computeStartStep(size_type& A_start, size_type& A_step,
+                                 size_type& B_start, size_type& B_step,
+                                 const int idx, const size_type A_dim[3],
+                                 const size_type B_dim[3],
+                                 const int contractDimA,
+                                 const int contractDimB) {
+  size_type A_a, A_b, A_c, B_d, B_e, B_f;
+
+  switch (contractDimA) {
+  case 0:
+    A_b = idx / (A_dim[2] * B_dim[(contractDimB + 1) % 3] *
+                 B_dim[(contractDimB + 2) % 3]);
+    A_c = (idx /
+           (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) %
+          A_dim[2];
+    A_start = 0 + A_b * A_dim[2] + A_c;
+    A_step = A_dim[1] * A_dim[2];
+    break;
+  case 1:
+    A_a = idx / (A_dim[2] * B_dim[(contractDimB + 1) % 3] *
+                 B_dim[(contractDimB + 2) % 3]);
+    A_c = (idx /
+           (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) %
+          A_dim[2];
+    A_start = A_a * A_dim[1] * A_dim[2] + 0 + A_c;
+    A_step = A_dim[2];
+    break;
+  case 2:
+    A_a = idx / (A_dim[1] * B_dim[(contractDimB + 1) % 3] *
+                 B_dim[(contractDimB + 2) % 3]);
+    A_b = (idx /
+           (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) %
+          A_dim[1];
+    A_start = A_a * A_dim[1] * A_dim[2] + A_b * A_dim[2];
+    A_step = 1;
+    break;
+  }
+
+  switch (contractDimB) {
+  case 0:
+    B_e = (idx / B_dim[2]) % B_dim[1];
+    B_f = idx % B_dim[2];
+    B_start = 0 + B_e * B_dim[2] + B_f;
+    B_step = B_dim[1] * B_dim[2];
+    break;
+  case 1:
+    B_d = (idx / B_dim[2]) % B_dim[0];
+    B_f = idx % B_dim[2];
+    B_start = B_d * B_dim[2] * B_dim[1] + 0 + B_f;
+    B_step = B_dim[2];
+    break;
+  case 2:
+    B_d = (idx / B_dim[1]) % B_dim[0];
+    B_e = idx % B_dim[1];
+    B_start = B_d * B_dim[2] * B_dim[1] + B_e * B_dim[2];
+    B_step = 1;
+    break;
+  }
+}
+
+__global__ void tensorContraction3D(float* C, const float* A, const float* B,
+                                    const size_type* A_dim,
+                                    const size_type* B_dim,
+                                    const int contractDimA,
+                                    const int contractDimB) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Each thread computes one element of the output tensor
+  int totalElements =
+      A_dim[(contractDimA + 1) % 3] * A_dim[(contractDimA + 2) % 3] *
+      B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3];
+  if (idx < totalElements) {
+    size_type A_start, B_start, A_step, B_step;
+    size_type A_a, A_b, A_c, B_d, B_e, B_f;
+
+    computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim,
+                     contractDimA, contractDimB);
+
+    float sum = 0.0f;
+    for (int i = 0; i < A_dim[contractDimA];
+         i++) // A_dim[contractDimA] == B_dim[contractDimB]
+      sum += A[A_start + (i * A_step)] * B[B_start + (i * B_step)];
+
+    C[idx] = sum;
+  }
+}
+
+void launchTensorContraction3D(float* C, const float* A, const float* B,
+                               const size_type D1, const size_type D2,
+                               const size_type D3, const size_type D4,
+                               const size_type D5) {
+  float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
+
+  const size_type A_size = D1 * D2 * D3 * sizeof(float);
+  const size_type B_size = D3 * D4 * D5 * sizeof(float);
+  const size_type C_size = D1 * D2 * D4 * D5 * sizeof(float);
+
+  // Allocate device memory and copy data from host to device
+  cudaMalloc(&d_A, A_size);
+  cudaMalloc(&d_B, B_size);
+  cudaMalloc(&d_C, C_size);
+  cudaMemcpy(d_A, A, A_size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_B, B, B_size, cudaMemcpyHostToDevice);
+
+  const size_type A_dim[3] = {D1, D2, D3};
+  const size_type B_dim[3] = {D3, D4, D5};
+
+  size_type *d_A_dim = nullptr, *d_B_dim = nullptr;
+  cudaMalloc(&d_A_dim, 3 * sizeof(size_type));
+  cudaMalloc(&d_B_dim, 3 * sizeof(size_type));
+  cudaMemcpy(d_A_dim, A_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_B_dim, B_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice);
+
+  // Launch the kernel
+  tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim,
+                                  /*contractDimA=*/2, /*contractDimB=*/0);
+
+  // Copy the result from device to host
+  cudaMemcpy(C, d_C, C_size, cudaMemcpyDeviceToHost);
+
+  // Free device memory
+  cudaFree(d_A);
+  cudaFree(d_B);
+  cudaFree(d_C);
+  cudaFree(d_A_dim);
+  cudaFree(d_B_dim);
+}
+
+int main() {
+  const size_type D1 = 2, D2 = 3, D3 = 4, D4 = 3, D5 = 2;
+
+  float A[D1][D2][D3] = {
+      {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
+      {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}};
+
+  float B[D3][D4][D5] = {{{1, 2}, {3, 4}, {5, 6}},
+                         {{7, 8}, {9, 10}, {11, 12}},
+                         {{13, 14}, {15, 16}, {17, 18}},
+                         {{19, 20}, {21, 22}, {23, 24}}};
+
+  float C[D1][D2][D4][D5] = {0}; // Result tensor
+
+  // Compute the gradient
+  auto tensor_grad = clad::gradient(launchTensorContraction3D, "C, A, B");
+
+  // Initialize the gradient inputs
+  float gradC[D1][D2][D4][D5] = {{{{1, 1}, {1, 1}, {1, 1}},
+                                  {{1, 1}, {1, 1}, {1, 1}},
+                                  {{1, 1}, {1, 1}, {1, 1}}},
+                                 {{{1, 1}, {1, 1}, {1, 1}},
+                                  {{1, 1}, {1, 1}, {1, 1}},
+                                  {{1, 1}, {1, 1}, {1, 1}}}};
+  float gradA[D1][D2][D3] = {0};
+  float gradB[D3][D4][D5] = {0};
+
+  // Execute tensor contraction and its gradient
+  tensor_grad.execute(&C[0][0][0][0], &A[0][0][0], &B[0][0][0], D1, D2, D3, D4,
+                      D5, &gradC[0][0][0][0], &gradA[0][0][0], &gradB[0][0][0]);
+
+  // Print the result
+  std::cout << "Result C:\n";
+  for (int i = 0; i < D1; ++i) {
+    for (int j = 0; j < D2; ++j) {
+      for (int k = 0; k < D4; ++k) {
+        for (int l = 0; l < D5; ++l)
+          std::cout << C[i][j][k][l] << " ";
+        std::cout << "\n";
+      }
+      std::cout << "\n";
+    }
+    std::cout << "\n";
+  }
+
+  std::cout << "Result C_grad w.r.t. A:\n";
+  for (int i = 0; i < D1; ++i) {
+    for (int j = 0; j < D2; ++j) {
+      for (int k = 0; k < D3; ++k)
+        std::cout << gradA[i][j][k] << " ";
+      std::cout << "\n";
+    }
+    std::cout << "\n";
+  }
+
+  std::cout << "Result C_grad w.r.t. B:\n";
+  for (int i = 0; i < D3; ++i) {
+    for (int j = 0; j < D4; ++j) {
+      for (int k = 0; k < D5; ++k)
+        std::cout << gradB[i][j][k] << " ";
+      std::cout << "\n";
+    }
+    std::cout << "\n";
+  }
+
+  return 0;
+}
+
+// CHECK-EXEC: Result C:
+// CHECK-NEXT: 130 140
+// CHECK-NEXT: 150 160
+// CHECK-NEXT: 170 180
+// CHECK-NEXT:
+// CHECK-NEXT: 290 316
+// CHECK-NEXT: 342 368
+// CHECK-NEXT: 394 420
+// CHECK-NEXT:
+// CHECK-NEXT: 450 492
+// CHECK-NEXT: 534 576
+// CHECK-NEXT: 618 660
+// CHECK-NEXT:
+// CHECK-NEXT:
+// CHECK-NEXT: 610 668
+// CHECK-NEXT: 726 784
+// CHECK-NEXT: 842 900
+// CHECK-NEXT:
+// CHECK-NEXT: 770 844
+// CHECK-NEXT: 918 992
+// CHECK-NEXT: 1066 1140
+// CHECK-NEXT:
+// CHECK-NEXT: 930 1020
+// CHECK-NEXT: 1110 1200
+// CHECK-NEXT: 1290 1380
+
+// CHECK-EXEC: Result C_grad w.r.t. A:
+// CHECK-NEXT: 21 57 93 129
+// CHECK-NEXT: 21 57 93 129
+// CHECK-NEXT: 21 57 93 129
+// CHECK-NEXT:
+// CHECK-NEXT: 21 57 93 129
+// CHECK-NEXT: 21 57 93 129
+// CHECK-NEXT: 21 57 93 129
+// CHECK-NEXT:
+// CHECK-EXEC: Result C_grad w.r.t. B:
+// CHECK-NEXT: 66 66
+// CHECK-NEXT: 66 66
+// CHECK-NEXT: 66 66
+// CHECK-NEXT:
+// CHECK-NEXT: 72 72
+// CHECK-NEXT: 72 72
+// CHECK-NEXT: 72 72
+// CHECK-NEXT:
+// CHECK-NEXT: 78 78
+// CHECK-NEXT: 78 78
+// CHECK-NEXT: 78 78
+// CHECK-NEXT:
+// CHECK-NEXT: 84 84
+// CHECK-NEXT: 84 84
+// CHECK-NEXT: 84 84
\ No newline at end of file
diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index 62296ab92..8b8b4e09a 100644
--- a/include/clad/Differentiator/BuiltinDerivatives.h
+++ b/include/clad/Differentiator/BuiltinDerivatives.h
@@ -386,6 +386,28 @@ inline void free_pushforward(void* ptr, void* d_ptr) {
 // NOLINTEND(cppcoreguidelines-owning-memory)
 // NOLINTEND(cppcoreguidelines-no-malloc)
 
+CUDA_HOST_DEVICE inline void expf_pullback(float a, float d_y, float* d_a) {
+  *d_a += expf(a) * d_y;
+}
+
+CUDA_HOST_DEVICE inline void fabsf_pullback(float a, float d_y, float* d_a) {
+  *d_a += (a >= 0) ? d_y : -d_y;
+}
+
+CUDA_HOST_DEVICE inline void logf_pullback(float a, float d_y, float* d_a) {
+  *d_a += (1.F / a) * d_y;
+}
+
+CUDA_HOST_DEVICE inline void fdividef_pullback(float a, float b, float d_y,
+                                               float* d_a, float* d_b) {
+  *d_a += (1.F / b) * d_y;
+  *d_b += (-a / (b * b)) * d_y;
+}
+
+CUDA_HOST_DEVICE inline void sqrtf_pullback(float a, float d_y, float* d_a) {
+  *d_a += (1.F / (2.F * sqrtf(a))) * d_y;
+}
+
 // These are required because C variants of mathematical functions are
 // defined in global namespace.
 using std::abs_pushforward;