From 0638cc9e2143d24b33d17760a0e0ea1d1ef802b8 Mon Sep 17 00:00:00 2001 From: Mihail Mihov Date: Wed, 31 Jul 2024 17:38:01 +0300 Subject: [PATCH 01/11] CladFunction `constexpr` --- include/clad/Differentiator/CladConfig.h | 3 + include/clad/Differentiator/Differentiator.h | 220 +++++++++++++------ include/clad/Differentiator/FunctionTraits.h | 23 +- 3 files changed, 166 insertions(+), 80 deletions(-) diff --git a/include/clad/Differentiator/CladConfig.h b/include/clad/Differentiator/CladConfig.h index 39d47efd8..a81c0cef7 100644 --- a/include/clad/Differentiator/CladConfig.h +++ b/include/clad/Differentiator/CladConfig.h @@ -34,6 +34,9 @@ enum opts : unsigned { // Specifying whether we only want the diagonal of the hessian. diagonal_only = 1 << (ORDER_BITS + 4), + + // Specify that we need a constexpr-enabled CladFunction + immediate_mode = 1 << (ORDER_BITS + 7), }; // enum opts constexpr unsigned GetDerivativeOrder(const unsigned bitmasked_opts) { diff --git a/include/clad/Differentiator/Differentiator.h b/include/clad/Differentiator/Differentiator.h index dfb900e1e..413dca1a4 100644 --- a/include/clad/Differentiator/Differentiator.h +++ b/include/clad/Differentiator/Differentiator.h @@ -120,7 +120,7 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { template ::type = true> - CUDA_HOST_DEVICE return_type_t + constexpr CUDA_HOST_DEVICE return_type_t execute_with_default_args(list, F f, list, CUDA_ARGS CUDA_REST_ARGS Args&&... args) { #if defined(__CUDACC__) && !defined(__CUDA_ARCH__) @@ -148,7 +148,7 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { template ::type = true> - return_type_t + constexpr return_type_t execute_with_default_args(list, F f, list, CUDA_ARGS CUDA_REST_ARGS Args&&... args) { #if defined(__CUDACC__) && !defined(__CUDA_ARCH__) @@ -167,10 +167,10 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { template ::type = true> - CUDA_HOST_DEVICE auto + constexpr CUDA_HOST_DEVICE auto execute_with_default_args(list, ReturnType C::*f, Obj&& obj, - list, Args&&... args) - -> return_type_t { + list, + Args&&... args) -> return_type_t { return (static_cast(obj).*f)((fArgTypes)(args)..., static_cast(nullptr)...); } @@ -178,9 +178,10 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { template ::type = true> - auto execute_with_default_args(list, ReturnType C::*f, Obj&& obj, - list, Args&&... args) - -> return_type_t { + constexpr auto + execute_with_default_args(list, ReturnType C::*f, Obj&& obj, + list, + Args&&... args) -> return_type_t { return (static_cast(obj).*f)(static_cast(args)...); } @@ -192,7 +193,7 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { /// Default value of `Functor` here is temporary, and should be removed /// once all clad differentiation functions support differentiating functors. template , - bool EnablePadding = false> + bool EnablePadding = false, bool ImmediateMode = false> class CladFunction { public: using CladFunctionType = F; @@ -200,46 +201,80 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... 
val) { private: CladFunctionType m_Function; - char* m_Code; + const char* m_Code; FunctorType *m_Functor = nullptr; bool m_CUDAkernel = false; public: +#ifdef __cpp_concepts CUDA_HOST_DEVICE CladFunction(CladFunctionType f, const char* code, FunctorType* functor = nullptr, bool CUDAkernel = false) + requires(!ImmediateMode) : m_Function(f), m_Functor(functor), m_CUDAkernel(CUDAkernel) { #ifndef __CLAD_SO_LOADED static_assert(false, "clad doesn't appear to be loaded; make sure that " "you pass clad.so to clang."); #endif + size_t length = GetLength(code); + char* temp = (char*)malloc(length + 1); + m_Code = temp; + while ((*temp++ = *code++)) + ; + } + constexpr CUDA_HOST_DEVICE CladFunction(CladFunctionType f, + FunctorType* functor = nullptr, + bool CUDAkernel = false) + requires(ImmediateMode) + : m_Function(f), m_Code(""), + m_Functor(functor), m_CUDAkernel(CUDAkernel) { +#ifndef __CLAD_SO_LOADED + static_assert(false, "clad doesn't appear to be loaded; make sure that " + "you pass clad.so to clang."); +#endif + } +#else + CUDA_HOST_DEVICE CladFunction(CladFunctionType f, const char* code, + FunctorType* functor = nullptr, + bool CUDAkernel = false) + : m_Function(f), m_Functor(functor), m_CUDAkernel(CUDAkernel) { +#ifndef __CLAD_SO_LOADED + static_assert(false, "clad doesn't appear to be loaded; make sure that " + "you pass clad.so to clang."); +#endif size_t length = GetLength(code); char* temp = (char*)malloc(length + 1); m_Code = temp; while ((*temp++ = *code++)) ; } +#endif + /// Constructor overload for initializing `m_Functor` when functor /// is passed by reference. - CUDA_HOST_DEVICE - CladFunction(CladFunctionType f, const char* code, FunctorType& functor) + CUDA_HOST_DEVICE CladFunction(CladFunctionType f, const char* code, + FunctorType& functor) : CladFunction(f, code, &functor) {}; + constexpr CUDA_HOST_DEVICE CladFunction(CladFunctionType f, + FunctorType& functor) + : CladFunction(f, &functor) {}; + // Intentionally leak m_Code, otherwise we have to link against c++ runtime, // i.e -lstdc++. //~CladFunction() { /*free(m_Code);*/ } - CladFunctionType getFunctionPtr() { return m_Function; } + constexpr CladFunctionType getFunctionPtr() const { return m_Function; } template - typename std::enable_if::value, - return_type_t>::type - execute(Args&&... args) CUDA_HOST_DEVICE { - if (!m_Function) { - printf("CladFunction is invalid\n"); + typename std::enable_if< + !std::is_same::value, + return_type_t>::type constexpr execute(Args&&... args) + CUDA_HOST_DEVICE const { + if (!m_Function) return static_cast>(return_type_t()); - } if (m_CUDAkernel) { printf("Use execute_kernel() for global CUDA kernels\n"); return static_cast>(return_type_t()); @@ -278,19 +313,20 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { /// Error handling is handled in the clad side using clang diagnostics /// subsystem. template - typename std::enable_if::value, - return_type_t>::type - execute(Args&&... args) CUDA_HOST_DEVICE { + typename std::enable_if< + std::is_same::value, + return_type_t>::type constexpr execute(Args&&... args) + CUDA_HOST_DEVICE const { return static_cast>(0); } /// Return the string representation for the generated derivative. - const char* getCode() const { + constexpr const char* getCode() const { if (m_Code) return m_Code; - else - return ""; + return ""; } + void dump() const { printf("The code is: \n%s\n", getCode()); } @@ -315,8 +351,8 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { private: /// Helper function for executing non-member derived functions. 
template - CUDA_HOST_DEVICE return_type_t - execute_helper(Fn f, CUDA_ARGS Args&&... args) { + constexpr CUDA_HOST_DEVICE return_type_t + execute_helper(Fn f, CUDA_ARGS Args&&... args) const { // `static_cast` is required here for perfect forwarding. #if defined(__CUDACC__) if constexpr (sizeof...(Args) >= 2) { @@ -354,27 +390,25 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { /// Helper functions for executing member derived functions. /// If user have passed object explicitly, then this specialization will /// be used and derived function will be called through the passed object. - template < - class ReturnType, - class C, - class Obj, - class = typename std::enable_if< - std::is_same::type, C>::value>::type, - class... Args> - return_type_t - execute_helper(ReturnType C::*f, Obj&& obj, Args&&... args) { + template ::type, C>::value>::type, + class... Args> + constexpr return_type_t + execute_helper(ReturnType C::*f, Obj&& obj, Args&&... args) const { // `static_cast` is required here for perfect forwarding. - return execute_with_default_args( - DropArgs_t{}, f, static_cast(obj), - TakeNFirstArgs_t{}, - static_cast(args)...); + return execute_with_default_args( + DropArgs_t{}, f, + static_cast(obj), + TakeNFirstArgs_t{}, + static_cast(args)...); } /// If user have not passed object explicitly, then this specialization /// will be used and derived function will be called through the object /// saved in `CladFunction`. template - return_type_t execute_helper(ReturnType C::*f, - Args&&... args) { + constexpr return_type_t + execute_helper(ReturnType C::*f, Args&&... args) const { // `static_cast` is required here for perfect forwarding. return execute_with_default_args( DropArgs_t{}, f, *m_Functor, @@ -406,6 +440,8 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { typename = typename std::enable_if< !clad::HasOption(GetBitmaskedOpts(BitMaskedOpts...), opts::vector_mode) && + !clad::HasOption(GetBitmaskedOpts(BitMaskedOpts...), + opts::immediate_mode) && !std::is_class>::value>::type> CladFunction> __attribute__(( annotate("D"))) @@ -416,6 +452,23 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { code); } + template , + typename = typename std::enable_if< + !clad::HasOption(GetBitmaskedOpts(BitMaskedOpts...), + opts::vector_mode) && + clad::HasOption(GetBitmaskedOpts(BitMaskedOpts...), + opts::immediate_mode) && + !std::is_class>::value>::type> + constexpr CladFunction, false, + true> __attribute__((annotate("D"))) + differentiate(F fn, ArgSpec args = "", + DerivedFnType derivedFn = static_cast(nullptr)) { + return CladFunction, false, true>( + derivedFn); + } + /// Specialization for differentiating functors. /// The specialization is needed because objects have to be passed /// by reference whereas functions have to be passed by value. @@ -426,13 +479,13 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { !clad::HasOption(GetBitmaskedOpts(BitMaskedOpts...), opts::vector_mode) && std::is_class>::value>::type> - CladFunction> __attribute__(( - annotate("D"))) + constexpr CladFunction< + DerivedFnType, ExtractFunctorTraits_t> __attribute__((annotate("D"))) differentiate(F&& f, ArgSpec args = "", DerivedFnType derivedFn = static_cast(nullptr), const char* code = "") { - return CladFunction>(derivedFn, - code, f); + return CladFunction>(derivedFn, + code, f); } /// Generates function which computes derivative of `fn` argument w.r.t @@ -449,8 +502,8 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... 
val) { clad::HasOption(GetBitmaskedOpts(BitMaskedOpts...), opts::vector_mode) && !std::is_class>::value>::type> - CladFunction, true> __attribute__(( - annotate("D"))) + constexpr CladFunction, + true> __attribute__((annotate("D"))) differentiate(F fn, ArgSpec args = "", DerivedFnType derivedFn = static_cast(nullptr), const char* code = "") { @@ -468,9 +521,11 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { template , typename = typename std::enable_if< + !clad::HasOption(GetBitmaskedOpts(BitMaskedOpts...), + opts::immediate_mode) && !std::is_class>::value>::type> - CladFunction, true> __attribute__(( - annotate("G"))) CUDA_HOST_DEVICE + constexpr CladFunction, + true> __attribute__((annotate("G"))) CUDA_HOST_DEVICE gradient(F f, ArgSpec args = "", DerivedFnType derivedFn = static_cast(nullptr), const char* code = "", bool CUDAkernel = false) { @@ -478,6 +533,21 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { derivedFn /* will be replaced by gradient*/, code, nullptr, CUDAkernel); } + template , + typename = typename std::enable_if< + clad::HasOption(GetBitmaskedOpts(BitMaskedOpts...), + opts::immediate_mode) && + !std::is_class>::value>::type> + constexpr CladFunction, true, + true> __attribute__((annotate("G"))) CUDA_HOST_DEVICE + gradient(F f, ArgSpec args = "", + DerivedFnType derivedFn = static_cast(nullptr), + bool CUDAkernel = false) { + return CladFunction, true, true>( + derivedFn /* will be replaced by gradient*/, nullptr, CUDAkernel); + } + /// Specialization for differentiating functors. /// The specialization is needed because objects have to be passed /// by reference whereas functions have to be passed by value. @@ -485,13 +555,13 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { typename F, typename DerivedFnType = GradientDerivedFnTraits_t, typename = typename std::enable_if< std::is_class>::value>::type> - CladFunction, true> __attribute__(( - annotate("G"))) CUDA_HOST_DEVICE + constexpr CladFunction, + true> __attribute__((annotate("G"))) CUDA_HOST_DEVICE gradient(F&& f, ArgSpec args = "", DerivedFnType derivedFn = static_cast(nullptr), const char* code = "") { - return CladFunction, true>( - derivedFn /* will be replaced by gradient*/, code, f); + return CladFunction, true>( + derivedFn /* will be replaced by gradient*/, code, f); } /// Generates function which computes hessian matrix of the given function wrt @@ -504,9 +574,11 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { template , typename = typename std::enable_if< + !clad::HasOption(GetBitmaskedOpts(BitMaskedOpts...), + opts::immediate_mode) && !std::is_class>::value>::type> - CladFunction> __attribute__(( - annotate("H"))) + constexpr CladFunction< + DerivedFnType, ExtractFunctorTraits_t> __attribute__((annotate("H"))) hessian(F f, ArgSpec args = "", DerivedFnType derivedFn = static_cast(nullptr), const char* code = "") { @@ -514,6 +586,20 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { derivedFn /* will be replaced by hessian*/, code); } + template , + typename = typename std::enable_if< + clad::HasOption(GetBitmaskedOpts(BitMaskedOpts...), + opts::immediate_mode) && + !std::is_class>::value>::type> + constexpr CladFunction, false, + true> __attribute__((annotate("H"))) + hessian(F f, ArgSpec args = "", + DerivedFnType derivedFn = static_cast(nullptr)) { + return CladFunction, false, true>( + derivedFn /* will be replaced by hessian*/); + } + /// Specialization for differentiating functors. 
/// The specialization is needed because objects have to be passed /// by reference whereas functions have to be passed by value. @@ -521,13 +607,13 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { typename F, typename DerivedFnType = HessianDerivedFnTraits_t, typename = typename std::enable_if< std::is_class>::value>::type> - CladFunction> __attribute__(( - annotate("H"))) + constexpr CladFunction< + DerivedFnType, ExtractFunctorTraits_t> __attribute__((annotate("H"))) hessian(F&& f, ArgSpec args = "", DerivedFnType derivedFn = static_cast(nullptr), const char* code = "") { - return CladFunction>( - derivedFn /* will be replaced by hessian*/, code, f); + return CladFunction>( + derivedFn /* will be replaced by hessian*/, code, f); } /// Generates function which computes jacobian matrix of the given function @@ -541,8 +627,8 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { typename F, typename DerivedFnType = JacobianDerivedFnTraits_t, typename = typename std::enable_if< !std::is_class>::value>::type> - CladFunction> __attribute__(( - annotate("J"))) + constexpr CladFunction< + DerivedFnType, ExtractFunctorTraits_t> __attribute__((annotate("J"))) jacobian(F f, ArgSpec args = "", DerivedFnType derivedFn = static_cast(nullptr), const char* code = "") { @@ -557,18 +643,18 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { typename F, typename DerivedFnType = JacobianDerivedFnTraits_t, typename = typename std::enable_if< std::is_class>::value>::type> - CladFunction> __attribute__(( - annotate("J"))) + constexpr CladFunction< + DerivedFnType, ExtractFunctorTraits_t> __attribute__((annotate("J"))) jacobian(F&& f, ArgSpec args = "", DerivedFnType derivedFn = static_cast(nullptr), const char* code = "") { - return CladFunction>( - derivedFn /* will be replaced by Jacobian*/, code, f); + return CladFunction>( + derivedFn /* will be replaced by Jacobian*/, code, f); } template > - CladFunction __attribute__((annotate("E"))) + constexpr CladFunction __attribute__((annotate("E"))) estimate_error(F f, ArgSpec args = "", DerivedFnType derivedFn = static_cast(nullptr), const char* code = "") { diff --git a/include/clad/Differentiator/FunctionTraits.h b/include/clad/Differentiator/FunctionTraits.h index c15eeb270..bc568e51d 100644 --- a/include/clad/Differentiator/FunctionTraits.h +++ b/include/clad/Differentiator/FunctionTraits.h @@ -763,17 +763,15 @@ namespace clad { /// Specialization for free function pointer type template struct ExtractDerivedFnTraitsForwMode< - F*, - typename std::enable_if::value>::type> { + F*, typename std::enable_if::value>::type> { using type = remove_reference_and_pointer_t*; }; /// Specialization for member function pointer type template struct ExtractDerivedFnTraitsForwMode< - F, - typename std::enable_if< - std::is_member_function_pointer::value>::type> { + F, typename std::enable_if< + std::is_member_function_pointer::value>::type> { using type = typename std::decay::type; }; @@ -783,20 +781,19 @@ namespace clad { /// defines member typedef `type` as the type of `NoFunction*`. 
template struct ExtractDerivedFnTraitsForwMode< - F, - typename std::enable_if< - std::is_class>::value && - has_call_operator::value>::type> { + F, typename std::enable_if< + std::is_class>::value && + has_call_operator::value>::type> { using ClassType = typename std::decay>::type; using type = decltype(&ClassType::operator()); }; + template struct ExtractDerivedFnTraitsForwMode< - F, - typename std::enable_if< - std::is_class>::value && - !has_call_operator::value>::type> { + F, typename std::enable_if< + std::is_class>::value && + !has_call_operator::value>::type> { using type = NoFunction*; }; From b13cc77f4c21b883b933be9f61d23a92d09a4d3c Mon Sep 17 00:00:00 2001 From: Mihail Mihov Date: Wed, 28 Aug 2024 16:57:33 +0200 Subject: [PATCH 02/11] Process `DiffRequest`s to immediate functions earlier --- include/clad/Differentiator/DynamicGraph.h | 3 ++- tools/ClangPlugin.cpp | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/clad/Differentiator/DynamicGraph.h b/include/clad/Differentiator/DynamicGraph.h index f7b5f61b0..2ef8cf992 100644 --- a/include/clad/Differentiator/DynamicGraph.h +++ b/include/clad/Differentiator/DynamicGraph.h @@ -106,7 +106,8 @@ template class DynamicGraph { bool isProcessingNode() { return m_currentId != -1; } /// Get the nodes in the graph. - const std::vector& getNodes() { return m_nodes; } + const std::vector& getNodes() const { return m_nodes; } + std::vector& getNodes() { return m_nodes; } /// Print the nodes and edges in the graph. void print() { diff --git a/tools/ClangPlugin.cpp b/tools/ClangPlugin.cpp index d228a2dc3..b89170528 100644 --- a/tools/ClangPlugin.cpp +++ b/tools/ClangPlugin.cpp @@ -137,6 +137,17 @@ namespace clad { SetRequestOptions(opts); DiffCollector collector(DGR, CladEnabledRange, m_DiffRequestGraph, S, opts); + + for (DiffRequest& request : m_DiffRequestGraph.getNodes()) { + if (!request.Function->isImmediateFunction() && + !request.Function->isConstexpr()) + continue; + + m_DiffRequestGraph.setCurrentProcessingNode(request); + ProcessDiffRequest(request); + m_DiffRequestGraph.markCurrentNodeProcessed(); + } + // We could not delay the processing of derivatives, inform act as if each // call is final. That would still have vgvassilev/clad#248 unresolved. 
if (!m_Multiplexer) From 34eccfc662a772792e6a995dc4d9265f7a2492e9 Mon Sep 17 00:00:00 2001 From: Mihail Mihov Date: Sat, 19 Oct 2024 15:57:15 +0300 Subject: [PATCH 03/11] Add tests for constexpr and consteval --- test/ForwardMode/ConstevalTest.C | 53 +++++++++++++++++++++++++++ test/ForwardMode/ConstexprTest.C | 63 ++++++++++++++++++++++++++++++++ test/ForwardMode/constexprTest.C | 43 ---------------------- 3 files changed, 116 insertions(+), 43 deletions(-) create mode 100644 test/ForwardMode/ConstevalTest.C create mode 100644 test/ForwardMode/ConstexprTest.C delete mode 100644 test/ForwardMode/constexprTest.C diff --git a/test/ForwardMode/ConstevalTest.C b/test/ForwardMode/ConstevalTest.C new file mode 100644 index 000000000..95b0fc936 --- /dev/null +++ b/test/ForwardMode/ConstevalTest.C @@ -0,0 +1,53 @@ +// RUN: %cladclang %s -I%S/../../include -std=c++23 -oConstevalTest.out | %filecheck %s +// RUN: ./ConstevalTest.out | %filecheck_exec %s +// UNSUPPORTED: clang-8, clang-9, clang-10, clang-11, clang-12, clang-13, clang-14, clang-15, clang-16 + +#include "clad/Differentiator/Differentiator.h" + +consteval double fn(double x, double y) { + return (x+y)/2; +} + +//CHECK: consteval double fn_darg0(double x, double y) { +//CHECK-NEXT: double _d_x = 1; +//CHECK-NEXT: double _d_y = 0; +//CHECK-NEXT: double _t0 = (x + y); +//CHECK-NEXT: return ((_d_x + _d_y) * 2 - _t0 * 0) / (2 * 2); +//CHECK-NEXT:} + +consteval double mul(double a, double b, double c) { + double val = 99.00; + double result = val * a + 100 - b + c; + return result; +} + +//CHECK: consteval double mul_darg0(double a, double b, double c) { +//CHECK-NEXT: double _d_a = 1; +//CHECK-NEXT: double _d_b = 0; +//CHECK-NEXT: double _d_c = 0; +//CHECK-NEXT: double _d_val = 0.; +//CHECK-NEXT: double val = 99.; +//CHECK-NEXT: double _d_result = _d_val * a + val * _d_a + 0 - _d_b + _d_c; +//CHECK-NEXT: double result = val * a + 100 - b + c; +//CHECK-NEXT: return _d_result; +//CHECK-NEXT:} + +consteval double fn_test() { + auto dx = clad::differentiate(fn, "x"); + + return dx.execute(4, 7); +} + +consteval double mul_test() { + auto dx = clad::differentiate(mul, "a"); + + return dx.execute(5, 6, 10); +} + +int main() { + constexpr double fn_result = fn_test(); + printf("%.2f\n", fn_result); // CHECK-EXEC: 0.50 + + constexpr double mul_result = mul_test(); + printf("%.2f\n", mul_result); // CHECK-EXEC: 99.00 +} diff --git a/test/ForwardMode/ConstexprTest.C b/test/ForwardMode/ConstexprTest.C new file mode 100644 index 000000000..cdf3dcd67 --- /dev/null +++ b/test/ForwardMode/ConstexprTest.C @@ -0,0 +1,63 @@ +// RUN: %cladclang %s -I%S/../../include -std=c++23 -oConstexprTest.out | %filecheck %s +// RUN: ./ConstexprTest.out | %filecheck_exec %s +// UNSUPPORTED: clang-8, clang-9, clang-10, clang-11, clang-12, clang-13, clang-14, clang-15, clang-16 + +#include "clad/Differentiator/Differentiator.h" + +constexpr double fn(double x, double y) { + return (x + y) / 2; +} + +//CHECK: constexpr double fn_darg0(double x, double y) { +//CHECK-NEXT: double _d_x = 1; +//CHECK-NEXT: double _d_y = 0; +//CHECK-NEXT: double _t0 = (x + y); +//CHECK-NEXT: return ((_d_x + _d_y) * 2 - _t0 * 0) / (2 * 2); +//CHECK-NEXT:} + +constexpr double mul(double a, double b, double c) { + double val = 99.00; + double result = val * a + 100 - b + c; + return result; +} + +//CHECK: constexpr double mul_darg0(double a, double b, double c) { +//CHECK-NEXT: double _d_a = 1; +//CHECK-NEXT: double _d_b = 0; +//CHECK-NEXT: double _d_c = 0; +//CHECK-NEXT: double _d_val = 0.; 
+//CHECK-NEXT: double val = 99.; +//CHECK-NEXT: double _d_result = _d_val * a + val * _d_a + 0 - _d_b + _d_c; +//CHECK-NEXT: double result = val * a + 100 - b + c; +//CHECK-NEXT: return _d_result; +//CHECK-NEXT:} + +constexpr double fn_test() { + if consteval { + auto dx = clad::differentiate(fn, "x"); + + return dx.execute(4, 7); + } else { + assert(false && "fn non-immediate context"); + return -1.; + } +} + +constexpr double mul_test() { + if consteval { + auto dx = clad::differentiate(mul, "a"); + + return dx.execute(5, 6, 10); + } else { + assert(false && "mul non-immediate context"); + return -1.; + } +} + +int main() { + constexpr double fn_result = fn_test(); + printf("%.2f\n", fn_result); // CHECK-EXEC: 0.50 + + constexpr double mul_result = mul_test(); + printf("%.2f\n", mul_result); // CHECK-EXEC: 99.0 +} diff --git a/test/ForwardMode/constexprTest.C b/test/ForwardMode/constexprTest.C deleted file mode 100644 index 5ead5b1b0..000000000 --- a/test/ForwardMode/constexprTest.C +++ /dev/null @@ -1,43 +0,0 @@ -// RUN: %cladclang %s -I%S/../../include -oconstexprTest.out | %filecheck %s -// RUN: ./constexprTest.out | %filecheck_exec %s - -#include "clad/Differentiator/Differentiator.h" - -#include "../TestUtils.h" - - -constexpr double fn(double a, double b) { - return (a+b)/2; -} - -//CHECK: constexpr double fn_darg0(double a, double b) { -//CHECK-NEXT: double _d_a = 1; -//CHECK-NEXT: double _d_b = 0; -//CHECK-NEXT: double _t0 = (a + b); -//CHECK-NEXT: return ((_d_a + _d_b) * 2 - _t0 * 0) / (2 * 2); -//CHECK-NEXT:} - -constexpr double mul(double a, double b, double c) { - double val = 99.00; - double result = val * a + 100 - b + c; - return result; -} - -//CHECK: constexpr double mul_darg0(double a, double b, double c) { -//CHECK-NEXT: double _d_a = 1; -//CHECK-NEXT: double _d_b = 0; -//CHECK-NEXT: double _d_c = 0; -//CHECK-NEXT: double _d_val = 0.; -//CHECK-NEXT: double val = 99.; -//CHECK-NEXT: double _d_result = _d_val * a + val * _d_a + 0 - _d_b + _d_c; -//CHECK-NEXT: double result = val * a + 100 - b + c; -//CHECK-NEXT: return _d_result; -//CHECK-NEXT:} - -int main() { - INIT_DIFFERENTIATE(fn,"a"); - INIT_DIFFERENTIATE(mul, "a"); - - TEST_DIFFERENTIATE(fn, 4, 7); // CHECK-EXEC: {0.50} - TEST_DIFFERENTIATE(mul, 5, 6, 10); // CHECK-EXEC: {99.00} -} From c213194247dfe3219f70f54a7dc98cd79c679ecb Mon Sep 17 00:00:00 2001 From: Mihail Mihov Date: Sat, 19 Oct 2024 16:46:29 +0300 Subject: [PATCH 04/11] Do not assume index of `derivedFn` and `code` parameters --- lib/Differentiator/DiffPlanner.cpp | 70 ++++++++++++++++++------------ 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/lib/Differentiator/DiffPlanner.cpp b/lib/Differentiator/DiffPlanner.cpp index b4c7018a2..64c12ed1f 100644 --- a/lib/Differentiator/DiffPlanner.cpp +++ b/lib/Differentiator/DiffPlanner.cpp @@ -190,6 +190,20 @@ namespace clad { FunctionDecl* replacementFD = OverloadedFD ? 
OverloadedFD : FD; + auto codeArgIdx = -1; + auto derivedFnArgIdx = -1; + auto idx = 0; + for (auto* arg : call->arguments()) { + if (auto* default_arg_expr = dyn_cast(arg)) { + std::string argName = default_arg_expr->getParam()->getNameAsString(); + if (argName == "derivedFn") + derivedFnArgIdx = idx; + else if (argName == "code") + codeArgIdx = idx; + } + ++idx; + } + // Index of "CUDAkernel" parameter: int numArgs = static_cast(call->getNumArgs()); if (numArgs > 4) { @@ -204,8 +218,6 @@ namespace clad { call->setArg(kernelArgIdx, cudaKernelFlag); numArgs--; } - auto codeArgIdx = numArgs - 1; - auto derivedFnArgIdx = numArgs - 2; // Create ref to generated FD. DeclRefExpr* DRE = @@ -221,31 +233,35 @@ namespace clad { if (isa(DRE->getDecl())) DRE->setValueKind(CLAD_COMPAT_ExprValueKind_R_or_PR_Value); - // Add the "&" operator - auto newUnOp = - SemaRef.BuildUnaryOp(nullptr, noLoc, UnaryOperatorKind::UO_AddrOf, DRE) - .get(); - call->setArg(derivedFnArgIdx, newUnOp); - - // Update the code parameter. - if (CXXDefaultArgExpr* Arg - = dyn_cast(call->getArg(codeArgIdx))) { - clang::LangOptions LangOpts; - LangOpts.CPlusPlus = true; - clang::PrintingPolicy Policy(LangOpts); - Policy.Bool = true; - - std::string s; - llvm::raw_string_ostream Out(s); - FD->print(Out, Policy); - Out.flush(); - - StringLiteral* SL = utils::CreateStringLiteral(C, Out.str()); - Expr* newArg = - SemaRef.ImpCastExprToType(SL, - Arg->getType(), - CK_ArrayToPointerDecay).get(); - call->setArg(codeArgIdx, newArg); + if (derivedFnArgIdx != -1) { + // Add the "&" operator + auto* newUnOp = + SemaRef + .BuildUnaryOp(nullptr, noLoc, UnaryOperatorKind::UO_AddrOf, DRE) + .get(); + call->setArg(derivedFnArgIdx, newUnOp); + } + + // Update the code parameter if it was found. + if (codeArgIdx != -1) { + if (auto* Arg = dyn_cast(call->getArg(codeArgIdx))) { + clang::LangOptions LangOpts; + LangOpts.CPlusPlus = true; + clang::PrintingPolicy Policy(LangOpts); + Policy.Bool = true; + + std::string s; + llvm::raw_string_ostream Out(s); + FD->print(Out, Policy); + Out.flush(); + + StringLiteral* SL = utils::CreateStringLiteral(C, Out.str()); + Expr* newArg = + SemaRef + .ImpCastExprToType(SL, Arg->getType(), CK_ArrayToPointerDecay) + .get(); + call->setArg(codeArgIdx, newArg); + } } } From 294aea77a19194722b4e263c9f38c8482f5ffd15 Mon Sep 17 00:00:00 2001 From: Mihail Mihov Date: Sat, 19 Oct 2024 16:01:04 +0300 Subject: [PATCH 05/11] Mark `clad::array_ref` methods as constexpr --- include/clad/Differentiator/ArrayRef.h | 58 ++++++++++++++------------ 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/include/clad/Differentiator/ArrayRef.h b/include/clad/Differentiator/ArrayRef.h index fac7ab320..5fe173ee4 100644 --- a/include/clad/Differentiator/ArrayRef.h +++ b/include/clad/Differentiator/ArrayRef.h @@ -25,18 +25,19 @@ template class array_ref { array_ref() = default; /// Constructor to store the pointer to and size of an array supplied by the /// user - CUDA_HOST_DEVICE array_ref(T* arr, std::size_t size) + constexpr CUDA_HOST_DEVICE array_ref(T* arr, std::size_t size) : m_arr(arr), m_size(size) {} /// Constructor for arrays having size equal to 1 or non pointer types to /// store their addresses - CUDA_HOST_DEVICE array_ref(T* a) : m_arr(a), m_size(1) {} + constexpr CUDA_HOST_DEVICE array_ref(T* a) : m_arr(a), m_size(1) {} /// Constructor for clad::array types - CUDA_HOST_DEVICE array_ref(array& a) : m_arr(a.ptr()), m_size(a.size()) {} + constexpr CUDA_HOST_DEVICE array_ref(array& a) + : m_arr(a.ptr()), m_size(a.size()) 
{} /// Operator for conversion from array_ref to T*. - CUDA_HOST_DEVICE operator T*() { return m_arr; } + constexpr CUDA_HOST_DEVICE operator T*() { return m_arr; } /// Operator for conversion from array_ref to const T*. - CUDA_HOST_DEVICE operator const T*() const { return m_arr; } + constexpr CUDA_HOST_DEVICE operator const T*() const { return m_arr; } template CUDA_HOST_DEVICE array_ref& operator=(const array& a) { @@ -46,25 +47,26 @@ template class array_ref { return *this; } template - CUDA_HOST_DEVICE array_ref& operator=(const array_ref& a) { + constexpr CUDA_HOST_DEVICE array_ref& operator=(const array_ref& a) { m_arr = a.ptr(); m_size = a.size(); return *this; } /// Returns the size of the underlying array - CUDA_HOST_DEVICE std::size_t size() const { return m_size; } - CUDA_HOST_DEVICE PUREFUNC T* ptr() const { return m_arr; } - CUDA_HOST_DEVICE PUREFUNC T*& ptr_ref() { return m_arr; } + constexpr CUDA_HOST_DEVICE std::size_t size() const { return m_size; } + constexpr CUDA_HOST_DEVICE PUREFUNC T* ptr() const { return m_arr; } + constexpr CUDA_HOST_DEVICE PUREFUNC T*& ptr_ref() { return m_arr; } /// Returns an array_ref to a part of the underlying array starting at /// offset and having the specified size - CUDA_HOST_DEVICE array_ref slice(std::size_t offset, std::size_t size) { + constexpr CUDA_HOST_DEVICE array_ref slice(std::size_t offset, + std::size_t size) { assert((offset >= 0) && (offset + size <= m_size) && "Window is outside array. Please provide an offset and size " "inside the array size."); return array_ref(&m_arr[offset], size); } /// Returns the reference to the underlying array - CUDA_HOST_DEVICE PUREFUNC T& operator*() { return *m_arr; } + constexpr CUDA_HOST_DEVICE PUREFUNC T& operator*() { return *m_arr; } // Arithmetic overloads /// Divides the arrays element wise @@ -171,7 +173,7 @@ template class array_ref { /// Multiplies the arrays element wise template -CUDA_HOST_DEVICE +constexpr CUDA_HOST_DEVICE array_expression&, BinaryMul, const array_ref&> operator*(const array_ref& Ar, const array_ref& Br) { assert(Ar.size() == Br.size() && @@ -183,7 +185,7 @@ CUDA_HOST_DEVICE /// Adds the arrays element wise template -CUDA_HOST_DEVICE +constexpr CUDA_HOST_DEVICE array_expression&, BinaryAdd, const array_ref&> operator+(const array_ref& Ar, const array_ref& Br) { assert(Ar.size() == Br.size() && @@ -195,7 +197,7 @@ CUDA_HOST_DEVICE /// Subtracts the arrays element wise template -CUDA_HOST_DEVICE +constexpr CUDA_HOST_DEVICE array_expression&, BinarySub, const array_ref&> operator-(const array_ref& Ar, const array_ref& Br) { assert( @@ -208,7 +210,7 @@ CUDA_HOST_DEVICE /// Divides the arrays element wise template -CUDA_HOST_DEVICE +constexpr CUDA_HOST_DEVICE array_expression&, BinaryDiv, const array_ref&> operator/(const array_ref& Ar, const array_ref& Br) { assert(Ar.size() == Br.size() && @@ -221,7 +223,7 @@ CUDA_HOST_DEVICE /// Multiplies array_ref by a scalar template ::value, int>::type = 0> -CUDA_HOST_DEVICE array_expression&, BinaryMul, U> +constexpr CUDA_HOST_DEVICE array_expression&, BinaryMul, U> operator*(const array_ref& Ar, U a) { return array_expression&, BinaryMul, U>(Ar, a); } @@ -229,7 +231,7 @@ operator*(const array_ref& Ar, U a) { /// Multiplies array_ref by a scalar (reverse order) template ::value, int>::type = 0> -CUDA_HOST_DEVICE array_expression&, BinaryMul, U> +constexpr CUDA_HOST_DEVICE array_expression&, BinaryMul, U> operator*(U a, const array_ref& Ar) { return array_expression&, BinaryMul, U>(Ar, a); } @@ -237,7 +239,7 @@ operator*(U a, 
const array_ref& Ar) { /// Divides array_ref by a scalar template ::value, int>::type = 0> -CUDA_HOST_DEVICE array_expression&, BinaryDiv, U> +constexpr CUDA_HOST_DEVICE array_expression&, BinaryDiv, U> operator/(const array_ref& Ar, U a) { return array_expression&, BinaryDiv, U>(Ar, a); } @@ -245,7 +247,7 @@ operator/(const array_ref& Ar, U a) { /// Adds array_ref by a scalar template ::value, int>::type = 0> -CUDA_HOST_DEVICE array_expression&, BinaryAdd, U> +constexpr CUDA_HOST_DEVICE array_expression&, BinaryAdd, U> operator+(const array_ref& Ar, U a) { return array_expression&, BinaryAdd, U>(Ar, a); } @@ -253,7 +255,7 @@ operator+(const array_ref& Ar, U a) { /// Adds array_ref by a scalar (reverse order) template ::value, int>::type = 0> -CUDA_HOST_DEVICE array_expression&, BinaryAdd, U> +constexpr CUDA_HOST_DEVICE array_expression&, BinaryAdd, U> operator+(U a, const array_ref& Ar) { return array_expression&, BinaryAdd, U>(Ar, a); } @@ -261,7 +263,7 @@ operator+(U a, const array_ref& Ar) { /// Subtracts array_ref by a scalar template ::value, int>::type = 0> -CUDA_HOST_DEVICE array_expression&, BinarySub, U> +constexpr CUDA_HOST_DEVICE array_expression&, BinarySub, U> operator-(const array_ref& Ar, U a) { return array_expression&, BinarySub, U>(Ar, a); } @@ -269,7 +271,7 @@ operator-(const array_ref& Ar, U a) { /// Subtracts array_ref by a scalar (reverse order) template ::value, int>::type = 0> -CUDA_HOST_DEVICE array_expression&> +constexpr CUDA_HOST_DEVICE array_expression&> operator-(U a, const array_ref& Ar) { return array_expression&>(a, Ar); } @@ -303,16 +305,18 @@ operator-(U a, const array_ref& Ar) { template ::value || std::is_same::value>::type> - CUDA_HOST_DEVICE array_ref(T arr, std::size_t size = 1) + constexpr CUDA_HOST_DEVICE array_ref(T arr, std::size_t size = 1) : m_arr((void*)arr), m_size(size) {} template - CUDA_HOST_DEVICE array_ref(const array_ref& other) + constexpr CUDA_HOST_DEVICE array_ref(const array_ref& other) : m_arr(other.ptr()), m_size(other.size()) {} - template CUDA_HOST_DEVICE operator array_ref() { + template constexpr CUDA_HOST_DEVICE operator array_ref() { return array_ref((T*)(m_arr), m_size); } - CUDA_HOST_DEVICE void* ptr() const { return m_arr; } - CUDA_HOST_DEVICE std::size_t size() const { return m_size; } + [[nodiscard]] constexpr CUDA_HOST_DEVICE void* ptr() const { return m_arr; } + [[nodiscard]] constexpr CUDA_HOST_DEVICE std::size_t size() const { + return m_size; + } }; // NOLINTEND(*-pointer-arithmetic) } // namespace clad From 24dbec67bcebe12878ae359e763f3c9aec935897 Mon Sep 17 00:00:00 2001 From: Mihail Mihov Date: Mon, 21 Oct 2024 22:36:00 +0300 Subject: [PATCH 06/11] Keep track of whether a request is immediate in `DiffRequest` --- include/clad/Differentiator/DiffPlanner.h | 3 +++ lib/Differentiator/DiffPlanner.cpp | 2 ++ tools/ClangPlugin.cpp | 14 +++++++------- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/clad/Differentiator/DiffPlanner.h b/include/clad/Differentiator/DiffPlanner.h index 2116d5ea0..d2b74592b 100644 --- a/include/clad/Differentiator/DiffPlanner.h +++ b/include/clad/Differentiator/DiffPlanner.h @@ -65,6 +65,9 @@ struct DiffRequest { /// A flag to enable TBR analysis during reverse-mode differentiation. bool EnableTBRAnalysis = false; bool EnableVariedAnalysis = false; + /// A flag specifying whether this differentiation is to be used + /// in immediate contexts. 
+ bool ImmediateMode = false; /// Puts the derived function and its code in the diff call void updateCall(clang::FunctionDecl* FD, clang::FunctionDecl* OverloadedFD, clang::Sema& SemaRef); diff --git a/lib/Differentiator/DiffPlanner.cpp b/lib/Differentiator/DiffPlanner.cpp index 64c12ed1f..28485a413 100644 --- a/lib/Differentiator/DiffPlanner.cpp +++ b/lib/Differentiator/DiffPlanner.cpp @@ -748,6 +748,8 @@ namespace clad { request.RequestedDerivativeOrder = derivative_order; if (clad::HasOption(bitmasked_opts_value, clad::opts::use_enzyme)) request.use_enzyme = true; + if (clad::HasOption(bitmasked_opts_value, clad::opts::immediate_mode)) + request.ImmediateMode = true; if (enable_tbr_in_req) { utils::EmitDiag(m_Sema, DiagnosticsEngine::Error, endLoc, "TBR analysis is not meant for forward mode AD."); diff --git a/tools/ClangPlugin.cpp b/tools/ClangPlugin.cpp index b89170528..012b194fd 100644 --- a/tools/ClangPlugin.cpp +++ b/tools/ClangPlugin.cpp @@ -138,15 +138,15 @@ namespace clad { DiffCollector collector(DGR, CladEnabledRange, m_DiffRequestGraph, S, opts); +#if CLANG_VERSION_MAJOR > 16 for (DiffRequest& request : m_DiffRequestGraph.getNodes()) { - if (!request.Function->isImmediateFunction() && - !request.Function->isConstexpr()) - continue; - - m_DiffRequestGraph.setCurrentProcessingNode(request); - ProcessDiffRequest(request); - m_DiffRequestGraph.markCurrentNodeProcessed(); + if (request.ImmediateMode && request.Function->isConstexpr()) { + m_DiffRequestGraph.setCurrentProcessingNode(request); + ProcessDiffRequest(request); + m_DiffRequestGraph.markCurrentNodeProcessed(); + } } +#endif // We could not delay the processing of derivatives, inform act as if each // call is final. That would still have vgvassilev/clad#248 unresolved. From b9d5bbfd0200e8d31dc757be8f2b1b13cf8e1c1a Mon Sep 17 00:00:00 2001 From: Mihail Mihov Date: Thu, 24 Oct 2024 00:11:10 +0300 Subject: [PATCH 07/11] Fix ForwardMode/NotEnoughArgError.C --- test/ForwardMode/NotEnoughArgError.C | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/ForwardMode/NotEnoughArgError.C b/test/ForwardMode/NotEnoughArgError.C index df49c5b14..1660a5056 100644 --- a/test/ForwardMode/NotEnoughArgError.C +++ b/test/ForwardMode/NotEnoughArgError.C @@ -13,8 +13,8 @@ int main () { // expected-error@clad/Differentiator/Differentiator.h:* {{too few arguments to function call, expected 2, have 1}} // expected-note@clad/Differentiator/Differentiator.h:* {{in instantiation of function template specialization 'clad::execute_with_default_args' requested here}} #if __clang_major__ < 16 - // expected-note@clad/Differentiator/Differentiator.h:* {{in instantiation of function template specialization 'clad::CladFunction::execute_helper' requested here}} - // expected-note@NotEnoughArgError.C:12 {{in instantiation of function template specialization 'clad::CladFunction::execute' requested here}} + // expected-note@clad/Differentiator/Differentiator.h:* {{in instantiation of function template specialization 'clad::CladFunction::execute_helper' requested here}} + // expected-note@NotEnoughArgError.C:12 {{in instantiation of function template specialization 'clad::CladFunction::execute' requested here}} #else // expected-note@clad/Differentiator/Differentiator.h:* {{in instantiation of function template specialization 'clad::CladFunction::execute_helper' requested here}} // expected-note@NotEnoughArgError.C:12 {{in instantiation of function template specialization 'clad::CladFunction::execute' requested here}} From 
7e7aa96244e2f294871a8b7aa54f2bb94eb36cb6 Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 25 Oct 2024 22:39:28 +0300 Subject: [PATCH 08/11] Fix cuda device host constexpr execute functions declarations --- include/clad/Differentiator/Differentiator.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/include/clad/Differentiator/Differentiator.h b/include/clad/Differentiator/Differentiator.h index 413dca1a4..4429ea54d 100644 --- a/include/clad/Differentiator/Differentiator.h +++ b/include/clad/Differentiator/Differentiator.h @@ -269,10 +269,9 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { constexpr CladFunctionType getFunctionPtr() const { return m_Function; } template - typename std::enable_if< - !std::is_same::value, - return_type_t>::type constexpr execute(Args&&... args) - CUDA_HOST_DEVICE const { + typename std::enable_if::value, + return_type_t>::type constexpr CUDA_HOST_DEVICE + execute(Args&&... args) const { if (!m_Function) return static_cast>(return_type_t()); if (m_CUDAkernel) { @@ -313,10 +312,9 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { /// Error handling is handled in the clad side using clang diagnostics /// subsystem. template - typename std::enable_if< - std::is_same::value, - return_type_t>::type constexpr execute(Args&&... args) - CUDA_HOST_DEVICE const { + typename std::enable_if::value, + return_type_t>::type constexpr CUDA_HOST_DEVICE + execute(Args&&... args) const { return static_cast>(0); } From d7dcb346e7123f40d72c99b07c2d247e560a58a6 Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 25 Oct 2024 22:40:20 +0300 Subject: [PATCH 09/11] Store argPtrs of cuda kernels in a std array instead of a vector --- include/clad/Differentiator/Differentiator.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/clad/Differentiator/Differentiator.h b/include/clad/Differentiator/Differentiator.h index 4429ea54d..28d65857e 100644 --- a/include/clad/Differentiator/Differentiator.h +++ b/include/clad/Differentiator/Differentiator.h @@ -126,9 +126,8 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { #if defined(__CUDACC__) && !defined(__CUDA_ARCH__) if (CUDAkernel) { constexpr size_t totalArgs = sizeof...(args) + sizeof...(Rest); - std::vector argPtrs; - argPtrs.reserve(totalArgs); - (argPtrs.push_back(static_cast(&args)), ...); + std::array argPtrs = {static_cast(&args)..., + static_cast(nullptr)...}; void* null_param = nullptr; for (size_t i = sizeof...(args); i < totalArgs; ++i) From 21f62ce5aefc5289f6ab03fc73f747ea00985def Mon Sep 17 00:00:00 2001 From: Mihail Mihov Date: Sun, 27 Oct 2024 13:22:43 +0100 Subject: [PATCH 10/11] Rename `__CLAD_SO_LOADED` to `__CLAD__` --- include/clad/Differentiator/Differentiator.h | 6 +++--- tools/ClangPlugin.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/clad/Differentiator/Differentiator.h b/include/clad/Differentiator/Differentiator.h index 28d65857e..d5e51c4c0 100644 --- a/include/clad/Differentiator/Differentiator.h +++ b/include/clad/Differentiator/Differentiator.h @@ -211,7 +211,7 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { bool CUDAkernel = false) requires(!ImmediateMode) : m_Function(f), m_Functor(functor), m_CUDAkernel(CUDAkernel) { -#ifndef __CLAD_SO_LOADED +#ifndef __CLAD__ static_assert(false, "clad doesn't appear to be loaded; make sure that " "you pass clad.so to clang."); #endif @@ -229,7 +229,7 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... 
val) { : m_Function(f), m_Code(""),
 m_Functor(functor), m_CUDAkernel(CUDAkernel) {
-#ifndef __CLAD_SO_LOADED
+#ifndef __CLAD__
 static_assert(false, "clad doesn't appear to be loaded; make sure that " "you pass clad.so to clang.");
 #endif
@@ -239,7 +239,7 @@ CUDA_HOST_DEVICE T push(tape& to, ArgsT... val) { FunctorType* functor = nullptr, bool CUDAkernel = false) : m_Function(f), m_Functor(functor), m_CUDAkernel(CUDAkernel) {
-#ifndef __CLAD_SO_LOADED
+#ifndef __CLAD__
 static_assert(false, "clad doesn't appear to be loaded; make sure that " "you pass clad.so to clang.");
 #endif
diff --git a/tools/ClangPlugin.cpp b/tools/ClangPlugin.cpp
index 012b194fd..c868336dd 100644
--- a/tools/ClangPlugin.cpp
+++ b/tools/ClangPlugin.cpp
@@ -91,10 +91,10 @@ namespace clad { } #endif // CLANG_VERSION_MAJOR > 8
-  // Add define for __CLAD_SO_LOADED, so that CladFunction::CladFunction()
+  // Add define for __CLAD__, so that CladFunction::CladFunction()
   // doesn't throw an error.
   auto predefines = m_CI.getPreprocessor().getPredefines();
-  predefines.append("#define __CLAD_SO_LOADED 1\n");
+  predefines.append("#define __CLAD__ 1\n");
   m_CI.getPreprocessor().setPredefines(predefines);
 }

From 27f1892dae4dfb8a15278e48d79dbb24b1ac3c27 Mon Sep 17 00:00:00 2001
From: Mihail Mihov
Date: Tue, 29 Oct 2024 15:21:28 +0100
Subject: [PATCH 11/11] Add page to documentation explaining `clad::immediate_mode`

---
 docs/userDocs/source/index.rst | 1 +
 .../source/user/UsingImmediateMode.rst | 71 +++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 docs/userDocs/source/user/UsingImmediateMode.rst

diff --git a/docs/userDocs/source/index.rst b/docs/userDocs/source/index.rst
index 8d41a0135..69da583e3 100644
--- a/docs/userDocs/source/index.rst
+++ b/docs/userDocs/source/index.rst
@@ -90,6 +90,7 @@ The User Guide
   user/tutorials
   user/UsingEnzymeWithinClad
   user/UsingVectorMode.rst
+  user/UsingImmediateMode
   user/FAQ
   user/DevelopersDocumentation
   user/IntroductionToClangForCladContributors

diff --git a/docs/userDocs/source/user/UsingImmediateMode.rst b/docs/userDocs/source/user/UsingImmediateMode.rst
new file mode 100644
index 000000000..e350958b2
--- /dev/null
+++ b/docs/userDocs/source/user/UsingImmediateMode.rst
@@ -0,0 +1,71 @@
+Using Clad-generated derivatives in an immediate context
+**********************************************************
+
+The derivatives that Clad generates are valid C++ code, which could in theory
+be executed at compile time (or in an immediate context, as the C++ standard
+calls it). When a function is differentiated, all specifiers, such as
+`constexpr` and `consteval`, are kept, but it is important to understand the
+interface that Clad provides for those derivatives to the user.
+
+When Clad differentiates a function (e.g. with `clad::differentiate`), the user
+receives a `CladFunction`, which contains, among other things, a function
+pointer to the generated derivative. Unfortunately, the C++ standard heavily
+restricts how function pointers may be handled in an immediate context, so care
+needs to be taken not to violate these rules, or the compiler won't be able to
+evaluate the `constexpr`/`consteval` functions during translation.
+
+Currently, to get a `CladFunction` that is usable in immediate mode, the user
+has to pass `clad::immediate_mode` to the differentiation function. This
+removes the ability to dump the generated derivative, but it may be possible
+to add support for that in the future.
+
+Usage of Clad's immediate mode
+================================================
+
+The following code snippet shows how one can request Clad to use the immediate
+mode for differentiation::
+
+    #include "clad/Differentiator/Differentiator.h"
+
+    constexpr double fn(double x, double y) {
+      return (x + y) / 2;
+    }
+
+    constexpr double fn_test() {
+      auto dx = clad::differentiate<clad::immediate_mode>(fn, "x");
+
+      return dx.execute(4, 7);
+    }
+
+    int main() {
+      constexpr double fn_result = fn_test();
+
+      printf("%.2f\n", fn_result);
+    }
+
+It is necessary both to pass the `clad::immediate_mode` option to
+`clad::differentiate` and to keep the call to `clad::differentiate` and all of
+its `.execute(...)` calls in the same immediate context, as the C++ standard
+forbids having a function pointer to an immediate function outside of an
+immediate context. (It is not possible to perform the differentiation and the
+executions directly in `main`, because `dx` would contain such a pointer, and
+`main` is not, and cannot be, an immediate function.)
+
+When using `constexpr` there is no easy way to tell whether the functions are
+actually being evaluated during translation, so it is a good idea either to
+use `consteval` or an `if consteval` block (in C++23 and newer) to check that
+the immediate contexts behave as expected, or to assign the results to a
+variable marked `constexpr`, as that would fail if the assigned expression is
+not a constant expression.
+
+Use cases supported by Clad's immediate mode
+================================================
+
+Currently, Clad's immediate mode is primarily meant to be used in the forward
+mode (`clad::differentiate`), as the internal data structures that Clad needs
+for differentiating loops, etc. are not yet usable in an immediate context.
+
+Both `constexpr` and `consteval` are supported, as Clad doesn't actually rely
+on these specific keywords; instead, it uses Clang's API to determine whether
+the functions are immediate and should be differentiated earlier.
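A minimal `consteval` sketch, based on the ConstevalTest added earlier in this
series, shows the same pattern with the wrapper itself being an immediate
function; `fn`, `fn_test`, and the expected output (0.50) are taken from that
test::

    #include "clad/Differentiator/Differentiator.h"
    #include <cstdio>

    consteval double fn(double x, double y) {
      return (x + y) / 2;
    }

    consteval double fn_test() {
      // The differentiation and the execute() call stay together in the
      // same immediate context, as required for immediate-mode CladFunctions.
      auto dx = clad::differentiate<clad::immediate_mode>(fn, "x");
      return dx.execute(4, 7);
    }

    int main() {
      // Initializing a constexpr variable forces evaluation during translation.
      constexpr double fn_result = fn_test();
      printf("%.2f\n", fn_result); // prints 0.50
    }

Because `fn_test` is `consteval`, any failure to evaluate it during translation
is a hard error, which makes this variant a convenient way to verify that the
immediate mode is actually taking effect.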