From a078f35f0bbff27705cae19a443f1451b3ecc772 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 30 Jan 2024 19:09:58 -0800
Subject: [PATCH 1/7] Improve libCEED quadrature point geometry data
 construction (avoid copying entire data from device to host and back to
 device)

---
 palace/fem/mesh.cpp | 71 +++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 34 deletions(-)
diff --git a/palace/fem/mesh.cpp b/palace/fem/mesh.cpp
index aaac29d86..4ac1e29d5 100644
--- a/palace/fem/mesh.cpp
+++ b/palace/fem/mesh.cpp
@@ -3,6 +3,8 @@
 
 #include "mesh.hpp"
 
+#include <ceed/backend.h>
+#include <mfem/general/forall.hpp>
 #include "fem/coefficient.hpp"
 #include "fem/fespace.hpp"
 #include "fem/libceed/integrator.hpp"
@@ -137,6 +139,33 @@ auto GetElementIndices(const mfem::ParMesh &mesh, bool use_bdr, int start, int s
   return element_indices;
 }
 
+void AssembleCeedAttributeData(const mfem::Array<int> &attr, int Q, Ceed ceed,
+                               CeedVector geom_data, CeedElemRestriction geom_data_restr)
+{
+  // The data for quadrature point i, component j, element k is found at index
+  // i * layout[0] + j * layout[1] + k * layout[2]. This is also correctly a non-OpenMP loop
+  // when executed on the CPU (OpenMP is handled in outer scope).
+  CeedMemType mem;
+  CeedInt layout[3];
+  CeedScalar *geom_data_array;
+  PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem));
+  if (!mfem::Device::Allows(mfem::Backend::DEVICE_MASK))
+  {
+    mem = CEED_MEM_HOST;
+  }
+  PalaceCeedCall(ceed, CeedElemRestrictionGetELayout(geom_data_restr, &layout));
+  PalaceCeedCall(ceed, CeedVectorGetArray(geom_data, mem, &geom_data_array));
+  const auto *d_attr = attr.Read(mem == CEED_MEM_DEVICE);
+  mfem::forall_switch(mem == CEED_MEM_DEVICE, attr.Size() * Q,
+                      [=] MFEM_HOST_DEVICE(int l)
+                      {
+                        int k = l / Q;
+                        int i = l % Q;
+                        geom_data_array[i * layout[0] + k * layout[2]] = d_attr[k];
+                      });
+  CeedVectorRestoreArray(geom_data, &geom_data_array);
+}
+
 template <typename T>
 auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed,
                           mfem::Geometry::Type geom, std::vector<int> &indices,
@@ -160,30 +189,10 @@ auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed,
   PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(mesh_basis, &num_qpts));
   PalaceCeedCall(
       ceed, CeedVectorCreate(ceed, num_elem * num_qpts * geom_data_size, &data.geom_data));
-
-  // Data for quadrature point i, component j, element k is found at index i * strides[0] +
-  // j * strides[1] + k * strides[2].
-  CeedMemType mem;
-  CeedInt strides[3];
-  PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem));
-  if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
-  {
-    // GPU backends have CEED_STRIDES_BACKEND = {1, num_elem * num_qpts, num_qpts}.
-    strides[0] = 1;
-    strides[1] = num_elem * num_qpts;
-    strides[2] = num_qpts;
-  }
-  else
-  {
-    // CPU backends have CEED_STRIDES_BACKEND = {1, num_qpts, num_qpts * geom_data_size}.
-    strides[0] = 1;
-    strides[1] = num_qpts;
-    strides[2] = num_qpts * geom_data_size;
-  }
-  PalaceCeedCall(ceed,
-                 CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, geom_data_size,
-                                                  num_elem * num_qpts * geom_data_size,
-                                                  strides, &data.geom_data_restr));
+  PalaceCeedCall(
+      ceed, CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, geom_data_size,
+                                             num_elem * num_qpts * geom_data_size,
+                                             CEED_STRIDES_BACKEND, &data.geom_data_restr));
 
   // Compute the required geometry factors at quadrature points.
   CeedVector mesh_nodes_vec;
@@ -196,22 +205,16 @@ auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed,
   PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&mesh_restr));
   PalaceCeedCall(ceed, CeedBasisDestroy(&mesh_basis));
 
-  // Compute element attribute quadrature data. All inputs to a QFunction require the same
+  // Set the element attribute quadrature data. All inputs to a QFunction require the same
   // number of quadrature points, so we store the attribute at each quadrature point. This
   // is the first component of the quadrature data.
   {
-    CeedScalar *geom_data_array;
-    PalaceCeedCall(ceed,
-                   CeedVectorGetArray(data.geom_data, CEED_MEM_HOST, &geom_data_array));
+    mfem::Array<int> attr(num_elem);
     for (std::size_t k = 0; k < num_elem; k++)
     {
-      const auto attr = GetCeedAttribute(data.indices[k]);
-      for (CeedInt i = 0; i < num_qpts; i++)
-      {
-        geom_data_array[i * strides[0] + k * strides[2]] = attr;
-      }
+      attr[k] = GetCeedAttribute(data.indices[k]);
     }
-    CeedVectorRestoreArray(data.geom_data, &geom_data_array);
+    AssembleCeedAttributeData(attr, num_qpts, ceed, data.geom_data, data.geom_data_restr);
   }
 
   return data;

From 01e7606cae7316b65dac01c9df4e06fc71ae2d28 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 30 Jan 2024 19:11:09 -0800
Subject: [PATCH 2/7] Never assemble operator quadrature data explicitly
 (memory savings is more significant than minor QFunction speedup)

---
 palace/models/curlcurloperator.cpp | 2 +-
 palace/models/laplaceoperator.cpp  | 2 +-
 palace/models/spaceoperator.cpp    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/palace/models/curlcurloperator.cpp b/palace/models/curlcurloperator.cpp
index 8a30ec005..3b544c90a 100644
--- a/palace/models/curlcurloperator.cpp
+++ b/palace/models/curlcurloperator.cpp
@@ -163,7 +163,7 @@ std::unique_ptr<Operator> CurlCurlOperator::GetStiffnessMatrix()
                                          mat_op.GetInvPermeability());
   BilinearForm k(GetNDSpace());
   k.AddDomainIntegrator<CurlCurlIntegrator>(muinv_func);
-  k.AssembleQuadratureData();
+  // k.AssembleQuadratureData();
   auto k_vec = k.Assemble(GetNDSpaces(), skip_zeros);
   auto K = std::make_unique<MultigridOperator>(GetNDSpaces().GetNumLevels());
   for (std::size_t l = 0; l < GetNDSpaces().GetNumLevels(); l++)
diff --git a/palace/models/laplaceoperator.cpp b/palace/models/laplaceoperator.cpp
index 509067e63..c110a0726 100644
--- a/palace/models/laplaceoperator.cpp
+++ b/palace/models/laplaceoperator.cpp
@@ -184,7 +184,7 @@ std::unique_ptr<Operator> LaplaceOperator::GetStiffnessMatrix()
                                            mat_op.GetPermittivityReal());
   BilinearForm k(GetH1Space());
   k.AddDomainIntegrator<DiffusionIntegrator>(epsilon_func);
-  k.AssembleQuadratureData();
+  // k.AssembleQuadratureData();
   auto k_vec = k.Assemble(GetH1Spaces(), skip_zeros);
   auto K = std::make_unique<MultigridOperator>(GetH1Spaces().GetNumLevels());
   for (std::size_t l = 0; l < GetH1Spaces().GetNumLevels(); l++)
diff --git a/palace/models/spaceoperator.cpp b/palace/models/spaceoperator.cpp
index 631c669f8..47135b26b 100644
--- a/palace/models/spaceoperator.cpp
+++ b/palace/models/spaceoperator.cpp
@@ -694,7 +694,7 @@ std::unique_ptr<OperType> SpaceOperator::GetPreconditionerMatrix(double a0, doub
   const auto n_levels = GetNDSpaces().GetNumLevels();
   std::vector<std::unique_ptr<Operator>> br_vec(n_levels), bi_vec(n_levels),
       br_aux_vec(n_levels), bi_aux_vec(n_levels);
-  constexpr bool skip_zeros = false, assemble_q_data = true;
+  constexpr bool skip_zeros = false, assemble_q_data = false;
   if (std::is_same<OperType, ComplexOperator>::value && !pc_mat_real)
   {
     MaterialPropertyCoefficient dfr(mat_op.MaxCeedAttribute()),

From 2a5211456c3aff9c134c506f959e6418a3cdcad5 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Thu, 28 Dec 2023 09:37:54 -0800
Subject: [PATCH 3/7] Only use OpenMP with libCEED when not using GPU backends

---
 palace/fem/libceed/ceed.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/palace/fem/libceed/ceed.cpp b/palace/fem/libceed/ceed.cpp
index 43c555eb4..abe25f4e2 100644
--- a/palace/fem/libceed/ceed.cpp
+++ b/palace/fem/libceed/ceed.cpp
@@ -3,6 +3,7 @@
 
 #include "ceed.hpp"
 
+#include <string_view>
 #include "utils/omp.hpp"
 
 #if defined(MFEM_USE_OPENMP)
@@ -31,7 +32,9 @@ void Initialize(const char *resource, const char *jit_source_dir)
     PalacePragmaOmp(master)
     {
 #if defined(MFEM_USE_OPENMP)
-      const int nt = omp_get_num_threads();
+      // Only parallelize libCEED operators over threads when not using the GPU.
+      const int nt =
+          !std::string_view(resource).compare(0, 4, "/cpu") ? omp_get_num_threads() : 1;
 #else
       const int nt = 1;
 #endif

From 0414d748f09a747e3fc2c2cb95721d275804b8af Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 2 Jan 2024 09:36:17 -0800
Subject: [PATCH 4/7] Avoid constructing discrete gradient matrix on coarsest
 mesh if not needed

---
 palace/fem/fespace.hpp       | 3 ++-
 palace/fem/libceed/basis.cpp | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/palace/fem/fespace.hpp b/palace/fem/fespace.hpp
index f7fbd3033..db3e4e490 100644
--- a/palace/fem/fespace.hpp
+++ b/palace/fem/fespace.hpp
@@ -300,7 +300,8 @@ class AuxiliaryFiniteElementSpaceHierarchy
   std::vector<const Operator *> GetDiscreteInterpolators() const
   {
     std::vector<const Operator *> G_(GetNumLevels());
-    for (std::size_t l = 0; l < G_.size(); l++)
+    G_[0] = nullptr;  // No discrete interpolator for coarsest level
+    for (std::size_t l = 1; l < G_.size(); l++)
     {
       G_[l] = &GetDiscreteInterpolatorAtLevel(l);
     }
diff --git a/palace/fem/libceed/basis.cpp b/palace/fem/libceed/basis.cpp
index 3623f91ec..672f3e97b 100644
--- a/palace/fem/libceed/basis.cpp
+++ b/palace/fem/libceed/basis.cpp
@@ -192,7 +192,7 @@ void InitInterpolatorBasis(const mfem::FiniteElement &trial_fe,
   if constexpr (false)
   {
     std::cout << "New interpolator basis (" << ceed << ", " << &trial_fe << ", " << &test_fe
-              << ")\n";
+              << ", " << (trial_fe.GetMapType() == test_fe.GetMapType()) << ")\n";
   }
   if constexpr (false)
   {

From 0940b871bf116b9f2bbb6e9041c53e766e41e0d1 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 30 Jan 2024 17:51:51 -0800
Subject: [PATCH 5/7] Add missing AddMult(Transpose) overrides

---
 palace/linalg/operator.cpp | 61 ++++++++++++++++++++++++++++++++++++++
 palace/linalg/operator.hpp | 55 ++++++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/palace/linalg/operator.cpp b/palace/linalg/operator.cpp
index 461f4b2f6..301bdf720 100644
--- a/palace/linalg/operator.cpp
+++ b/palace/linalg/operator.cpp
@@ -501,6 +501,41 @@ void BaseDiagonalOperator<ComplexOperator>::Mult(const ComplexVector &x,
                });
 }
 
+template <>
+void BaseDiagonalOperator<Operator>::AddMult(const Vector &x, Vector &y,
+                                             const double a) const
+{
+  const int N = this->height;
+  const auto *D = d.Read();
+  const auto *X = x.Read();
+  auto *Y = y.Write();
+  mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { Y[i] += a * D[i] * X[i]; });
+}
+
+template <>
+void BaseDiagonalOperator<ComplexOperator>::AddMult(const ComplexVector &x,
+                                                    ComplexVector &y,
+                                                    const std::complex<double> a) const
+{
+  const int N = this->height;
+  const double ar = a.real();
+  const double ai = a.imag();
+  const auto *DR = d.Real().Read();
+  const auto *DI = d.Imag().Read();
+  const auto *XR = x.Real().Read();
+  const auto *XI = x.Imag().Read();
+  auto *YR = y.Real().Write();
+  auto *YI = y.Imag().Write();
+  mfem::forall(N,
+               [=] MFEM_HOST_DEVICE(int i)
+               {
+                 const auto tr = DR[i] * XR[i] - DI[i] * XI[i];
+                 const auto ti = DI[i] * XR[i] + DR[i] * XI[i];
+                 YR[i] += ar * tr - ai * ti;
+                 YI[i] += ai * ti + ar * ti;
+               });
+}
+
 template <>
 void DiagonalOperatorHelper<BaseDiagonalOperator<ComplexOperator>,
                             ComplexOperator>::MultHermitianTranspose(const ComplexVector &x,
@@ -523,6 +558,32 @@ void DiagonalOperatorHelper<BaseDiagonalOperator<ComplexOperator>,
                });
 }
 
+template <>
+void DiagonalOperatorHelper<BaseDiagonalOperator<ComplexOperator>, ComplexOperator>::
+    AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y,
+                              const std::complex<double> a) const
+{
+  const ComplexVector &d =
+      static_cast<const BaseDiagonalOperator<ComplexOperator> *>(this)->d;
+  const int N = this->height;
+  const double ar = a.real();
+  const double ai = a.imag();
+  const auto *DR = d.Real().Read();
+  const auto *DI = d.Imag().Read();
+  const auto *XR = x.Real().Read();
+  const auto *XI = x.Imag().Read();
+  auto *YR = y.Real().Write();
+  auto *YI = y.Imag().Write();
+  mfem::forall(N,
+               [=] MFEM_HOST_DEVICE(int i)
+               {
+                 const auto tr = DR[i] * XR[i] + DI[i] * XI[i];
+                 const auto ti = -DI[i] * XR[i] + DR[i] * XI[i];
+                 YR[i] += ar * tr - ai * ti;
+                 YI[i] += ai * ti + ar * ti;
+               });
+}
+
 namespace linalg
 {
 
diff --git a/palace/linalg/operator.hpp b/palace/linalg/operator.hpp
index ab204c076..89f3890b9 100644
--- a/palace/linalg/operator.hpp
+++ b/palace/linalg/operator.hpp
@@ -161,6 +161,16 @@ class ProductOperatorHelper<ProductOperator, ComplexOperator> : public ComplexOp
     A.MultHermitianTranspose(x, z);
     B.MultHermitianTranspose(z, y);
   }
+
+  void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y,
+                                 const std::complex<double> a = 1.0) const override
+  {
+    const ComplexOperator &A = static_cast<const ProductOperator *>(this)->A;
+    const ComplexOperator &B = static_cast<const ProductOperator *>(this)->B;
+    ComplexVector &z = static_cast<const ProductOperator *>(this)->z;
+    A.MultHermitianTranspose(x, z);
+    B.AddMultHermitianTranspose(z, y, a);
+  }
 };
 
 template <typename OperType>
@@ -171,6 +181,9 @@ class BaseProductOperator
 
   using VecType = typename std::conditional<std::is_same<OperType, ComplexOperator>::value,
                                             ComplexVector, Vector>::type;
+  using ScalarType =
+      typename std::conditional<std::is_same<OperType, ComplexOperator>::value,
+                                std::complex<double>, double>::type;
 
 private:
   const OperType &A, &B;
@@ -194,6 +207,19 @@ class BaseProductOperator
     A.MultTranspose(x, z);
     B.MultTranspose(z, y);
   }
+
+  void AddMult(const VecType &x, VecType &y, const ScalarType a = 1.0) const override
+  {
+    B.Mult(x, z);
+    A.AddMult(z, y, a);
+  }
+
+  void AddMultTranspose(const VecType &x, VecType &y,
+                        const ScalarType a = 1.0) const override
+  {
+    A.MultTranspose(x, z);
+    B.AddMultTranspose(z, y, a);
+  }
 };
 
 using ProductOperator = BaseProductOperator<Operator>;
@@ -219,6 +245,9 @@ class DiagonalOperatorHelper<DiagonalOperator, ComplexOperator> : public Complex
   DiagonalOperatorHelper(int s) : ComplexOperator(s) {}
 
   void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const override;
+
+  void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y,
+                                 const std::complex<double> a = 1.0) const override;
 };
 
 template <typename OperType>
@@ -229,6 +258,9 @@ class BaseDiagonalOperator
 
   using VecType = typename std::conditional<std::is_same<OperType, ComplexOperator>::value,
                                             ComplexVector, Vector>::type;
+  using ScalarType =
+      typename std::conditional<std::is_same<OperType, ComplexOperator>::value,
+                                std::complex<double>, double>::type;
 
 private:
   const VecType &d;
@@ -242,6 +274,14 @@ class BaseDiagonalOperator
   void Mult(const VecType &x, VecType &y) const override;
 
   void MultTranspose(const VecType &x, VecType &y) const override { Mult(x, y); }
+
+  void AddMult(const VecType &x, VecType &y, const ScalarType a = 1.0) const override;
+
+  void AddMultTranspose(const VecType &x, VecType &y,
+                        const ScalarType a = 1.0) const override
+  {
+    AddMult(x, y, a);
+  }
 };
 
 using DiagonalOperator = BaseDiagonalOperator<Operator>;
@@ -256,6 +296,9 @@ class BaseMultigridOperator : public OperType
 {
   using VecType = typename std::conditional<std::is_same<OperType, ComplexOperator>::value,
                                             ComplexVector, Vector>::type;
+  using ScalarType =
+      typename std::conditional<std::is_same<OperType, ComplexOperator>::value,
+                                std::complex<double>, double>::type;
 
 private:
   std::vector<std::unique_ptr<OperType>> ops, aux_ops;
@@ -300,10 +343,22 @@ class BaseMultigridOperator : public OperType
   }
 
   void Mult(const VecType &x, VecType &y) const override { GetFinestOperator().Mult(x, y); }
+
   void MultTranspose(const VecType &x, VecType &y) const override
   {
     GetFinestOperator().MultTranspose(x, y);
   }
+
+  void AddMult(const VecType &x, VecType &y, const ScalarType a = 1.0) const override
+  {
+    GetFinestOperator().AddMult(x, y, a);
+  }
+
+  void AddMultTranspose(const VecType &x, VecType &y,
+                        const ScalarType a = 1.0) const override
+  {
+    GetFinestOperator().AddMultTranspose(x, y, a);
+  }
 };
 
 using MultigridOperator = BaseMultigridOperator<Operator>;

From 40de187c6d7e4ad5686215b99d47a92f60592df7 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Thu, 1 Feb 2024 18:18:49 -0800
Subject: [PATCH 6/7] Parallel prolongation and restriction sometimes do not
 have appropriate AddMult/AddMultTranspose overrides, causing temporary vector
 allocation

So, fix it by handling this at the ParOperator level.
---
 palace/linalg/rap.cpp | 118 ++++++++++--------------------------------
 palace/linalg/rap.hpp |   2 -
 2 files changed, 26 insertions(+), 94 deletions(-)

diff --git a/palace/linalg/rap.cpp b/palace/linalg/rap.cpp
index 457f22511..4773ac9aa 100644
--- a/palace/linalg/rap.cpp
+++ b/palace/linalg/rap.cpp
@@ -52,18 +52,21 @@ void ParOperator::EliminateRHS(const Vector &x, Vector &b) const
   }
 
   MFEM_VERIFY(A, "No local matrix available for ParOperator::EliminateRHS!");
-  auto &tx = trial_fespace.GetTVector<Vector>();
   auto &lx = trial_fespace.GetLVector<Vector>();
   auto &ly = GetTestLVector();
-  tx = 0.0;
-  linalg::SetSubVector(tx, *dbc_tdof_list, x);
-  trial_fespace.GetProlongationMatrix()->Mult(tx, lx);
+  {
+    auto &tx = trial_fespace.GetTVector<Vector>();
+    tx = 0.0;
+    linalg::SetSubVector(tx, *dbc_tdof_list, x);
+    trial_fespace.GetProlongationMatrix()->Mult(tx, lx);
+  }
 
   // Apply the unconstrained operator.
   A->Mult(lx, ly);
-  ly *= -1.0;
 
-  RestrictionMatrixAddMult(ly, b);
+  auto &ty = test_fespace.GetTVector<Vector>();
+  RestrictionMatrixMult(ly, ty);
+  b.Add(-1.0, ty);
   if (diag_policy == DiagonalPolicy::DIAG_ONE)
   {
     linalg::SetSubVector(b, *dbc_tdof_list, x);
@@ -292,10 +295,10 @@ void ParOperator::AddMult(const Vector &x, Vector &y, const double a) const
   // Apply the operator on the L-vector.
   A->Mult(lx, ly);
 
+  auto &ty = test_fespace.GetTVector<Vector>();
+  RestrictionMatrixMult(ly, ty);
   if (dbc_tdof_list)
   {
-    auto &ty = test_fespace.GetTVector<Vector>();
-    RestrictionMatrixMult(ly, ty);
     if (diag_policy == DiagonalPolicy::DIAG_ONE)
     {
       linalg::SetSubVector(ty, *dbc_tdof_list, x);
@@ -304,16 +307,8 @@ void ParOperator::AddMult(const Vector &x, Vector &y, const double a) const
     {
       linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
     }
-    y.Add(a, ty);
-  }
-  else
-  {
-    if (a != 1.0)
-    {
-      ly *= a;
-    }
-    RestrictionMatrixAddMult(ly, y);
   }
+  y.Add(a, ty);
 }
 
 void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) const
@@ -343,10 +338,10 @@ void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) c
   // Apply the operator on the L-vector.
   A->MultTranspose(ly, lx);
 
+  auto &tx = trial_fespace.GetTVector<Vector>();
+  trial_fespace.GetProlongationMatrix()->MultTranspose(lx, tx);
   if (dbc_tdof_list)
   {
-    auto &tx = trial_fespace.GetTVector<Vector>();
-    trial_fespace.GetProlongationMatrix()->MultTranspose(lx, tx);
     if (diag_policy == DiagonalPolicy::DIAG_ONE)
     {
       linalg::SetSubVector(tx, *dbc_tdof_list, x);
@@ -355,16 +350,8 @@ void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) c
     {
       linalg::SetSubVector(tx, *dbc_tdof_list, 0.0);
     }
-    y.Add(a, tx);
-  }
-  else
-  {
-    if (a != 1.0)
-    {
-      lx *= a;
-    }
-    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx, y);
   }
+  y.Add(a, tx);
 }
 
 void ParOperator::RestrictionMatrixMult(const Vector &ly, Vector &ty) const
@@ -379,18 +366,6 @@ void ParOperator::RestrictionMatrixMult(const Vector &ly, Vector &ty) const
   }
 }
 
-void ParOperator::RestrictionMatrixAddMult(const Vector &ly, Vector &ty) const
-{
-  if (!use_R)
-  {
-    test_fespace.GetProlongationMatrix()->AddMultTranspose(ly, ty);
-  }
-  else
-  {
-    test_fespace.GetRestrictionMatrix()->AddMult(ly, ty);
-  }
-}
-
 void ParOperator::RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) const
 {
   if (!use_R)
@@ -623,10 +598,10 @@ void ComplexParOperator::AddMult(const ComplexVector &x, ComplexVector &y,
   // Apply the operator on the L-vector.
   A->Mult(lx, ly);
 
+  auto &ty = test_fespace.GetTVector<ComplexVector>();
+  RestrictionMatrixMult(ly, ty);
   if (dbc_tdof_list)
   {
-    auto &ty = test_fespace.GetTVector<ComplexVector>();
-    RestrictionMatrixMult(ly, ty);
     if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE)
     {
       linalg::SetSubVector(ty, *dbc_tdof_list, x);
@@ -635,16 +610,8 @@ void ComplexParOperator::AddMult(const ComplexVector &x, ComplexVector &y,
     {
       linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
     }
-    y.AXPY(a, ty);
-  }
-  else
-  {
-    if (a != 1.0)
-    {
-      ly *= a;
-    }
-    RestrictionMatrixAddMult(ly, y);
   }
+  y.AXPY(a, ty);
 }
 
 void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector &y,
@@ -669,11 +636,11 @@ void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector
   // Apply the operator on the L-vector.
   A->MultTranspose(ly, lx);
 
+  auto &tx = trial_fespace.GetTVector<ComplexVector>();
+  trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real());
+  trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag());
   if (dbc_tdof_list)
   {
-    auto &tx = trial_fespace.GetTVector<ComplexVector>();
-    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real());
-    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag());
     if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE)
     {
       linalg::SetSubVector(tx, *dbc_tdof_list, x);
@@ -682,17 +649,8 @@ void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector
     {
       linalg::SetSubVector(tx, *dbc_tdof_list, 0.0);
     }
-    y.AXPY(a, tx);
-  }
-  else
-  {
-    if (a != 1.0)
-    {
-      lx *= a;
-    }
-    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Real(), y.Real());
-    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Imag(), y.Imag());
   }
+  y.AXPY(a, tx);
 }
 
 void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y,
@@ -717,11 +675,11 @@ void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, Compl
   // Apply the operator on the L-vector.
   A->MultHermitianTranspose(ly, lx);
 
+  auto &tx = trial_fespace.GetTVector<ComplexVector>();
+  trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real());
+  trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag());
   if (dbc_tdof_list)
   {
-    auto &tx = trial_fespace.GetTVector<ComplexVector>();
-    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real());
-    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag());
     if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE)
     {
       linalg::SetSubVector(tx, *dbc_tdof_list, x);
@@ -730,17 +688,8 @@ void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, Compl
     {
       linalg::SetSubVector(tx, *dbc_tdof_list, 0.0);
     }
-    y.AXPY(a, tx);
-  }
-  else
-  {
-    if (a != 1.0)
-    {
-      lx *= a;
-    }
-    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Real(), y.Real());
-    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Imag(), y.Imag());
   }
+  y.AXPY(a, tx);
 }
 
 void ComplexParOperator::RestrictionMatrixMult(const ComplexVector &ly,
@@ -758,21 +707,6 @@ void ComplexParOperator::RestrictionMatrixMult(const ComplexVector &ly,
   }
 }
 
-void ComplexParOperator::RestrictionMatrixAddMult(const ComplexVector &ly,
-                                                  ComplexVector &ty) const
-{
-  if (!use_R)
-  {
-    test_fespace.GetProlongationMatrix()->AddMultTranspose(ly.Real(), ty.Real());
-    test_fespace.GetProlongationMatrix()->AddMultTranspose(ly.Imag(), ty.Imag());
-  }
-  else
-  {
-    test_fespace.GetRestrictionMatrix()->AddMult(ly.Real(), ty.Real());
-    test_fespace.GetRestrictionMatrix()->AddMult(ly.Imag(), ty.Imag());
-  }
-}
-
 void ComplexParOperator::RestrictionMatrixMultTranspose(const ComplexVector &ty,
                                                         ComplexVector &ly) const
 {
diff --git a/palace/linalg/rap.hpp b/palace/linalg/rap.hpp
index 2ee99d3e0..4c9d36174 100644
--- a/palace/linalg/rap.hpp
+++ b/palace/linalg/rap.hpp
@@ -43,7 +43,6 @@ class ParOperator : public Operator
 
   // Helper methods for operator application.
   void RestrictionMatrixMult(const Vector &ly, Vector &ty) const;
-  void RestrictionMatrixAddMult(const Vector &ly, Vector &ty) const;
   void RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) const;
   Vector &GetTestLVector() const;
 
@@ -130,7 +129,6 @@ class ComplexParOperator : public ComplexOperator
 
   // Helper methods for operator application.
   void RestrictionMatrixMult(const ComplexVector &ly, ComplexVector &ty) const;
-  void RestrictionMatrixAddMult(const ComplexVector &ly, ComplexVector &ty) const;
   void RestrictionMatrixMultTranspose(const ComplexVector &ty, ComplexVector &ly) const;
   ComplexVector &GetTestLVector() const;
 

From 3323ddabf08088329b1d2c6da235aafb866e32ce Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Mon, 5 Feb 2024 10:12:30 -0800
Subject: [PATCH 7/7] Rework libCEED quadrature point geometry data to populate
 element attributes directly in QFunction

---
 palace/fem/libceed/integrator.cpp     |   7 +-
 palace/fem/libceed/integrator.hpp     |   4 +-
 palace/fem/mesh.cpp                   | 103 +++++++++++---------------
 palace/fem/qfunctions/21/geom_21_qf.h |  12 +--
 palace/fem/qfunctions/22/geom_22_qf.h |  16 ++--
 palace/fem/qfunctions/32/geom_32_qf.h |  20 ++---
 palace/fem/qfunctions/33/geom_33_qf.h |  26 +++----
 palace/fem/qfunctions/geom_qf.h       |   5 +-
 8 files changed, 94 insertions(+), 99 deletions(-)

diff --git a/palace/fem/libceed/integrator.cpp b/palace/fem/libceed/integrator.cpp
index a87507b5c..14d644b13 100644
--- a/palace/fem/libceed/integrator.cpp
+++ b/palace/fem/libceed/integrator.cpp
@@ -346,7 +346,9 @@ int CeedGeometryDataGetSpaceDimension(CeedElemRestriction geom_data_restr, CeedI
 
 void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
                               CeedBasis mesh_basis, CeedVector mesh_nodes,
-                              CeedVector geom_data, CeedElemRestriction geom_data_restr)
+                              CeedElemRestriction attr_restr, CeedBasis attr_basis,
+                              CeedVector elem_attr, CeedVector geom_data,
+                              CeedElemRestriction geom_data_restr)
 {
   CeedInt dim, space_dim, num_elem, num_qpts;
   PalaceCeedCall(ceed, CeedBasisGetDimension(mesh_basis, &dim));
@@ -389,6 +391,7 @@ void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
   }
 
   // Inputs
+  PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "attr", 1, CEED_EVAL_INTERP));
   PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "q_w", 1, CEED_EVAL_WEIGHT));
   PalaceCeedCall(
       ceed, CeedQFunctionAddInput(build_qf, "grad_x", space_dim * dim, CEED_EVAL_GRAD));
@@ -409,6 +412,8 @@ void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
   PalaceCeedCall(ceed, CeedOperatorCreate(ceed, build_qf, nullptr, nullptr, &build_op));
   PalaceCeedCall(ceed, CeedQFunctionDestroy(&build_qf));
 
+  PalaceCeedCall(ceed,
+                 CeedOperatorSetField(build_op, "attr", attr_restr, attr_basis, elem_attr));
   PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "q_w", CEED_ELEMRESTRICTION_NONE,
                                             mesh_basis, CEED_VECTOR_NONE));
   PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "grad_x", mesh_restr, mesh_basis,
diff --git a/palace/fem/libceed/integrator.hpp b/palace/fem/libceed/integrator.hpp
index 20161ac37..25adeea51 100644
--- a/palace/fem/libceed/integrator.hpp
+++ b/palace/fem/libceed/integrator.hpp
@@ -53,7 +53,9 @@ int CeedGeometryDataGetSpaceDimension(CeedElemRestriction geom_data_restr, CeedI
 // libCEED operator.
 void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
                               CeedBasis mesh_basis, CeedVector mesh_nodes,
-                              CeedVector geom_data, CeedElemRestriction geom_data_restr);
+                              CeedElemRestriction attr_restr, CeedBasis attr_basis,
+                              CeedVector elem_attr, CeedVector geom_data,
+                              CeedElemRestriction geom_data_restr);
 
 // Construct libCEED operator using the given quadrature data, element restriction, and
 // basis objects.
diff --git a/palace/fem/mesh.cpp b/palace/fem/mesh.cpp
index 4ac1e29d5..4067b48c7 100644
--- a/palace/fem/mesh.cpp
+++ b/palace/fem/mesh.cpp
@@ -3,8 +3,6 @@
 
 #include "mesh.hpp"
 
-#include <ceed/backend.h>
-#include <mfem/general/forall.hpp>
 #include "fem/coefficient.hpp"
 #include "fem/fespace.hpp"
 #include "fem/libceed/integrator.hpp"
@@ -139,37 +137,8 @@ auto GetElementIndices(const mfem::ParMesh &mesh, bool use_bdr, int start, int s
   return element_indices;
 }
 
-void AssembleCeedAttributeData(const mfem::Array<int> &attr, int Q, Ceed ceed,
-                               CeedVector geom_data, CeedElemRestriction geom_data_restr)
-{
-  // The data for quadrature point i, component j, element k is found at index
-  // i * layout[0] + j * layout[1] + k * layout[2]. This is also correctly a non-OpenMP loop
-  // when executed on the CPU (OpenMP is handled in outer scope).
-  CeedMemType mem;
-  CeedInt layout[3];
-  CeedScalar *geom_data_array;
-  PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem));
-  if (!mfem::Device::Allows(mfem::Backend::DEVICE_MASK))
-  {
-    mem = CEED_MEM_HOST;
-  }
-  PalaceCeedCall(ceed, CeedElemRestrictionGetELayout(geom_data_restr, &layout));
-  PalaceCeedCall(ceed, CeedVectorGetArray(geom_data, mem, &geom_data_array));
-  const auto *d_attr = attr.Read(mem == CEED_MEM_DEVICE);
-  mfem::forall_switch(mem == CEED_MEM_DEVICE, attr.Size() * Q,
-                      [=] MFEM_HOST_DEVICE(int l)
-                      {
-                        int k = l / Q;
-                        int i = l % Q;
-                        geom_data_array[i * layout[0] + k * layout[2]] = d_attr[k];
-                      });
-  CeedVectorRestoreArray(geom_data, &geom_data_array);
-}
-
-template <typename T>
-auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed,
-                          mfem::Geometry::Type geom, std::vector<int> &indices,
-                          T GetCeedAttribute)
+auto AssembleGeometryData(Ceed ceed, mfem::Geometry::Type geom, std::vector<int> &indices,
+                          const mfem::GridFunction &mesh_nodes, const Vector &elem_attr)
 {
   const mfem::FiniteElementSpace &mesh_fespace = *mesh_nodes.FESpace();
   const mfem::Mesh &mesh = *mesh_fespace.GetMesh();
@@ -180,13 +149,33 @@ auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed,
   data.indices = std::move(indices);
   const std::size_t num_elem = data.indices.size();
 
-  // Allocate storage for geometry factor data (stored as attribute + quadrature weight +
-  // Jacobian, column-major).
+  // Construct mesh node element restriction and basis.
   CeedElemRestriction mesh_restr =
       FiniteElementSpace::BuildCeedElemRestriction(mesh_fespace, ceed, geom, data.indices);
   CeedBasis mesh_basis = FiniteElementSpace::BuildCeedBasis(mesh_fespace, ceed, geom);
-  CeedInt num_qpts, geom_data_size = 2 + data.space_dim * data.dim;
+  CeedVector mesh_nodes_vec;
+  ceed::InitCeedVector(mesh_nodes, ceed, &mesh_nodes_vec);
+  CeedInt num_qpts;
   PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(mesh_basis, &num_qpts));
+
+  // Construct element attribute element restriction and basis.
+  CeedElemRestriction attr_restr;
+  CeedBasis attr_basis;
+  PalaceCeedCall(ceed, CeedElemRestrictionCreateStrided(ceed, num_elem, 1, 1, num_elem,
+                                                        CEED_STRIDES_BACKEND, &attr_restr));
+  {
+    Vector Bt(num_qpts);
+    Bt = 1.0;
+    PalaceCeedCall(ceed,
+                   CeedBasisCreateH1(ceed, ceed::GetCeedTopology(geom), 1, 1, num_qpts,
+                                     Bt.GetData(), nullptr, nullptr, nullptr, &attr_basis));
+  }
+  CeedVector elem_attr_vec;
+  ceed::InitCeedVector(elem_attr, ceed, &elem_attr_vec);
+
+  // Allocate storage for geometry factor data (stored as attribute + quadrature weight +
+  // Jacobian, column-major).
+  CeedInt geom_data_size = 2 + data.space_dim * data.dim;
   PalaceCeedCall(
       ceed, CeedVectorCreate(ceed, num_elem * num_qpts * geom_data_size, &data.geom_data));
   PalaceCeedCall(
@@ -195,27 +184,15 @@ auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed,
                                              CEED_STRIDES_BACKEND, &data.geom_data_restr));
 
   // Compute the required geometry factors at quadrature points.
-  CeedVector mesh_nodes_vec;
-  ceed::InitCeedVector(mesh_nodes, ceed, &mesh_nodes_vec);
-
-  ceed::AssembleCeedGeometryData(ceed, mesh_restr, mesh_basis, mesh_nodes_vec,
-                                 data.geom_data, data.geom_data_restr);
-
+  ceed::AssembleCeedGeometryData(ceed, mesh_restr, mesh_basis, mesh_nodes_vec, attr_restr,
+                                 attr_basis, elem_attr_vec, data.geom_data,
+                                 data.geom_data_restr);
   PalaceCeedCall(ceed, CeedVectorDestroy(&mesh_nodes_vec));
   PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&mesh_restr));
   PalaceCeedCall(ceed, CeedBasisDestroy(&mesh_basis));
-
-  // Set the element attribute quadrature data. All inputs to a QFunction require the same
-  // number of quadrature points, so we store the attribute at each quadrature point. This
-  // is the first component of the quadrature data.
-  {
-    mfem::Array<int> attr(num_elem);
-    for (std::size_t k = 0; k < num_elem; k++)
-    {
-      attr[k] = GetCeedAttribute(data.indices[k]);
-    }
-    AssembleCeedAttributeData(attr, num_qpts, ceed, data.geom_data, data.geom_data_restr);
-  }
+  PalaceCeedCall(ceed, CeedVectorDestroy(&elem_attr_vec));
+  PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&attr_restr));
+  PalaceCeedCall(ceed, CeedBasisDestroy(&attr_basis));
 
   return data;
 }
@@ -281,8 +258,13 @@ auto BuildCeedGeomFactorData(
     }();
     for (auto &[geom, indices] : element_indices)
     {
-      geom_data_map.emplace(geom, AssembleGeometryData(*mesh.GetNodes(), ceed, geom,
-                                                       indices, GetCeedAttribute));
+      Vector elem_attr(indices.size());
+      for (std::size_t k = 0; k < indices.size(); k++)
+      {
+        elem_attr[k] = GetCeedAttribute(indices[k]);
+      }
+      geom_data_map.emplace(
+          geom, AssembleGeometryData(ceed, geom, indices, *mesh.GetNodes(), elem_attr));
     }
   }
 
@@ -307,8 +289,13 @@ auto BuildCeedGeomFactorData(
     };
     for (auto &[geom, indices] : element_indices)
     {
-      geom_data_map.emplace(geom, AssembleGeometryData(*mesh.GetNodes(), ceed, geom,
-                                                       indices, GetCeedAttribute));
+      Vector elem_attr(indices.size());
+      for (std::size_t k = 0; k < indices.size(); k++)
+      {
+        elem_attr[k] = GetCeedAttribute(indices[k]);
+      }
+      geom_data_map.emplace(
+          geom, AssembleGeometryData(ceed, geom, indices, *mesh.GetNodes(), elem_attr));
     }
   }
 
diff --git a/palace/fem/qfunctions/21/geom_21_qf.h b/palace/fem/qfunctions/21/geom_21_qf.h
index a6e9c1f1e..ce6311270 100644
--- a/palace/fem/qfunctions/21/geom_21_qf.h
+++ b/palace/fem/qfunctions/21/geom_21_qf.h
@@ -9,8 +9,8 @@
 CEED_QFUNCTION(f_build_geom_factor_21)(void *, CeedInt Q, const CeedScalar *const *in,
                                        CeedScalar *const *out)
 {
-  const CeedScalar *qw = in[0], *J = in[1];
-  CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
+  const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
+  CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
   {
@@ -18,10 +18,10 @@ CEED_QFUNCTION(f_build_geom_factor_21)(void *, CeedInt Q, const CeedScalar *cons
     MatUnpack21(J + i, Q, J_loc);
     const CeedScalar detJ = AdjJt21<true>(J_loc, adjJt_loc);
 
-    attr[i] = 0;
-    wdetJ[i] = qw[i] * detJ;
-    adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
-    adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
+    qd_attr[i] = attr[i];
+    qd_wdetJ[i] = qw[i] * detJ;
+    qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
+    qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
   }
   return 0;
 }
diff --git a/palace/fem/qfunctions/22/geom_22_qf.h b/palace/fem/qfunctions/22/geom_22_qf.h
index 55cb36901..f53e9923d 100644
--- a/palace/fem/qfunctions/22/geom_22_qf.h
+++ b/palace/fem/qfunctions/22/geom_22_qf.h
@@ -9,8 +9,8 @@
 CEED_QFUNCTION(f_build_geom_factor_22)(void *, CeedInt Q, const CeedScalar *const *in,
                                        CeedScalar *const *out)
 {
-  const CeedScalar *qw = in[0], *J = in[1];
-  CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
+  const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
+  CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
   {
@@ -18,12 +18,12 @@ CEED_QFUNCTION(f_build_geom_factor_22)(void *, CeedInt Q, const CeedScalar *cons
     MatUnpack22(J + i, Q, J_loc);
     const CeedScalar detJ = AdjJt22<true>(J_loc, adjJt_loc);
 
-    attr[i] = 0;
-    wdetJ[i] = qw[i] * detJ;
-    adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
-    adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
-    adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
-    adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
+    qd_attr[i] = attr[i];
+    qd_wdetJ[i] = qw[i] * detJ;
+    qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
+    qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
+    qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
+    qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
   }
   return 0;
 }
diff --git a/palace/fem/qfunctions/32/geom_32_qf.h b/palace/fem/qfunctions/32/geom_32_qf.h
index cf7db5ee0..ab7c4eaaa 100644
--- a/palace/fem/qfunctions/32/geom_32_qf.h
+++ b/palace/fem/qfunctions/32/geom_32_qf.h
@@ -9,8 +9,8 @@
 CEED_QFUNCTION(f_build_geom_factor_32)(void *, CeedInt Q, const CeedScalar *const *in,
                                        CeedScalar *const *out)
 {
-  const CeedScalar *qw = in[0], *J = in[1];
-  CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
+  const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
+  CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
   {
@@ -18,14 +18,14 @@ CEED_QFUNCTION(f_build_geom_factor_32)(void *, CeedInt Q, const CeedScalar *cons
     MatUnpack32(J + i, Q, J_loc);
     const CeedScalar detJ = AdjJt32<true>(J_loc, adjJt_loc);
 
-    attr[i] = 0;
-    wdetJ[i] = qw[i] * detJ;
-    adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
-    adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
-    adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
-    adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
-    adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
-    adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
+    qd_attr[i] = attr[i];
+    qd_wdetJ[i] = qw[i] * detJ;
+    qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
+    qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
+    qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
+    qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
+    qd_adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
+    qd_adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
   }
   return 0;
 }
diff --git a/palace/fem/qfunctions/33/geom_33_qf.h b/palace/fem/qfunctions/33/geom_33_qf.h
index 213a4d1b3..21f81350e 100644
--- a/palace/fem/qfunctions/33/geom_33_qf.h
+++ b/palace/fem/qfunctions/33/geom_33_qf.h
@@ -9,8 +9,8 @@
 CEED_QFUNCTION(f_build_geom_factor_33)(void *, CeedInt Q, const CeedScalar *const *in,
                                        CeedScalar *const *out)
 {
-  const CeedScalar *qw = in[0], *J = in[1];
-  CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
+  const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
+  CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
   {
@@ -18,17 +18,17 @@ CEED_QFUNCTION(f_build_geom_factor_33)(void *, CeedInt Q, const CeedScalar *cons
     MatUnpack33(J + i, Q, J_loc);
     const CeedScalar detJ = AdjJt33<true>(J_loc, adjJt_loc);
 
-    attr[i] = 0;
-    wdetJ[i] = qw[i] * detJ;
-    adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
-    adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
-    adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
-    adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
-    adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
-    adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
-    adjJt[i + Q * 6] = adjJt_loc[6] / detJ;
-    adjJt[i + Q * 7] = adjJt_loc[7] / detJ;
-    adjJt[i + Q * 8] = adjJt_loc[8] / detJ;
+    qd_attr[i] = attr[i];
+    qd_wdetJ[i] = qw[i] * detJ;
+    qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
+    qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
+    qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
+    qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
+    qd_adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
+    qd_adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
+    qd_adjJt[i + Q * 6] = adjJt_loc[6] / detJ;
+    qd_adjJt[i + Q * 7] = adjJt_loc[7] / detJ;
+    qd_adjJt[i + Q * 8] = adjJt_loc[8] / detJ;
   }
   return 0;
 }
diff --git a/palace/fem/qfunctions/geom_qf.h b/palace/fem/qfunctions/geom_qf.h
index dfbc33817..dc08248cf 100644
--- a/palace/fem/qfunctions/geom_qf.h
+++ b/palace/fem/qfunctions/geom_qf.h
@@ -6,8 +6,9 @@
 
 // libCEED QFunction for building geometry factors for integration and transformations.
 // At every quadrature point, compute qw * det(J) and adj(J)^T / |J| and store the result.
-// in[0] is quadrature weights, shape [Q]
-// in[1] is Jacobians, shape [qcomp=dim, ncomp=space_dim, Q]
+// in[0] is element attributes, shape [Q]
+// in[1] is quadrature weights, shape [Q]
+// in[2] is Jacobians, shape [qcomp=dim, ncomp=space_dim, Q]
 // out[0] is quadrature data, stored as {attribute, Jacobian determinant, (transpose)
 //        adjugate Jacobian} quadrature data, shape [ncomp=2+space_dim*dim, Q]