From a078f35f0bbff27705cae19a443f1451b3ecc772 Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Tue, 30 Jan 2024 19:09:58 -0800 Subject: [PATCH 1/7] Improve libCEED quadrature point geometry data construction (avoid copying entire data from device to host and back to device) --- palace/fem/mesh.cpp | 71 +++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/palace/fem/mesh.cpp b/palace/fem/mesh.cpp index aaac29d86..4ac1e29d5 100644 --- a/palace/fem/mesh.cpp +++ b/palace/fem/mesh.cpp @@ -3,6 +3,8 @@ #include "mesh.hpp" +#include +#include #include "fem/coefficient.hpp" #include "fem/fespace.hpp" #include "fem/libceed/integrator.hpp" @@ -137,6 +139,33 @@ auto GetElementIndices(const mfem::ParMesh &mesh, bool use_bdr, int start, int s return element_indices; } +void AssembleCeedAttributeData(const mfem::Array &attr, int Q, Ceed ceed, + CeedVector geom_data, CeedElemRestriction geom_data_restr) +{ + // The data for quadrature point i, component j, element k is found at index + // i * layout[0] + j * layout[1] + k * layout[2]. This is also correctly a non-OpenMP loop + // when executed on the CPU (OpenMP is handled in outer scope). + CeedMemType mem; + CeedInt layout[3]; + CeedScalar *geom_data_array; + PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem)); + if (!mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) + { + mem = CEED_MEM_HOST; + } + PalaceCeedCall(ceed, CeedElemRestrictionGetELayout(geom_data_restr, &layout)); + PalaceCeedCall(ceed, CeedVectorGetArray(geom_data, mem, &geom_data_array)); + const auto *d_attr = attr.Read(mem == CEED_MEM_DEVICE); + mfem::forall_switch(mem == CEED_MEM_DEVICE, attr.Size() * Q, + [=] MFEM_HOST_DEVICE(int l) + { + int k = l / Q; + int i = l % Q; + geom_data_array[i * layout[0] + k * layout[2]] = d_attr[k]; + }); + CeedVectorRestoreArray(geom_data, &geom_data_array); +} + template auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed, mfem::Geometry::Type geom, std::vector &indices, @@ -160,30 +189,10 @@ auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed, PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(mesh_basis, &num_qpts)); PalaceCeedCall( ceed, CeedVectorCreate(ceed, num_elem * num_qpts * geom_data_size, &data.geom_data)); - - // Data for quadrature point i, component j, element k is found at index i * strides[0] + - // j * strides[1] + k * strides[2]. - CeedMemType mem; - CeedInt strides[3]; - PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem)); - if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE) - { - // GPU backends have CEED_STRIDES_BACKEND = {1, num_elem * num_qpts, num_qpts}. - strides[0] = 1; - strides[1] = num_elem * num_qpts; - strides[2] = num_qpts; - } - else - { - // CPU backends have CEED_STRIDES_BACKEND = {1, num_qpts, num_qpts * geom_data_size}. - strides[0] = 1; - strides[1] = num_qpts; - strides[2] = num_qpts * geom_data_size; - } - PalaceCeedCall(ceed, - CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, geom_data_size, - num_elem * num_qpts * geom_data_size, - strides, &data.geom_data_restr)); + PalaceCeedCall( + ceed, CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, geom_data_size, + num_elem * num_qpts * geom_data_size, + CEED_STRIDES_BACKEND, &data.geom_data_restr)); // Compute the required geometry factors at quadrature points. CeedVector mesh_nodes_vec; @@ -196,22 +205,16 @@ auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed, PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&mesh_restr)); PalaceCeedCall(ceed, CeedBasisDestroy(&mesh_basis)); - // Compute element attribute quadrature data. All inputs to a QFunction require the same + // Set the element attribute quadrature data. All inputs to a QFunction require the same // number of quadrature points, so we store the attribute at each quadrature point. This // is the first component of the quadrature data. { - CeedScalar *geom_data_array; - PalaceCeedCall(ceed, - CeedVectorGetArray(data.geom_data, CEED_MEM_HOST, &geom_data_array)); + mfem::Array attr(num_elem); for (std::size_t k = 0; k < num_elem; k++) { - const auto attr = GetCeedAttribute(data.indices[k]); - for (CeedInt i = 0; i < num_qpts; i++) - { - geom_data_array[i * strides[0] + k * strides[2]] = attr; - } + attr[k] = GetCeedAttribute(data.indices[k]); } - CeedVectorRestoreArray(data.geom_data, &geom_data_array); + AssembleCeedAttributeData(attr, num_qpts, ceed, data.geom_data, data.geom_data_restr); } return data; From 01e7606cae7316b65dac01c9df4e06fc71ae2d28 Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Tue, 30 Jan 2024 19:11:09 -0800 Subject: [PATCH 2/7] Never assemble operator quadrature data explicitly (memory savings is more significant than minor QFunction speedup) --- palace/models/curlcurloperator.cpp | 2 +- palace/models/laplaceoperator.cpp | 2 +- palace/models/spaceoperator.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/palace/models/curlcurloperator.cpp b/palace/models/curlcurloperator.cpp index 8a30ec005..3b544c90a 100644 --- a/palace/models/curlcurloperator.cpp +++ b/palace/models/curlcurloperator.cpp @@ -163,7 +163,7 @@ std::unique_ptr CurlCurlOperator::GetStiffnessMatrix() mat_op.GetInvPermeability()); BilinearForm k(GetNDSpace()); k.AddDomainIntegrator(muinv_func); - k.AssembleQuadratureData(); + // k.AssembleQuadratureData(); auto k_vec = k.Assemble(GetNDSpaces(), skip_zeros); auto K = std::make_unique(GetNDSpaces().GetNumLevels()); for (std::size_t l = 0; l < GetNDSpaces().GetNumLevels(); l++) diff --git a/palace/models/laplaceoperator.cpp b/palace/models/laplaceoperator.cpp index 509067e63..c110a0726 100644 --- a/palace/models/laplaceoperator.cpp +++ b/palace/models/laplaceoperator.cpp @@ -184,7 +184,7 @@ std::unique_ptr LaplaceOperator::GetStiffnessMatrix() mat_op.GetPermittivityReal()); BilinearForm k(GetH1Space()); k.AddDomainIntegrator(epsilon_func); - k.AssembleQuadratureData(); + // k.AssembleQuadratureData(); auto k_vec = k.Assemble(GetH1Spaces(), skip_zeros); auto K = std::make_unique(GetH1Spaces().GetNumLevels()); for (std::size_t l = 0; l < GetH1Spaces().GetNumLevels(); l++) diff --git a/palace/models/spaceoperator.cpp b/palace/models/spaceoperator.cpp index 631c669f8..47135b26b 100644 --- a/palace/models/spaceoperator.cpp +++ b/palace/models/spaceoperator.cpp @@ -694,7 +694,7 @@ std::unique_ptr SpaceOperator::GetPreconditionerMatrix(double a0, doub const auto n_levels = GetNDSpaces().GetNumLevels(); std::vector> br_vec(n_levels), bi_vec(n_levels), br_aux_vec(n_levels), bi_aux_vec(n_levels); - constexpr bool skip_zeros = false, assemble_q_data = true; + constexpr bool skip_zeros = false, assemble_q_data = false; if (std::is_same::value && !pc_mat_real) { MaterialPropertyCoefficient dfr(mat_op.MaxCeedAttribute()), From 2a5211456c3aff9c134c506f959e6418a3cdcad5 Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Thu, 28 Dec 2023 09:37:54 -0800 Subject: [PATCH 3/7] Only use OpenMP with libCEED when not using GPU backends --- palace/fem/libceed/ceed.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/palace/fem/libceed/ceed.cpp b/palace/fem/libceed/ceed.cpp index 43c555eb4..abe25f4e2 100644 --- a/palace/fem/libceed/ceed.cpp +++ b/palace/fem/libceed/ceed.cpp @@ -3,6 +3,7 @@ #include "ceed.hpp" +#include #include "utils/omp.hpp" #if defined(MFEM_USE_OPENMP) @@ -31,7 +32,9 @@ void Initialize(const char *resource, const char *jit_source_dir) PalacePragmaOmp(master) { #if defined(MFEM_USE_OPENMP) - const int nt = omp_get_num_threads(); + // Only parallelize libCEED operators over threads when not using the GPU. + const int nt = + !std::string_view(resource).compare(0, 4, "/cpu") ? omp_get_num_threads() : 1; #else const int nt = 1; #endif From 0414d748f09a747e3fc2c2cb95721d275804b8af Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Tue, 2 Jan 2024 09:36:17 -0800 Subject: [PATCH 4/7] Avoid constructing discrete gradient matrix on coarsest mesh if not needed --- palace/fem/fespace.hpp | 3 ++- palace/fem/libceed/basis.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/palace/fem/fespace.hpp b/palace/fem/fespace.hpp index f7fbd3033..db3e4e490 100644 --- a/palace/fem/fespace.hpp +++ b/palace/fem/fespace.hpp @@ -300,7 +300,8 @@ class AuxiliaryFiniteElementSpaceHierarchy std::vector GetDiscreteInterpolators() const { std::vector G_(GetNumLevels()); - for (std::size_t l = 0; l < G_.size(); l++) + G_[0] = nullptr; // No discrete interpolator for coarsest level + for (std::size_t l = 1; l < G_.size(); l++) { G_[l] = &GetDiscreteInterpolatorAtLevel(l); } diff --git a/palace/fem/libceed/basis.cpp b/palace/fem/libceed/basis.cpp index 3623f91ec..672f3e97b 100644 --- a/palace/fem/libceed/basis.cpp +++ b/palace/fem/libceed/basis.cpp @@ -192,7 +192,7 @@ void InitInterpolatorBasis(const mfem::FiniteElement &trial_fe, if constexpr (false) { std::cout << "New interpolator basis (" << ceed << ", " << &trial_fe << ", " << &test_fe - << ")\n"; + << ", " << (trial_fe.GetMapType() == test_fe.GetMapType()) << ")\n"; } if constexpr (false) { From 0940b871bf116b9f2bbb6e9041c53e766e41e0d1 Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Tue, 30 Jan 2024 17:51:51 -0800 Subject: [PATCH 5/7] Add missing AddMult(Transpose) overrides --- palace/linalg/operator.cpp | 61 ++++++++++++++++++++++++++++++++++++++ palace/linalg/operator.hpp | 55 ++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) diff --git a/palace/linalg/operator.cpp b/palace/linalg/operator.cpp index 461f4b2f6..301bdf720 100644 --- a/palace/linalg/operator.cpp +++ b/palace/linalg/operator.cpp @@ -501,6 +501,41 @@ void BaseDiagonalOperator::Mult(const ComplexVector &x, }); } +template <> +void BaseDiagonalOperator::AddMult(const Vector &x, Vector &y, + const double a) const +{ + const int N = this->height; + const auto *D = d.Read(); + const auto *X = x.Read(); + auto *Y = y.Write(); + mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { Y[i] += a * D[i] * X[i]; }); +} + +template <> +void BaseDiagonalOperator::AddMult(const ComplexVector &x, + ComplexVector &y, + const std::complex a) const +{ + const int N = this->height; + const double ar = a.real(); + const double ai = a.imag(); + const auto *DR = d.Real().Read(); + const auto *DI = d.Imag().Read(); + const auto *XR = x.Real().Read(); + const auto *XI = x.Imag().Read(); + auto *YR = y.Real().Write(); + auto *YI = y.Imag().Write(); + mfem::forall(N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto tr = DR[i] * XR[i] - DI[i] * XI[i]; + const auto ti = DI[i] * XR[i] + DR[i] * XI[i]; + YR[i] += ar * tr - ai * ti; + YI[i] += ai * ti + ar * ti; + }); +} + template <> void DiagonalOperatorHelper, ComplexOperator>::MultHermitianTranspose(const ComplexVector &x, @@ -523,6 +558,32 @@ void DiagonalOperatorHelper, }); } +template <> +void DiagonalOperatorHelper, ComplexOperator>:: + AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a) const +{ + const ComplexVector &d = + static_cast *>(this)->d; + const int N = this->height; + const double ar = a.real(); + const double ai = a.imag(); + const auto *DR = d.Real().Read(); + const auto *DI = d.Imag().Read(); + const auto *XR = x.Real().Read(); + const auto *XI = x.Imag().Read(); + auto *YR = y.Real().Write(); + auto *YI = y.Imag().Write(); + mfem::forall(N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto tr = DR[i] * XR[i] + DI[i] * XI[i]; + const auto ti = -DI[i] * XR[i] + DR[i] * XI[i]; + YR[i] += ar * tr - ai * ti; + YI[i] += ai * ti + ar * ti; + }); +} + namespace linalg { diff --git a/palace/linalg/operator.hpp b/palace/linalg/operator.hpp index ab204c076..89f3890b9 100644 --- a/palace/linalg/operator.hpp +++ b/palace/linalg/operator.hpp @@ -161,6 +161,16 @@ class ProductOperatorHelper : public ComplexOp A.MultHermitianTranspose(x, z); B.MultHermitianTranspose(z, y); } + + void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const override + { + const ComplexOperator &A = static_cast(this)->A; + const ComplexOperator &B = static_cast(this)->B; + ComplexVector &z = static_cast(this)->z; + A.MultHermitianTranspose(x, z); + B.AddMultHermitianTranspose(z, y, a); + } }; template @@ -171,6 +181,9 @@ class BaseProductOperator using VecType = typename std::conditional::value, ComplexVector, Vector>::type; + using ScalarType = + typename std::conditional::value, + std::complex, double>::type; private: const OperType &A, &B; @@ -194,6 +207,19 @@ class BaseProductOperator A.MultTranspose(x, z); B.MultTranspose(z, y); } + + void AddMult(const VecType &x, VecType &y, const ScalarType a = 1.0) const override + { + B.Mult(x, z); + A.AddMult(z, y, a); + } + + void AddMultTranspose(const VecType &x, VecType &y, + const ScalarType a = 1.0) const override + { + A.MultTranspose(x, z); + B.AddMultTranspose(z, y, a); + } }; using ProductOperator = BaseProductOperator; @@ -219,6 +245,9 @@ class DiagonalOperatorHelper : public Complex DiagonalOperatorHelper(int s) : ComplexOperator(s) {} void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const override; + + void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const override; }; template @@ -229,6 +258,9 @@ class BaseDiagonalOperator using VecType = typename std::conditional::value, ComplexVector, Vector>::type; + using ScalarType = + typename std::conditional::value, + std::complex, double>::type; private: const VecType &d; @@ -242,6 +274,14 @@ class BaseDiagonalOperator void Mult(const VecType &x, VecType &y) const override; void MultTranspose(const VecType &x, VecType &y) const override { Mult(x, y); } + + void AddMult(const VecType &x, VecType &y, const ScalarType a = 1.0) const override; + + void AddMultTranspose(const VecType &x, VecType &y, + const ScalarType a = 1.0) const override + { + AddMult(x, y, a); + } }; using DiagonalOperator = BaseDiagonalOperator; @@ -256,6 +296,9 @@ class BaseMultigridOperator : public OperType { using VecType = typename std::conditional::value, ComplexVector, Vector>::type; + using ScalarType = + typename std::conditional::value, + std::complex, double>::type; private: std::vector> ops, aux_ops; @@ -300,10 +343,22 @@ class BaseMultigridOperator : public OperType } void Mult(const VecType &x, VecType &y) const override { GetFinestOperator().Mult(x, y); } + void MultTranspose(const VecType &x, VecType &y) const override { GetFinestOperator().MultTranspose(x, y); } + + void AddMult(const VecType &x, VecType &y, const ScalarType a = 1.0) const override + { + GetFinestOperator().AddMult(x, y, a); + } + + void AddMultTranspose(const VecType &x, VecType &y, + const ScalarType a = 1.0) const override + { + GetFinestOperator().AddMultTranspose(x, y, a); + } }; using MultigridOperator = BaseMultigridOperator; From 40de187c6d7e4ad5686215b99d47a92f60592df7 Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Thu, 1 Feb 2024 18:18:49 -0800 Subject: [PATCH 6/7] Parallel prolongation and restriction sometimes do not have appropriate AddMult/AddMultTranspose overrides, causing temporary vector allocation So, fix it by handling this at the ParOperator level. --- palace/linalg/rap.cpp | 118 ++++++++++-------------------------------- palace/linalg/rap.hpp | 2 - 2 files changed, 26 insertions(+), 94 deletions(-) diff --git a/palace/linalg/rap.cpp b/palace/linalg/rap.cpp index 457f22511..4773ac9aa 100644 --- a/palace/linalg/rap.cpp +++ b/palace/linalg/rap.cpp @@ -52,18 +52,21 @@ void ParOperator::EliminateRHS(const Vector &x, Vector &b) const } MFEM_VERIFY(A, "No local matrix available for ParOperator::EliminateRHS!"); - auto &tx = trial_fespace.GetTVector(); auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - tx = 0.0; - linalg::SetSubVector(tx, *dbc_tdof_list, x); - trial_fespace.GetProlongationMatrix()->Mult(tx, lx); + { + auto &tx = trial_fespace.GetTVector(); + tx = 0.0; + linalg::SetSubVector(tx, *dbc_tdof_list, x); + trial_fespace.GetProlongationMatrix()->Mult(tx, lx); + } // Apply the unconstrained operator. A->Mult(lx, ly); - ly *= -1.0; - RestrictionMatrixAddMult(ly, b); + auto &ty = test_fespace.GetTVector(); + RestrictionMatrixMult(ly, ty); + b.Add(-1.0, ty); if (diag_policy == DiagonalPolicy::DIAG_ONE) { linalg::SetSubVector(b, *dbc_tdof_list, x); @@ -292,10 +295,10 @@ void ParOperator::AddMult(const Vector &x, Vector &y, const double a) const // Apply the operator on the L-vector. A->Mult(lx, ly); + auto &ty = test_fespace.GetTVector(); + RestrictionMatrixMult(ly, ty); if (dbc_tdof_list) { - auto &ty = test_fespace.GetTVector(); - RestrictionMatrixMult(ly, ty); if (diag_policy == DiagonalPolicy::DIAG_ONE) { linalg::SetSubVector(ty, *dbc_tdof_list, x); @@ -304,16 +307,8 @@ void ParOperator::AddMult(const Vector &x, Vector &y, const double a) const { linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); } - y.Add(a, ty); - } - else - { - if (a != 1.0) - { - ly *= a; - } - RestrictionMatrixAddMult(ly, y); } + y.Add(a, ty); } void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) const @@ -343,10 +338,10 @@ void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) c // Apply the operator on the L-vector. A->MultTranspose(ly, lx); + auto &tx = trial_fespace.GetTVector(); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx, tx); if (dbc_tdof_list) { - auto &tx = trial_fespace.GetTVector(); - trial_fespace.GetProlongationMatrix()->MultTranspose(lx, tx); if (diag_policy == DiagonalPolicy::DIAG_ONE) { linalg::SetSubVector(tx, *dbc_tdof_list, x); @@ -355,16 +350,8 @@ void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) c { linalg::SetSubVector(tx, *dbc_tdof_list, 0.0); } - y.Add(a, tx); - } - else - { - if (a != 1.0) - { - lx *= a; - } - trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx, y); } + y.Add(a, tx); } void ParOperator::RestrictionMatrixMult(const Vector &ly, Vector &ty) const @@ -379,18 +366,6 @@ void ParOperator::RestrictionMatrixMult(const Vector &ly, Vector &ty) const } } -void ParOperator::RestrictionMatrixAddMult(const Vector &ly, Vector &ty) const -{ - if (!use_R) - { - test_fespace.GetProlongationMatrix()->AddMultTranspose(ly, ty); - } - else - { - test_fespace.GetRestrictionMatrix()->AddMult(ly, ty); - } -} - void ParOperator::RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) const { if (!use_R) @@ -623,10 +598,10 @@ void ComplexParOperator::AddMult(const ComplexVector &x, ComplexVector &y, // Apply the operator on the L-vector. A->Mult(lx, ly); + auto &ty = test_fespace.GetTVector(); + RestrictionMatrixMult(ly, ty); if (dbc_tdof_list) { - auto &ty = test_fespace.GetTVector(); - RestrictionMatrixMult(ly, ty); if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) { linalg::SetSubVector(ty, *dbc_tdof_list, x); @@ -635,16 +610,8 @@ void ComplexParOperator::AddMult(const ComplexVector &x, ComplexVector &y, { linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); } - y.AXPY(a, ty); - } - else - { - if (a != 1.0) - { - ly *= a; - } - RestrictionMatrixAddMult(ly, y); } + y.AXPY(a, ty); } void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector &y, @@ -669,11 +636,11 @@ void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector // Apply the operator on the L-vector. A->MultTranspose(ly, lx); + auto &tx = trial_fespace.GetTVector(); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real()); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag()); if (dbc_tdof_list) { - auto &tx = trial_fespace.GetTVector(); - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real()); - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag()); if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) { linalg::SetSubVector(tx, *dbc_tdof_list, x); @@ -682,17 +649,8 @@ void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector { linalg::SetSubVector(tx, *dbc_tdof_list, 0.0); } - y.AXPY(a, tx); - } - else - { - if (a != 1.0) - { - lx *= a; - } - trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Real(), y.Real()); - trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Imag(), y.Imag()); } + y.AXPY(a, tx); } void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, @@ -717,11 +675,11 @@ void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, Compl // Apply the operator on the L-vector. A->MultHermitianTranspose(ly, lx); + auto &tx = trial_fespace.GetTVector(); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real()); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag()); if (dbc_tdof_list) { - auto &tx = trial_fespace.GetTVector(); - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real()); - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag()); if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) { linalg::SetSubVector(tx, *dbc_tdof_list, x); @@ -730,17 +688,8 @@ void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, Compl { linalg::SetSubVector(tx, *dbc_tdof_list, 0.0); } - y.AXPY(a, tx); - } - else - { - if (a != 1.0) - { - lx *= a; - } - trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Real(), y.Real()); - trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Imag(), y.Imag()); } + y.AXPY(a, tx); } void ComplexParOperator::RestrictionMatrixMult(const ComplexVector &ly, @@ -758,21 +707,6 @@ void ComplexParOperator::RestrictionMatrixMult(const ComplexVector &ly, } } -void ComplexParOperator::RestrictionMatrixAddMult(const ComplexVector &ly, - ComplexVector &ty) const -{ - if (!use_R) - { - test_fespace.GetProlongationMatrix()->AddMultTranspose(ly.Real(), ty.Real()); - test_fespace.GetProlongationMatrix()->AddMultTranspose(ly.Imag(), ty.Imag()); - } - else - { - test_fespace.GetRestrictionMatrix()->AddMult(ly.Real(), ty.Real()); - test_fespace.GetRestrictionMatrix()->AddMult(ly.Imag(), ty.Imag()); - } -} - void ComplexParOperator::RestrictionMatrixMultTranspose(const ComplexVector &ty, ComplexVector &ly) const { diff --git a/palace/linalg/rap.hpp b/palace/linalg/rap.hpp index 2ee99d3e0..4c9d36174 100644 --- a/palace/linalg/rap.hpp +++ b/palace/linalg/rap.hpp @@ -43,7 +43,6 @@ class ParOperator : public Operator // Helper methods for operator application. void RestrictionMatrixMult(const Vector &ly, Vector &ty) const; - void RestrictionMatrixAddMult(const Vector &ly, Vector &ty) const; void RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) const; Vector &GetTestLVector() const; @@ -130,7 +129,6 @@ class ComplexParOperator : public ComplexOperator // Helper methods for operator application. void RestrictionMatrixMult(const ComplexVector &ly, ComplexVector &ty) const; - void RestrictionMatrixAddMult(const ComplexVector &ly, ComplexVector &ty) const; void RestrictionMatrixMultTranspose(const ComplexVector &ty, ComplexVector &ly) const; ComplexVector &GetTestLVector() const; From 3323ddabf08088329b1d2c6da235aafb866e32ce Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Mon, 5 Feb 2024 10:12:30 -0800 Subject: [PATCH 7/7] Rework libCEED quadrature point geometry data to populate element attributes directly in QFunction --- palace/fem/libceed/integrator.cpp | 7 +- palace/fem/libceed/integrator.hpp | 4 +- palace/fem/mesh.cpp | 103 +++++++++++--------------- palace/fem/qfunctions/21/geom_21_qf.h | 12 +-- palace/fem/qfunctions/22/geom_22_qf.h | 16 ++-- palace/fem/qfunctions/32/geom_32_qf.h | 20 ++--- palace/fem/qfunctions/33/geom_33_qf.h | 26 +++---- palace/fem/qfunctions/geom_qf.h | 5 +- 8 files changed, 94 insertions(+), 99 deletions(-) diff --git a/palace/fem/libceed/integrator.cpp b/palace/fem/libceed/integrator.cpp index a87507b5c..14d644b13 100644 --- a/palace/fem/libceed/integrator.cpp +++ b/palace/fem/libceed/integrator.cpp @@ -346,7 +346,9 @@ int CeedGeometryDataGetSpaceDimension(CeedElemRestriction geom_data_restr, CeedI void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr, CeedBasis mesh_basis, CeedVector mesh_nodes, - CeedVector geom_data, CeedElemRestriction geom_data_restr) + CeedElemRestriction attr_restr, CeedBasis attr_basis, + CeedVector elem_attr, CeedVector geom_data, + CeedElemRestriction geom_data_restr) { CeedInt dim, space_dim, num_elem, num_qpts; PalaceCeedCall(ceed, CeedBasisGetDimension(mesh_basis, &dim)); @@ -389,6 +391,7 @@ void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr, } // Inputs + PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "attr", 1, CEED_EVAL_INTERP)); PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "q_w", 1, CEED_EVAL_WEIGHT)); PalaceCeedCall( ceed, CeedQFunctionAddInput(build_qf, "grad_x", space_dim * dim, CEED_EVAL_GRAD)); @@ -409,6 +412,8 @@ void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr, PalaceCeedCall(ceed, CeedOperatorCreate(ceed, build_qf, nullptr, nullptr, &build_op)); PalaceCeedCall(ceed, CeedQFunctionDestroy(&build_qf)); + PalaceCeedCall(ceed, + CeedOperatorSetField(build_op, "attr", attr_restr, attr_basis, elem_attr)); PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "q_w", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE)); PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "grad_x", mesh_restr, mesh_basis, diff --git a/palace/fem/libceed/integrator.hpp b/palace/fem/libceed/integrator.hpp index 20161ac37..25adeea51 100644 --- a/palace/fem/libceed/integrator.hpp +++ b/palace/fem/libceed/integrator.hpp @@ -53,7 +53,9 @@ int CeedGeometryDataGetSpaceDimension(CeedElemRestriction geom_data_restr, CeedI // libCEED operator. void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr, CeedBasis mesh_basis, CeedVector mesh_nodes, - CeedVector geom_data, CeedElemRestriction geom_data_restr); + CeedElemRestriction attr_restr, CeedBasis attr_basis, + CeedVector elem_attr, CeedVector geom_data, + CeedElemRestriction geom_data_restr); // Construct libCEED operator using the given quadrature data, element restriction, and // basis objects. diff --git a/palace/fem/mesh.cpp b/palace/fem/mesh.cpp index 4ac1e29d5..4067b48c7 100644 --- a/palace/fem/mesh.cpp +++ b/palace/fem/mesh.cpp @@ -3,8 +3,6 @@ #include "mesh.hpp" -#include -#include #include "fem/coefficient.hpp" #include "fem/fespace.hpp" #include "fem/libceed/integrator.hpp" @@ -139,37 +137,8 @@ auto GetElementIndices(const mfem::ParMesh &mesh, bool use_bdr, int start, int s return element_indices; } -void AssembleCeedAttributeData(const mfem::Array &attr, int Q, Ceed ceed, - CeedVector geom_data, CeedElemRestriction geom_data_restr) -{ - // The data for quadrature point i, component j, element k is found at index - // i * layout[0] + j * layout[1] + k * layout[2]. This is also correctly a non-OpenMP loop - // when executed on the CPU (OpenMP is handled in outer scope). - CeedMemType mem; - CeedInt layout[3]; - CeedScalar *geom_data_array; - PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem)); - if (!mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) - { - mem = CEED_MEM_HOST; - } - PalaceCeedCall(ceed, CeedElemRestrictionGetELayout(geom_data_restr, &layout)); - PalaceCeedCall(ceed, CeedVectorGetArray(geom_data, mem, &geom_data_array)); - const auto *d_attr = attr.Read(mem == CEED_MEM_DEVICE); - mfem::forall_switch(mem == CEED_MEM_DEVICE, attr.Size() * Q, - [=] MFEM_HOST_DEVICE(int l) - { - int k = l / Q; - int i = l % Q; - geom_data_array[i * layout[0] + k * layout[2]] = d_attr[k]; - }); - CeedVectorRestoreArray(geom_data, &geom_data_array); -} - -template -auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed, - mfem::Geometry::Type geom, std::vector &indices, - T GetCeedAttribute) +auto AssembleGeometryData(Ceed ceed, mfem::Geometry::Type geom, std::vector &indices, + const mfem::GridFunction &mesh_nodes, const Vector &elem_attr) { const mfem::FiniteElementSpace &mesh_fespace = *mesh_nodes.FESpace(); const mfem::Mesh &mesh = *mesh_fespace.GetMesh(); @@ -180,13 +149,33 @@ auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed, data.indices = std::move(indices); const std::size_t num_elem = data.indices.size(); - // Allocate storage for geometry factor data (stored as attribute + quadrature weight + - // Jacobian, column-major). + // Construct mesh node element restriction and basis. CeedElemRestriction mesh_restr = FiniteElementSpace::BuildCeedElemRestriction(mesh_fespace, ceed, geom, data.indices); CeedBasis mesh_basis = FiniteElementSpace::BuildCeedBasis(mesh_fespace, ceed, geom); - CeedInt num_qpts, geom_data_size = 2 + data.space_dim * data.dim; + CeedVector mesh_nodes_vec; + ceed::InitCeedVector(mesh_nodes, ceed, &mesh_nodes_vec); + CeedInt num_qpts; PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(mesh_basis, &num_qpts)); + + // Construct element attribute element restriction and basis. + CeedElemRestriction attr_restr; + CeedBasis attr_basis; + PalaceCeedCall(ceed, CeedElemRestrictionCreateStrided(ceed, num_elem, 1, 1, num_elem, + CEED_STRIDES_BACKEND, &attr_restr)); + { + Vector Bt(num_qpts); + Bt = 1.0; + PalaceCeedCall(ceed, + CeedBasisCreateH1(ceed, ceed::GetCeedTopology(geom), 1, 1, num_qpts, + Bt.GetData(), nullptr, nullptr, nullptr, &attr_basis)); + } + CeedVector elem_attr_vec; + ceed::InitCeedVector(elem_attr, ceed, &elem_attr_vec); + + // Allocate storage for geometry factor data (stored as attribute + quadrature weight + + // Jacobian, column-major). + CeedInt geom_data_size = 2 + data.space_dim * data.dim; PalaceCeedCall( ceed, CeedVectorCreate(ceed, num_elem * num_qpts * geom_data_size, &data.geom_data)); PalaceCeedCall( @@ -195,27 +184,15 @@ auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed, CEED_STRIDES_BACKEND, &data.geom_data_restr)); // Compute the required geometry factors at quadrature points. - CeedVector mesh_nodes_vec; - ceed::InitCeedVector(mesh_nodes, ceed, &mesh_nodes_vec); - - ceed::AssembleCeedGeometryData(ceed, mesh_restr, mesh_basis, mesh_nodes_vec, - data.geom_data, data.geom_data_restr); - + ceed::AssembleCeedGeometryData(ceed, mesh_restr, mesh_basis, mesh_nodes_vec, attr_restr, + attr_basis, elem_attr_vec, data.geom_data, + data.geom_data_restr); PalaceCeedCall(ceed, CeedVectorDestroy(&mesh_nodes_vec)); PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&mesh_restr)); PalaceCeedCall(ceed, CeedBasisDestroy(&mesh_basis)); - - // Set the element attribute quadrature data. All inputs to a QFunction require the same - // number of quadrature points, so we store the attribute at each quadrature point. This - // is the first component of the quadrature data. - { - mfem::Array attr(num_elem); - for (std::size_t k = 0; k < num_elem; k++) - { - attr[k] = GetCeedAttribute(data.indices[k]); - } - AssembleCeedAttributeData(attr, num_qpts, ceed, data.geom_data, data.geom_data_restr); - } + PalaceCeedCall(ceed, CeedVectorDestroy(&elem_attr_vec)); + PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&attr_restr)); + PalaceCeedCall(ceed, CeedBasisDestroy(&attr_basis)); return data; } @@ -281,8 +258,13 @@ auto BuildCeedGeomFactorData( }(); for (auto &[geom, indices] : element_indices) { - geom_data_map.emplace(geom, AssembleGeometryData(*mesh.GetNodes(), ceed, geom, - indices, GetCeedAttribute)); + Vector elem_attr(indices.size()); + for (std::size_t k = 0; k < indices.size(); k++) + { + elem_attr[k] = GetCeedAttribute(indices[k]); + } + geom_data_map.emplace( + geom, AssembleGeometryData(ceed, geom, indices, *mesh.GetNodes(), elem_attr)); } } @@ -307,8 +289,13 @@ auto BuildCeedGeomFactorData( }; for (auto &[geom, indices] : element_indices) { - geom_data_map.emplace(geom, AssembleGeometryData(*mesh.GetNodes(), ceed, geom, - indices, GetCeedAttribute)); + Vector elem_attr(indices.size()); + for (std::size_t k = 0; k < indices.size(); k++) + { + elem_attr[k] = GetCeedAttribute(indices[k]); + } + geom_data_map.emplace( + geom, AssembleGeometryData(ceed, geom, indices, *mesh.GetNodes(), elem_attr)); } } diff --git a/palace/fem/qfunctions/21/geom_21_qf.h b/palace/fem/qfunctions/21/geom_21_qf.h index a6e9c1f1e..ce6311270 100644 --- a/palace/fem/qfunctions/21/geom_21_qf.h +++ b/palace/fem/qfunctions/21/geom_21_qf.h @@ -9,8 +9,8 @@ CEED_QFUNCTION(f_build_geom_factor_21)(void *, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - const CeedScalar *qw = in[0], *J = in[1]; - CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q; + const CeedScalar *attr = in[0], *qw = in[1], *J = in[2]; + CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { @@ -18,10 +18,10 @@ CEED_QFUNCTION(f_build_geom_factor_21)(void *, CeedInt Q, const CeedScalar *cons MatUnpack21(J + i, Q, J_loc); const CeedScalar detJ = AdjJt21(J_loc, adjJt_loc); - attr[i] = 0; - wdetJ[i] = qw[i] * detJ; - adjJt[i + Q * 0] = adjJt_loc[0] / detJ; - adjJt[i + Q * 1] = adjJt_loc[1] / detJ; + qd_attr[i] = attr[i]; + qd_wdetJ[i] = qw[i] * detJ; + qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ; + qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ; } return 0; } diff --git a/palace/fem/qfunctions/22/geom_22_qf.h b/palace/fem/qfunctions/22/geom_22_qf.h index 55cb36901..f53e9923d 100644 --- a/palace/fem/qfunctions/22/geom_22_qf.h +++ b/palace/fem/qfunctions/22/geom_22_qf.h @@ -9,8 +9,8 @@ CEED_QFUNCTION(f_build_geom_factor_22)(void *, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - const CeedScalar *qw = in[0], *J = in[1]; - CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q; + const CeedScalar *attr = in[0], *qw = in[1], *J = in[2]; + CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { @@ -18,12 +18,12 @@ CEED_QFUNCTION(f_build_geom_factor_22)(void *, CeedInt Q, const CeedScalar *cons MatUnpack22(J + i, Q, J_loc); const CeedScalar detJ = AdjJt22(J_loc, adjJt_loc); - attr[i] = 0; - wdetJ[i] = qw[i] * detJ; - adjJt[i + Q * 0] = adjJt_loc[0] / detJ; - adjJt[i + Q * 1] = adjJt_loc[1] / detJ; - adjJt[i + Q * 2] = adjJt_loc[2] / detJ; - adjJt[i + Q * 3] = adjJt_loc[3] / detJ; + qd_attr[i] = attr[i]; + qd_wdetJ[i] = qw[i] * detJ; + qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ; + qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ; + qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ; + qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ; } return 0; } diff --git a/palace/fem/qfunctions/32/geom_32_qf.h b/palace/fem/qfunctions/32/geom_32_qf.h index cf7db5ee0..ab7c4eaaa 100644 --- a/palace/fem/qfunctions/32/geom_32_qf.h +++ b/palace/fem/qfunctions/32/geom_32_qf.h @@ -9,8 +9,8 @@ CEED_QFUNCTION(f_build_geom_factor_32)(void *, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - const CeedScalar *qw = in[0], *J = in[1]; - CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q; + const CeedScalar *attr = in[0], *qw = in[1], *J = in[2]; + CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { @@ -18,14 +18,14 @@ CEED_QFUNCTION(f_build_geom_factor_32)(void *, CeedInt Q, const CeedScalar *cons MatUnpack32(J + i, Q, J_loc); const CeedScalar detJ = AdjJt32(J_loc, adjJt_loc); - attr[i] = 0; - wdetJ[i] = qw[i] * detJ; - adjJt[i + Q * 0] = adjJt_loc[0] / detJ; - adjJt[i + Q * 1] = adjJt_loc[1] / detJ; - adjJt[i + Q * 2] = adjJt_loc[2] / detJ; - adjJt[i + Q * 3] = adjJt_loc[3] / detJ; - adjJt[i + Q * 4] = adjJt_loc[4] / detJ; - adjJt[i + Q * 5] = adjJt_loc[5] / detJ; + qd_attr[i] = attr[i]; + qd_wdetJ[i] = qw[i] * detJ; + qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ; + qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ; + qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ; + qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ; + qd_adjJt[i + Q * 4] = adjJt_loc[4] / detJ; + qd_adjJt[i + Q * 5] = adjJt_loc[5] / detJ; } return 0; } diff --git a/palace/fem/qfunctions/33/geom_33_qf.h b/palace/fem/qfunctions/33/geom_33_qf.h index 213a4d1b3..21f81350e 100644 --- a/palace/fem/qfunctions/33/geom_33_qf.h +++ b/palace/fem/qfunctions/33/geom_33_qf.h @@ -9,8 +9,8 @@ CEED_QFUNCTION(f_build_geom_factor_33)(void *, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { - const CeedScalar *qw = in[0], *J = in[1]; - CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q; + const CeedScalar *attr = in[0], *qw = in[1], *J = in[2]; + CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q; CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { @@ -18,17 +18,17 @@ CEED_QFUNCTION(f_build_geom_factor_33)(void *, CeedInt Q, const CeedScalar *cons MatUnpack33(J + i, Q, J_loc); const CeedScalar detJ = AdjJt33(J_loc, adjJt_loc); - attr[i] = 0; - wdetJ[i] = qw[i] * detJ; - adjJt[i + Q * 0] = adjJt_loc[0] / detJ; - adjJt[i + Q * 1] = adjJt_loc[1] / detJ; - adjJt[i + Q * 2] = adjJt_loc[2] / detJ; - adjJt[i + Q * 3] = adjJt_loc[3] / detJ; - adjJt[i + Q * 4] = adjJt_loc[4] / detJ; - adjJt[i + Q * 5] = adjJt_loc[5] / detJ; - adjJt[i + Q * 6] = adjJt_loc[6] / detJ; - adjJt[i + Q * 7] = adjJt_loc[7] / detJ; - adjJt[i + Q * 8] = adjJt_loc[8] / detJ; + qd_attr[i] = attr[i]; + qd_wdetJ[i] = qw[i] * detJ; + qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ; + qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ; + qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ; + qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ; + qd_adjJt[i + Q * 4] = adjJt_loc[4] / detJ; + qd_adjJt[i + Q * 5] = adjJt_loc[5] / detJ; + qd_adjJt[i + Q * 6] = adjJt_loc[6] / detJ; + qd_adjJt[i + Q * 7] = adjJt_loc[7] / detJ; + qd_adjJt[i + Q * 8] = adjJt_loc[8] / detJ; } return 0; } diff --git a/palace/fem/qfunctions/geom_qf.h b/palace/fem/qfunctions/geom_qf.h index dfbc33817..dc08248cf 100644 --- a/palace/fem/qfunctions/geom_qf.h +++ b/palace/fem/qfunctions/geom_qf.h @@ -6,8 +6,9 @@ // libCEED QFunction for building geometry factors for integration and transformations. // At every quadrature point, compute qw * det(J) and adj(J)^T / |J| and store the result. -// in[0] is quadrature weights, shape [Q] -// in[1] is Jacobians, shape [qcomp=dim, ncomp=space_dim, Q] +// in[0] is element attributes, shape [Q] +// in[1] is quadrature weights, shape [Q] +// in[2] is Jacobians, shape [qcomp=dim, ncomp=space_dim, Q] // out[0] is quadrature data, stored as {attribute, Jacobian determinant, (transpose) // adjugate Jacobian} quadrature data, shape [ncomp=2+space_dim*dim, Q]