Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collection of minor performance fixes during profiling and GPU testing #181

Merged
merged 7 commits into from
Feb 7, 2024
3 changes: 2 additions & 1 deletion palace/fem/fespace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,8 @@ class AuxiliaryFiniteElementSpaceHierarchy
std::vector<const Operator *> GetDiscreteInterpolators() const
{
std::vector<const Operator *> G_(GetNumLevels());
for (std::size_t l = 0; l < G_.size(); l++)
G_[0] = nullptr; // No discrete interpolator for coarsest level
for (std::size_t l = 1; l < G_.size(); l++)
{
G_[l] = &GetDiscreteInterpolatorAtLevel(l);
}
Expand Down
2 changes: 1 addition & 1 deletion palace/fem/libceed/basis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ void InitInterpolatorBasis(const mfem::FiniteElement &trial_fe,
if constexpr (false)
{
std::cout << "New interpolator basis (" << ceed << ", " << &trial_fe << ", " << &test_fe
<< ")\n";
<< ", " << (trial_fe.GetMapType() == test_fe.GetMapType()) << ")\n";
}
if constexpr (false)
{
Expand Down
5 changes: 4 additions & 1 deletion palace/fem/libceed/ceed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "ceed.hpp"

#include <string_view>
#include "utils/omp.hpp"

#if defined(MFEM_USE_OPENMP)
Expand Down Expand Up @@ -31,7 +32,9 @@ void Initialize(const char *resource, const char *jit_source_dir)
PalacePragmaOmp(master)
{
#if defined(MFEM_USE_OPENMP)
const int nt = omp_get_num_threads();
// Only parallelize libCEED operators over threads when not using the GPU.
const int nt =
!std::string_view(resource).compare(0, 4, "/cpu") ? omp_get_num_threads() : 1;
#else
const int nt = 1;
#endif
Expand Down
7 changes: 6 additions & 1 deletion palace/fem/libceed/integrator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,9 @@ int CeedGeometryDataGetSpaceDimension(CeedElemRestriction geom_data_restr, CeedI

void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
CeedBasis mesh_basis, CeedVector mesh_nodes,
CeedVector geom_data, CeedElemRestriction geom_data_restr)
CeedElemRestriction attr_restr, CeedBasis attr_basis,
CeedVector elem_attr, CeedVector geom_data,
CeedElemRestriction geom_data_restr)
{
CeedInt dim, space_dim, num_elem, num_qpts;
PalaceCeedCall(ceed, CeedBasisGetDimension(mesh_basis, &dim));
Expand Down Expand Up @@ -389,6 +391,7 @@ void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
}

// Inputs
PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "attr", 1, CEED_EVAL_INTERP));
PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "q_w", 1, CEED_EVAL_WEIGHT));
PalaceCeedCall(
ceed, CeedQFunctionAddInput(build_qf, "grad_x", space_dim * dim, CEED_EVAL_GRAD));
Expand All @@ -409,6 +412,8 @@ void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
PalaceCeedCall(ceed, CeedOperatorCreate(ceed, build_qf, nullptr, nullptr, &build_op));
PalaceCeedCall(ceed, CeedQFunctionDestroy(&build_qf));

PalaceCeedCall(ceed,
CeedOperatorSetField(build_op, "attr", attr_restr, attr_basis, elem_attr));
PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "q_w", CEED_ELEMRESTRICTION_NONE,
mesh_basis, CEED_VECTOR_NONE));
PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "grad_x", mesh_restr, mesh_basis,
Expand Down
4 changes: 3 additions & 1 deletion palace/fem/libceed/integrator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ int CeedGeometryDataGetSpaceDimension(CeedElemRestriction geom_data_restr, CeedI
// libCEED operator.
void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
CeedBasis mesh_basis, CeedVector mesh_nodes,
CeedVector geom_data, CeedElemRestriction geom_data_restr);
CeedElemRestriction attr_restr, CeedBasis attr_basis,
CeedVector elem_attr, CeedVector geom_data,
CeedElemRestriction geom_data_restr);

// Construct libCEED operator using the given quadrature data, element restriction, and
// basis objects.
Expand Down
106 changes: 48 additions & 58 deletions palace/fem/mesh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,8 @@ auto GetElementIndices(const mfem::ParMesh &mesh, bool use_bdr, int start, int s
return element_indices;
}

template <typename T>
auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed,
mfem::Geometry::Type geom, std::vector<int> &indices,
T GetCeedAttribute)
auto AssembleGeometryData(Ceed ceed, mfem::Geometry::Type geom, std::vector<int> &indices,
const mfem::GridFunction &mesh_nodes, const Vector &elem_attr)
{
const mfem::FiniteElementSpace &mesh_fespace = *mesh_nodes.FESpace();
const mfem::Mesh &mesh = *mesh_fespace.GetMesh();
Expand All @@ -151,68 +149,50 @@ auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed,
data.indices = std::move(indices);
const std::size_t num_elem = data.indices.size();

// Allocate storage for geometry factor data (stored as attribute + quadrature weight +
// Jacobian, column-major).
// Construct mesh node element restriction and basis.
CeedElemRestriction mesh_restr =
FiniteElementSpace::BuildCeedElemRestriction(mesh_fespace, ceed, geom, data.indices);
CeedBasis mesh_basis = FiniteElementSpace::BuildCeedBasis(mesh_fespace, ceed, geom);
CeedInt num_qpts, geom_data_size = 2 + data.space_dim * data.dim;
CeedVector mesh_nodes_vec;
ceed::InitCeedVector(mesh_nodes, ceed, &mesh_nodes_vec);
CeedInt num_qpts;
PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(mesh_basis, &num_qpts));
PalaceCeedCall(
ceed, CeedVectorCreate(ceed, num_elem * num_qpts * geom_data_size, &data.geom_data));

// Data for quadrature point i, component j, element k is found at index i * strides[0] +
// j * strides[1] + k * strides[2].
CeedMemType mem;
CeedInt strides[3];
PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem));
if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
{
// GPU backends have CEED_STRIDES_BACKEND = {1, num_elem * num_qpts, num_qpts}.
strides[0] = 1;
strides[1] = num_elem * num_qpts;
strides[2] = num_qpts;
}
else
// Construct element attribute element restriction and basis.
CeedElemRestriction attr_restr;
CeedBasis attr_basis;
PalaceCeedCall(ceed, CeedElemRestrictionCreateStrided(ceed, num_elem, 1, 1, num_elem,
CEED_STRIDES_BACKEND, &attr_restr));
{
// CPU backends have CEED_STRIDES_BACKEND = {1, num_qpts, num_qpts * geom_data_size}.
strides[0] = 1;
strides[1] = num_qpts;
strides[2] = num_qpts * geom_data_size;
Vector Bt(num_qpts);
Bt = 1.0;
PalaceCeedCall(ceed,
CeedBasisCreateH1(ceed, ceed::GetCeedTopology(geom), 1, 1, num_qpts,
Bt.GetData(), nullptr, nullptr, nullptr, &attr_basis));
}
PalaceCeedCall(ceed,
CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, geom_data_size,
num_elem * num_qpts * geom_data_size,
strides, &data.geom_data_restr));

// Compute the required geometry factors at quadrature points.
CeedVector mesh_nodes_vec;
ceed::InitCeedVector(mesh_nodes, ceed, &mesh_nodes_vec);
CeedVector elem_attr_vec;
ceed::InitCeedVector(elem_attr, ceed, &elem_attr_vec);

ceed::AssembleCeedGeometryData(ceed, mesh_restr, mesh_basis, mesh_nodes_vec,
data.geom_data, data.geom_data_restr);
// Allocate storage for geometry factor data (stored as attribute + quadrature weight +
// Jacobian, column-major).
CeedInt geom_data_size = 2 + data.space_dim * data.dim;
PalaceCeedCall(
ceed, CeedVectorCreate(ceed, num_elem * num_qpts * geom_data_size, &data.geom_data));
PalaceCeedCall(
ceed, CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, geom_data_size,
num_elem * num_qpts * geom_data_size,
CEED_STRIDES_BACKEND, &data.geom_data_restr));

// Compute the required geometry factors at quadrature points.
ceed::AssembleCeedGeometryData(ceed, mesh_restr, mesh_basis, mesh_nodes_vec, attr_restr,
attr_basis, elem_attr_vec, data.geom_data,
data.geom_data_restr);
PalaceCeedCall(ceed, CeedVectorDestroy(&mesh_nodes_vec));
PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&mesh_restr));
PalaceCeedCall(ceed, CeedBasisDestroy(&mesh_basis));

// Compute element attribute quadrature data. All inputs to a QFunction require the same
// number of quadrature points, so we store the attribute at each quadrature point. This
// is the first component of the quadrature data.
{
CeedScalar *geom_data_array;
PalaceCeedCall(ceed,
CeedVectorGetArray(data.geom_data, CEED_MEM_HOST, &geom_data_array));
for (std::size_t k = 0; k < num_elem; k++)
{
const auto attr = GetCeedAttribute(data.indices[k]);
for (CeedInt i = 0; i < num_qpts; i++)
{
geom_data_array[i * strides[0] + k * strides[2]] = attr;
}
}
CeedVectorRestoreArray(data.geom_data, &geom_data_array);
}
PalaceCeedCall(ceed, CeedVectorDestroy(&elem_attr_vec));
PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&attr_restr));
PalaceCeedCall(ceed, CeedBasisDestroy(&attr_basis));

return data;
}
Expand Down Expand Up @@ -278,8 +258,13 @@ auto BuildCeedGeomFactorData(
}();
for (auto &[geom, indices] : element_indices)
{
geom_data_map.emplace(geom, AssembleGeometryData(*mesh.GetNodes(), ceed, geom,
indices, GetCeedAttribute));
Vector elem_attr(indices.size());
for (std::size_t k = 0; k < indices.size(); k++)
{
elem_attr[k] = GetCeedAttribute(indices[k]);
}
geom_data_map.emplace(
geom, AssembleGeometryData(ceed, geom, indices, *mesh.GetNodes(), elem_attr));
}
}

Expand All @@ -304,8 +289,13 @@ auto BuildCeedGeomFactorData(
};
for (auto &[geom, indices] : element_indices)
{
geom_data_map.emplace(geom, AssembleGeometryData(*mesh.GetNodes(), ceed, geom,
indices, GetCeedAttribute));
Vector elem_attr(indices.size());
for (std::size_t k = 0; k < indices.size(); k++)
{
elem_attr[k] = GetCeedAttribute(indices[k]);
}
geom_data_map.emplace(
geom, AssembleGeometryData(ceed, geom, indices, *mesh.GetNodes(), elem_attr));
}
}

Expand Down
12 changes: 6 additions & 6 deletions palace/fem/qfunctions/21/geom_21_qf.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,19 @@
CEED_QFUNCTION(f_build_geom_factor_21)(void *, CeedInt Q, const CeedScalar *const *in,
CeedScalar *const *out)
{
const CeedScalar *qw = in[0], *J = in[1];
CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;

CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
{
CeedScalar J_loc[2], adjJt_loc[2];
MatUnpack21(J + i, Q, J_loc);
const CeedScalar detJ = AdjJt21<true>(J_loc, adjJt_loc);

attr[i] = 0;
wdetJ[i] = qw[i] * detJ;
adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
qd_attr[i] = attr[i];
qd_wdetJ[i] = qw[i] * detJ;
qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
}
return 0;
}
Expand Down
16 changes: 8 additions & 8 deletions palace/fem/qfunctions/22/geom_22_qf.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@
CEED_QFUNCTION(f_build_geom_factor_22)(void *, CeedInt Q, const CeedScalar *const *in,
CeedScalar *const *out)
{
const CeedScalar *qw = in[0], *J = in[1];
CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;

CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
{
CeedScalar J_loc[4], adjJt_loc[4];
MatUnpack22(J + i, Q, J_loc);
const CeedScalar detJ = AdjJt22<true>(J_loc, adjJt_loc);

attr[i] = 0;
wdetJ[i] = qw[i] * detJ;
adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
qd_attr[i] = attr[i];
qd_wdetJ[i] = qw[i] * detJ;
qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
}
return 0;
}
Expand Down
20 changes: 10 additions & 10 deletions palace/fem/qfunctions/32/geom_32_qf.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,23 @@
CEED_QFUNCTION(f_build_geom_factor_32)(void *, CeedInt Q, const CeedScalar *const *in,
CeedScalar *const *out)
{
const CeedScalar *qw = in[0], *J = in[1];
CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;

CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
{
CeedScalar J_loc[6], adjJt_loc[6];
MatUnpack32(J + i, Q, J_loc);
const CeedScalar detJ = AdjJt32<true>(J_loc, adjJt_loc);

attr[i] = 0;
wdetJ[i] = qw[i] * detJ;
adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
qd_attr[i] = attr[i];
qd_wdetJ[i] = qw[i] * detJ;
qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
qd_adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
qd_adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
}
return 0;
}
Expand Down
26 changes: 13 additions & 13 deletions palace/fem/qfunctions/33/geom_33_qf.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,26 +9,26 @@
CEED_QFUNCTION(f_build_geom_factor_33)(void *, CeedInt Q, const CeedScalar *const *in,
CeedScalar *const *out)
{
const CeedScalar *qw = in[0], *J = in[1];
CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;

CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
{
CeedScalar J_loc[9], adjJt_loc[9];
MatUnpack33(J + i, Q, J_loc);
const CeedScalar detJ = AdjJt33<true>(J_loc, adjJt_loc);

attr[i] = 0;
wdetJ[i] = qw[i] * detJ;
adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
adjJt[i + Q * 6] = adjJt_loc[6] / detJ;
adjJt[i + Q * 7] = adjJt_loc[7] / detJ;
adjJt[i + Q * 8] = adjJt_loc[8] / detJ;
qd_attr[i] = attr[i];
qd_wdetJ[i] = qw[i] * detJ;
qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
qd_adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
qd_adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
qd_adjJt[i + Q * 6] = adjJt_loc[6] / detJ;
qd_adjJt[i + Q * 7] = adjJt_loc[7] / detJ;
qd_adjJt[i + Q * 8] = adjJt_loc[8] / detJ;
}
return 0;
}
Expand Down
5 changes: 3 additions & 2 deletions palace/fem/qfunctions/geom_qf.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@

// libCEED QFunction for building geometry factors for integration and transformations.
// At every quadrature point, compute qw * det(J) and adj(J)^T / |J| and store the result.
// in[0] is quadrature weights, shape [Q]
// in[1] is Jacobians, shape [qcomp=dim, ncomp=space_dim, Q]
// in[0] is element attributes, shape [Q]
// in[1] is quadrature weights, shape [Q]
// in[2] is Jacobians, shape [qcomp=dim, ncomp=space_dim, Q]
// out[0] is quadrature data, stored as {attribute, Jacobian determinant, (transpose)
// adjugate Jacobian} quadrature data, shape [ncomp=2+space_dim*dim, Q]

Expand Down
Loading
Loading