diff --git a/palace/fem/bilinearform.cpp b/palace/fem/bilinearform.cpp index f26992021..1661cf9c9 100644 --- a/palace/fem/bilinearform.cpp +++ b/palace/fem/bilinearform.cpp @@ -52,11 +52,6 @@ BilinearForm::PartialAssemble(const FiniteElementSpace &trial_fespace, PalacePragmaOmp(parallel if (nt > 1)) { Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()]; - - // Initialize the composite operator on each thread. - CeedOperator loc_op; - PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op)); - for (const auto &[geom, data] : mesh.GetCeedGeomFactorData(ceed)) { const auto trial_map_type = @@ -80,8 +75,7 @@ BilinearForm::PartialAssemble(const FiniteElementSpace &trial_fespace, integ->SetMapTypes(trial_map_type, test_map_type); integ->Assemble(ceed, trial_restr, test_restr, trial_basis, test_basis, data.geom_data, data.geom_data_restr, &sub_op); - PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op)); + op->AddOper(sub_op); // Sub-operator owned by ceed::Operator } } else if (mfem::Geometry::Dimension[geom] == mesh.Dimension() - 1 && @@ -101,15 +95,15 @@ BilinearForm::PartialAssemble(const FiniteElementSpace &trial_fespace, integ->SetMapTypes(trial_map_type, test_map_type); integ->Assemble(ceed, trial_restr, test_restr, trial_basis, test_basis, data.geom_data, data.geom_data_restr, &sub_op); - PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op)); + op->AddOper(sub_op); // Sub-operator owned by ceed::Operator } } } - PalaceCeedCall(ceed, CeedOperatorCheckReady(loc_op)); - op->AddOper(loc_op); } + // Finalize the operator (call CeedOperatorCheckReady). 
+ op->Finalize(); + return op; } @@ -224,12 +218,6 @@ std::unique_ptr DiscreteLinearOperator::PartialAssemble() const PalacePragmaOmp(parallel if (nt > 1)) { Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()]; - - // Initialize the composite operators for each thread. - CeedOperator loc_op, loc_op_t; - PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op)); - PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op_t)); - for (const auto &[geom, data] : mesh.GetCeedGeomFactorData(ceed)) { if (mfem::Geometry::Dimension[geom] == mesh.Dimension() && !domain_interps.empty()) @@ -255,21 +243,18 @@ std::unique_ptr DiscreteLinearOperator::PartialAssemble() const { CeedOperator sub_op, sub_op_t; interp->Assemble(ceed, trial_restr, test_restr, interp_basis, &sub_op, &sub_op_t); - PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op)); - PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op_t, sub_op_t)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op_t)); + op->AddOper(sub_op, sub_op_t); // Sub-operator owned by ceed::Operator } // Basis is owned by the operator. PalaceCeedCall(ceed, CeedBasisDestroy(&interp_basis)); } } - PalaceCeedCall(ceed, CeedOperatorCheckReady(loc_op)); - PalaceCeedCall(ceed, CeedOperatorCheckReady(loc_op_t)); - op->AddOper(loc_op, loc_op_t); } + // Finalize the operator (call CeedOperatorCheckReady). + op->Finalize(); + // Construct dof multiplicity vector for scaling to account for dofs shared between // elements (on host, then copy to device). 
Vector test_multiplicity(test_fespace.GetVSize()); diff --git a/palace/fem/libceed/operator.cpp b/palace/fem/libceed/operator.cpp index da0fe2a70..4fafb72d4 100644 --- a/palace/fem/libceed/operator.cpp +++ b/palace/fem/libceed/operator.cpp @@ -21,6 +21,22 @@ Operator::Operator(int h, int w) : palace::Operator(h, w) op_t.resize(nt, nullptr); u.resize(nt, nullptr); v.resize(nt, nullptr); + PalacePragmaOmp(parallel if (op.size() > 1)) + { + const int id = utils::GetThreadNum(); + MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!"); + Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()]; + CeedOperator loc_op, loc_op_t; + CeedVector loc_u, loc_v; + PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op)); + PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op_t)); + PalaceCeedCall(ceed, CeedVectorCreate(ceed, width, &loc_u)); + PalaceCeedCall(ceed, CeedVectorCreate(ceed, height, &loc_v)); + op[id] = loc_op; + op_t[id] = loc_op_t; + u[id] = loc_u; + v[id] = loc_v; + } temp.UseDevice(true); } @@ -28,10 +44,9 @@ Operator::~Operator() { PalacePragmaOmp(parallel if (op.size() > 1)) { - Ceed ceed; const int id = utils::GetThreadNum(); - MFEM_ASSERT(id < op.size() && op[id], - "Out of bounds access for thread number " << id << "!"); + MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!"); + Ceed ceed; PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); PalaceCeedCall(ceed, CeedOperatorDestroy(&op[id])); PalaceCeedCall(ceed, CeedOperatorDestroy(&op_t[id])); @@ -40,32 +55,45 @@ Operator::~Operator() } } -void Operator::AddOper(CeedOperator op_, CeedOperator op_t_) +void Operator::AddOper(CeedOperator sub_op, CeedOperator sub_op_t) { + // This should be called from within an OpenMP parallel region. 
+ const int id = utils::GetThreadNum(); + MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!"); Ceed ceed; + PalaceCeedCallBackend(CeedOperatorGetCeed(sub_op, &ceed)); CeedSize l_in, l_out; - CeedVector loc_u, loc_v; - PalaceCeedCallBackend(CeedOperatorGetCeed(op_, &ceed)); - PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(op_, &l_in, &l_out)); - MFEM_VERIFY((l_in == 0 && l_out == 0) || (mfem::internal::to_int(l_in) == width && - mfem::internal::to_int(l_out) == height), + PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(sub_op, &l_in, &l_out)); + MFEM_VERIFY(mfem::internal::to_int(l_in) == width && + mfem::internal::to_int(l_out) == height, "Dimensions mismatch for CeedOperator!"); - if (op_t_) + PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(op[id], sub_op)); + PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op)); + if (sub_op_t) { + Ceed ceed_t; + PalaceCeedCallBackend(CeedOperatorGetCeed(sub_op_t, &ceed_t)); + MFEM_VERIFY(ceed_t == ceed, "Ceed context mismatch for transpose CeedOperator!"); CeedSize l_in_t, l_out_t; - PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(op_t_, &l_in_t, &l_out_t)); - MFEM_VERIFY((l_in_t == 0 && l_out_t == 0) || (l_in_t == l_out && l_out_t == l_in), + PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(sub_op_t, &l_in_t, &l_out_t)); + MFEM_VERIFY(l_in_t == l_out && l_out_t == l_in, "Dimensions mismatch for transpose CeedOperator!"); + PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(op_t[id], sub_op_t)); + PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op_t)); } - PalaceCeedCall(ceed, CeedVectorCreate(ceed, l_in, &loc_u)); - PalaceCeedCall(ceed, CeedVectorCreate(ceed, l_out, &loc_v)); +} - const int id = utils::GetThreadNum(); - MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!"); - op[id] = op_; - op_t[id] = op_t_; - u[id] = loc_u; - v[id] = loc_v; +void Operator::Finalize() +{ + PalacePragmaOmp(parallel if (op.size() > 1)) + { + const int id = 
utils::GetThreadNum(); + MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!"); + Ceed ceed; + PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); + PalaceCeedCall(ceed, CeedOperatorCheckReady(op[id])); + PalaceCeedCall(ceed, CeedOperatorCheckReady(op_t[id])); + } } void Operator::AssembleDiagonal(Vector &diag) const @@ -84,10 +112,9 @@ void Operator::AssembleDiagonal(Vector &diag) const PalacePragmaOmp(parallel if (op.size() > 1)) { - Ceed ceed; const int id = utils::GetThreadNum(); - MFEM_ASSERT(id < op.size() && op[id], - "Out of bounds access for thread number " << id << "!"); + MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!"); + Ceed ceed; PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); PalaceCeedCall(ceed, CeedVectorSetArray(v[id], mem, CEED_USE_POINTER, diag_data)); PalaceCeedCall( @@ -116,10 +143,9 @@ inline void CeedAddMult(const std::vector &op, PalacePragmaOmp(parallel if (op.size() > 1)) { - Ceed ceed; const int id = utils::GetThreadNum(); - MFEM_ASSERT(id < op.size() && op[id], - "Out of bounds access for thread number " << id << "!"); + MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!"); + Ceed ceed; PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); PalaceCeedCall(ceed, CeedVectorSetArray(u[id], mem, CEED_USE_POINTER, const_cast(x_data))); @@ -407,32 +433,36 @@ std::unique_ptr OperatorCOOtoCSR(Ceed ceed, CeedInt m, Ce std::unique_ptr CeedOperatorFullAssemble(const Operator &op, bool skip_zeros, bool set) { - if (op.Size() == 0) - { - return std::make_unique(op.Height(), op.Width(), 0); - } - // Assemble operators on each thread. 
std::vector<std::unique_ptr<hypre::HypreCSRMatrix>> loc_mat(op.Size()); PalacePragmaOmp(parallel if (op.Size() > 1)) { - Ceed ceed; const int id = utils::GetThreadNum(); - MFEM_ASSERT(id < op.Size() && op[id], - "Out of bounds access for thread number " << id << "!"); + MFEM_ASSERT(id < op.Size(), "Out of bounds access for thread number " << id << "!"); + Ceed ceed; PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); - // First, get matrix on master thread in COO format, withs rows/cols always on host and - // vals potentially on the device. Process skipping zeros if desired. - CeedSize nnz; - CeedInt *rows, *cols; - CeedVector vals; - CeedMemType mem; - CeedOperatorAssembleCOO(ceed, op[id], skip_zeros, &nnz, &rows, &cols, &vals, &mem); - - // Convert COO to CSR (on each thread). The COO memory is free'd internally. - loc_mat[id] = - OperatorCOOtoCSR(ceed, op.Height(), op.Width(), nnz, rows, cols, vals, mem, set); + // Check if the operator is empty, otherwise assemble. + CeedInt nsub_ops; + PalaceCeedCall(ceed, CeedCompositeOperatorGetNumSub(op[id], &nsub_ops)); + if (nsub_ops == 0) + { + loc_mat[id] = std::make_unique<hypre::HypreCSRMatrix>(op.Height(), op.Width(), 0); + } + else + { + // First, get matrix on master thread in COO format, with rows/cols always on host + // and vals potentially on the device. Process skipping zeros if desired. + CeedSize nnz; + CeedInt *rows, *cols; + CeedVector vals; + CeedMemType mem; + CeedOperatorAssembleCOO(ceed, op[id], skip_zeros, &nnz, &rows, &cols, &vals, &mem); + + // Convert COO to CSR (on each thread). The COO memory is free'd internally. 
+ loc_mat[id] = + OperatorCOOtoCSR(ceed, op.Height(), op.Width(), nnz, rows, cols, vals, mem, set); + } } // Add CSR matrix objects from each thread (HYPRE's hypre_CSRMatrixAdd uses threads @@ -443,19 +473,19 @@ std::unique_ptr CeedOperatorFullAssemble(const Operator & { b_mat = std::make_unique(hypre_CSRMatrixClone(*mat, 0)); hypre_CSRMatrixSetConstantValues(*b_mat, 1.0); - for (std::size_t i = 1; i < op.Size(); i++) + for (std::size_t id = 1; id < op.Size(); id++) { - hypre_CSRMatrix *b_loc_mat = hypre_CSRMatrixClone(*loc_mat[i], 0); + hypre_CSRMatrix *b_loc_mat = hypre_CSRMatrixClone(*loc_mat[id], 0); hypre_CSRMatrixSetConstantValues(b_loc_mat, 1.0); b_mat = std::make_unique( hypre_CSRMatrixAdd(1.0, *b_mat, 1.0, b_loc_mat)); hypre_CSRMatrixDestroy(b_loc_mat); } } - for (std::size_t i = 1; i < op.Size(); i++) + for (std::size_t id = 1; id < op.Size(); id++) { mat = std::make_unique( - hypre_CSRMatrixAdd(1.0, *mat, 1.0, *loc_mat[i])); + hypre_CSRMatrixAdd(1.0, *mat, 1.0, *loc_mat[id])); } if (set && op.Size() > 1) { @@ -498,10 +528,10 @@ std::unique_ptr CeedOperatorCoarsen(const Operator &op_fine, // types, integrators) of the original fine operator. PalacePragmaOmp(parallel if (op_fine.Size() > 1)) { - Ceed ceed; const int id = utils::GetThreadNum(); - MFEM_ASSERT(id < op_fine.Size() && op_fine[id], + MFEM_ASSERT(id < op_fine.Size(), "Out of bounds access for thread number " << id << "!"); + Ceed ceed; PalaceCeedCallBackend(CeedOperatorGetCeed(op_fine[id], &ceed)); { Ceed ceed_parent; @@ -511,38 +541,21 @@ std::unique_ptr CeedOperatorCoarsen(const Operator &op_fine, ceed = ceed_parent; } } - - // Initialize the composite operator on each thread. 
- CeedOperator loc_op; - PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op)); - - bool composite; - PalaceCeedCall(ceed, CeedOperatorIsComposite(op_fine[id], &composite)); - if (composite) - { - CeedInt nloc_ops_fine; - CeedOperator *loc_ops_fine; - PalaceCeedCall(ceed, CeedCompositeOperatorGetNumSub(op_fine[id], &nloc_ops_fine)); - PalaceCeedCall(ceed, CeedCompositeOperatorGetSubList(op_fine[id], &loc_ops_fine)); - for (CeedInt k = 0; k < nloc_ops_fine; k++) - { - CeedOperator sub_op; - SingleOperatorCoarsen(ceed, loc_ops_fine[k], &sub_op); - PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op)); - } - } - else + CeedInt nsub_ops_fine; + CeedOperator *sub_ops_fine; + PalaceCeedCall(ceed, CeedCompositeOperatorGetNumSub(op_fine[id], &nsub_ops_fine)); + PalaceCeedCall(ceed, CeedCompositeOperatorGetSubList(op_fine[id], &sub_ops_fine)); + for (CeedInt k = 0; k < nsub_ops_fine; k++) { - CeedOperator sub_op; - SingleOperatorCoarsen(ceed, op_fine[id], &sub_op); - PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op)); + CeedOperator sub_op_coarse; + SingleOperatorCoarsen(ceed, sub_ops_fine[k], &sub_op_coarse); + op_coarse->AddOper(sub_op_coarse); // Sub-operator owned by ceed::Operator } - PalaceCeedCall(ceed, CeedOperatorCheckReady(loc_op)); - op_coarse->AddOper(loc_op); // Thread-safe } + // Finalize the operator (call CeedOperatorCheckReady). 
+ op_coarse->Finalize(); + return op_coarse; } diff --git a/palace/fem/libceed/operator.hpp b/palace/fem/libceed/operator.hpp index fe44d85ad..72848fcea 100644 --- a/palace/fem/libceed/operator.hpp +++ b/palace/fem/libceed/operator.hpp @@ -44,7 +44,9 @@ class Operator : public palace::Operator CeedOperator operator[](std::size_t i) const { return op[i]; } auto Size() const { return op.size(); } - void AddOper(CeedOperator op_, CeedOperator op_t_ = nullptr); + void AddOper(CeedOperator sub_op, CeedOperator sub_op_t = nullptr); + + void Finalize(); void SetDofMultiplicity(Vector &&mult) { dof_multiplicity = std::move(mult); } diff --git a/palace/linalg/rap.cpp b/palace/linalg/rap.cpp index f76fe40ad..070358e2f 100644 --- a/palace/linalg/rap.cpp +++ b/palace/linalg/rap.cpp @@ -106,25 +106,40 @@ mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const if (!use_R) { const mfem::HypreParMatrix *Rt = test_fespace.Get().Dof_TrueDof_Matrix(); - RAP = std::make_unique(hypre_ParCSRMatrixRAP(*Rt, hA, *P), true); + RAP = std::make_unique(hypre_ParCSRMatrixRAPKT(*Rt, hA, *P, 1), + true); } else { - mfem::SparseMatrix *sRt = mfem::Transpose(*test_fespace.GetRestrictionMatrix()); - mfem::HypreParMatrix *hRt = new mfem::HypreParMatrix( - test_fespace.GetComm(), test_fespace.GlobalVSize(), test_fespace.GlobalTrueVSize(), - test_fespace.Get().GetDofOffsets(), test_fespace.Get().GetTrueDofOffsets(), sRt); - RAP = std::make_unique(hypre_ParCSRMatrixRAP(*hRt, hA, *P), true); - delete sRt; - delete hRt; + mfem::HypreParMatrix *hR = new mfem::HypreParMatrix( + test_fespace.GetComm(), test_fespace.GlobalTrueVSize(), test_fespace.GlobalVSize(), + test_fespace.Get().GetTrueDofOffsets(), test_fespace.Get().GetDofOffsets(), + const_cast(test_fespace.GetRestrictionMatrix())); + hypre_ParCSRMatrix *AP = hypre_ParCSRMatMat(hA, *P); + RAP = std::make_unique(hypre_ParCSRMatMat(*hR, AP), true); + hypre_ParCSRMatrixDestroy(AP); + delete hR; } hypre_ParCSRMatrixDiag(hA) = hA_diag; 
hypre_ParCSRMatrixDestroy(hA); hypre_ParCSRMatrixSetNumNonzeros(*RAP); + if (&trial_fespace == &test_fespace) + { + // Make sure that the first entry in each row is the diagonal one, for a square matrix. + hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag((hypre_ParCSRMatrix *)*RAP)); + } // Eliminate boundary conditions on the assembled (square) matrix. - RAP->EliminateBC(dbc_tdof_list, diag_policy); + if (&trial_fespace == &test_fespace) + { + RAP->EliminateBC(dbc_tdof_list, diag_policy); + } + else + { + MFEM_VERIFY(dbc_tdof_list.Size() == 0, + "Essential BC elimination is only available for square ParOperator!"); + } return *RAP; } diff --git a/palace/models/waveportoperator.cpp b/palace/models/waveportoperator.cpp index ef86ab4df..bbc2b7bf9 100644 --- a/palace/models/waveportoperator.cpp +++ b/palace/models/waveportoperator.cpp @@ -622,8 +622,8 @@ WavePortData::WavePortData(const config::WavePortData &data, port_h1_fespace->GetComm(), port_h1_fespace->Get().GlobalTrueVSize(), port_h1_fespace->Get().GetTrueDofOffsets(), &diag); auto [Bttr, Btti] = GetBtt(mat_op, *port_nd_fespace); - std::tie(Br, Bi) = - GetSystemMatrixB(Bttr.get(), Btti.get(), Dnn.get(), port_dbc_tdof_list); + auto [Br, Bi] = GetSystemMatrixB(Bttr.get(), Btti.get(), Dnn.get(), port_dbc_tdof_list); + B = std::make_unique(std::move(Br), std::move(Bi)); } // Configure a communicator for the processes which have elements for this port. @@ -848,7 +848,7 @@ void WavePortData::Initialize(double omega) // Construct matrices and solve the generalized eigenvalue problem for the desired wave // port mode. B uses the non-owning constructor since the matrices Br, Bi are not // functions of frequency (constructed once for all). 
- std::unique_ptr A, B; + std::unique_ptr A; const double sigma = -omega * omega * mu_eps_min; { auto [Attr, Atti] = GetAtt(mat_op, *port_nd_fespace, port_normal, omega, sigma); @@ -856,7 +856,6 @@ void WavePortData::Initialize(double omega) GetSystemMatrixA(Attr.get(), Atti.get(), Atnr.get(), Atni.get(), Antr.get(), Anti.get(), Annr.get(), Anni.get(), port_dbc_tdof_list); A = std::make_unique(std::move(Ar), std::move(Ai)); - B = std::make_unique(Br.get(), Bi.get()); } // Configure and solve the (inverse) eigenvalue problem for the desired boundary mode. diff --git a/palace/models/waveportoperator.hpp b/palace/models/waveportoperator.hpp index ed736839f..dc28df234 100644 --- a/palace/models/waveportoperator.hpp +++ b/palace/models/waveportoperator.hpp @@ -64,7 +64,8 @@ class WavePortData double mu_eps_min; // Operator storage for repeated boundary mode eigenvalue problem solves. - std::unique_ptr Atnr, Atni, Antr, Anti, Annr, Anni, Br, Bi; + std::unique_ptr Atnr, Atni, Antr, Anti, Annr, Anni; + std::unique_ptr B; ComplexVector v0, e0; // Eigenvalue solver for boundary modes.