From 18c432883507299a17fa693248a4faad6391d6fe Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 30 Oct 2024 16:47:36 +0100 Subject: [PATCH 1/2] support matrix slitting for cpu dilu --- opm/simulators/linalg/DILU.hpp | 133 +++++++++++++++++- .../linalg/PreconditionerFactory_impl.hpp | 8 +- 2 files changed, 132 insertions(+), 9 deletions(-) diff --git a/opm/simulators/linalg/DILU.hpp b/opm/simulators/linalg/DILU.hpp index 602c7e6ac1d..3485f245df9 100644 --- a/opm/simulators/linalg/DILU.hpp +++ b/opm/simulators/linalg/DILU.hpp @@ -65,16 +65,16 @@ class MultithreadDILU : public PreconditionerWithUpdate /*! \brief Constructor gets all parameters to operate the prec. \param A The matrix to operate on. */ - MultithreadDILU(const M& A) - : A_(A) + MultithreadDILU(const M& A, bool split_matrix = true) + : A_(A), split_matrix_(split_matrix) { OPM_TIMEBLOCK(prec_construct); // TODO: rewrite so this value is set by an argument to the constructor #if HAVE_OPENMP use_multithreading = omp_get_max_threads() > 1; #endif - if (use_multithreading) { + assert(!split_matrix_); A_reordered_.emplace(A_.N(), A_.N(), A_.nonzeroes(), M::row_wise); //! Assuming symmetric matrices using a lower triangular coloring to construct @@ -99,6 +99,27 @@ class MultithreadDILU : public PreconditionerWithUpdate } } + if (split_matrix_) { + assert(!use_multithreading); + A_lower_.emplace(A_.N(), A_.N(), (A_.nonzeroes()-A_.N())/2, M::row_wise); + A_upper_.emplace(A_.N(), A_.N(), (A_.nonzeroes()-A_.N())/2, M::row_wise); + + for (auto lowerIt = A_lower_.value().createbegin(), upperIt = A_upper_.value().createbegin(); + lowerIt != A_lower_.value().createend(); + ++lowerIt, ++upperIt) { + + auto srcRow = A.begin() + lowerIt.index(); + + for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) { + if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal + lowerIt.insert(elem.index()); + } else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal + upperIt.insert(elem.index()); + } + } + } + } + Dinv_.resize(A_.N()); update(); } @@ -113,7 +134,11 @@ class MultithreadDILU : public PreconditionerWithUpdate if (use_multithreading) { parallelUpdate(); } else { - serialUpdate(); + if (split_matrix_) { + serialSplitUpdate(); + } else { + serialUpdate(); + } } } @@ -138,7 +163,13 @@ class MultithreadDILU : public PreconditionerWithUpdate if (use_multithreading) { parallelApply(v, d); } else { - serialApply(v, d); + + // serialApply(v, d); + if (split_matrix_) { + serialSplitApply(v, d); + } else { + serialApply(v, d); + } } } @@ -169,6 +200,8 @@ class MultithreadDILU : public PreconditionerWithUpdate private: //! \brief The matrix we operate on. const M& A_; + std::optional A_lower_; + std::optional A_upper_; //! \brief Copy of A_ that is reordered to store rows that can be computed simultaneously next to each other to //! increase cache usage when multithreading std::optional A_reordered_; @@ -182,9 +215,11 @@ class MultithreadDILU : public PreconditionerWithUpdate std::vector natural_to_reorder_; //! \brief Boolean value describing whether or not to use multithreaded version of functions bool use_multithreading{false}; + bool split_matrix_{false}; void serialUpdate() { + OPM_TIMEBLOCK(dilu_prec_update); for (std::size_t row = 0; row < A_.N(); ++row) { Dinv_[row] = A_[row][row]; } @@ -205,6 +240,43 @@ class MultithreadDILU : public PreconditionerWithUpdate } } + void serialSplitUpdate(){ + + OPM_TIMEBLOCK(dilu_prec_update); + // move data from A_ to A_lower_ and A_upper_ + // at first sight this can not be done with two large memory copies + for (auto row = A_.begin(); row != A_.end(); ++row) { + const auto row_i = row.index(); + for (auto a_ij = row->begin(); a_ij != row->end(); ++a_ij) { + if (a_ij.index() < row_i) { + A_lower_.value()[row_i][a_ij.index()] = *a_ij; + } else if (a_ij.index() > row_i) { + A_upper_.value()[row_i][a_ij.index()] = *a_ij; + } + } + } + + for (std::size_t row = 0; row < A_.N(); ++row) { + Dinv_[row] = A_[row][row]; + } + + for (auto row = A_lower_->begin(); row != A_lower_->end(); ++row) { + const auto row_i = row.index(); + auto Dinv_temp = Dinv_[row_i]; + for (auto a_ij = row->begin(); a_ij != row->end(); ++a_ij) { + const auto col_j = a_ij.index(); + const auto a_ji = A_upper_.value()[col_j].find(row_i); + // if A_lower[i, j] != 0 and A_upper[j, i] != 0 + if (a_ji != A_upper_.value()[col_j].end()) { + // Dinv_temp -= A_lower[i, j] * d[j] * A_upper[j, i] + Dinv_temp -= (*a_ij) * Dune::FieldMatrix(Dinv_[col_j]) * (*a_ji); + } + } + Dinv_temp.invert(); + Dinv_[row_i] = Dinv_temp; + } + } + void parallelUpdate() { #ifdef _OPENMP @@ -302,6 +374,57 @@ class MultithreadDILU : public PreconditionerWithUpdate } } + void serialSplitApply(X& v, const Y& d) + { + // M = (D + L_A) D^-1 (D + U_A) (a LU decomposition of M) + // where L_A and U_A are the strictly lower and upper parts of A and M has the properties: + // diag(A) = diag(M) + // Working with defect d = b - Ax and update v = x_{n+1} - x_n + // solving the product M^-1(d) using upper and lower triangular solve + // v = M^{-1}*d = (D + U_A)^{-1} D (D + L_A)^{-1} * d + // lower triangular solve: (D + L_A) y = d + using Xblock = typename X::block_type; + using Yblock = typename Y::block_type; + { + OPM_TIMEBLOCK(lower_solve); + auto endi = A_lower_.value().end(); + for (auto row = A_lower_.value().begin(); row != endi; ++row) { + const auto row_i = row.index(); + Yblock rhs = d[row_i]; + for (auto a_ij = (*row).begin(); a_ij != (*row).end(); ++a_ij) { + // if A_lower[i][j] != 0 + // rhs -= A_lower[i][j]* y[j], where v_j stores y_j + const auto col_j = a_ij.index(); + a_ij->mmv(v[col_j], rhs); + } + // y_i = Dinv_i * rhs + // storing y_i in v_i + Dinv_[row_i].mv(rhs, v[row_i]); // (D + L_A)_ii = D_i + } + } + + { + OPM_TIMEBLOCK(upper_solve); + // upper triangular solve: (D + U_A) v = Dy + auto rendi = A_upper_.value().beforeBegin(); + for (auto row = A_upper_.value().beforeEnd(); row != rendi; --row) { + const auto row_i = row.index(); + // rhs = 0 + Xblock rhs(0.0); + for (auto a_ij = (*row).beforeEnd(); a_ij != (*row).beforeBegin(); --a_ij) { + // if A_upper[i][j] != 0 + // rhs += A_upper[i][j]*v[j] + const auto col_j = a_ij.index(); + a_ij->umv(v[col_j], rhs); + } + // calculate update v = M^-1*d + // v_i = y_i - Dinv_i*rhs + // before update v_i is y_i + Dinv_[row_i].mmv(rhs, v[row_i]); + } + } + } + void parallelApply(X& v, const Y& d) { using Xblock = typename X::block_type; diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp index c53c069e7cd..682fcedf4b9 100644 --- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp +++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp @@ -175,8 +175,8 @@ struct StandardPreconditioners { comm, op.getmat(), n, w, resort); }); F::addCreator("DILU", [](const O& op, const P& prm, const std::function&, std::size_t, const C& comm) { - DUNE_UNUSED_PARAMETER(prm); - return wrapBlockPreconditioner>(comm, op.getmat()); + const bool split_matrix = prm.get("split_matrix", false); + return wrapBlockPreconditioner>(comm, op.getmat(), split_matrix); }); F::addCreator("Jac", [](const O& op, const P& prm, const std::function&, std::size_t, const C& comm) { const int n = prm.get("repeats", 1); @@ -453,8 +453,8 @@ struct StandardPreconditioners { op.getmat(), n, w, MILU_VARIANT::ILU); }); F::addCreator("DILU", [](const O& op, const P& prm, const std::function&, std::size_t) { - DUNE_UNUSED_PARAMETER(prm); - return std::make_shared>(op.getmat()); + const bool split_matrix = prm.get("split_matrix", false); + return std::make_shared>(op.getmat(), split_matrix); }); F::addCreator("Jac", [](const O& op, const P& prm, const std::function&, std::size_t) { const int n = prm.get("repeats", 1); From 44c0442410cff2c6b1a8a44a9f98ed33f0337feb Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 30 Oct 2024 17:02:09 +0100 Subject: [PATCH 2/2] simplify memory transfer --- opm/simulators/linalg/DILU.hpp | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/opm/simulators/linalg/DILU.hpp b/opm/simulators/linalg/DILU.hpp index 3485f245df9..c945e1b4928 100644 --- a/opm/simulators/linalg/DILU.hpp +++ b/opm/simulators/linalg/DILU.hpp @@ -243,18 +243,6 @@ class MultithreadDILU : public PreconditionerWithUpdate void serialSplitUpdate(){ OPM_TIMEBLOCK(dilu_prec_update); - // move data from A_ to A_lower_ and A_upper_ - // at first sight this can not be done with two large memory copies - for (auto row = A_.begin(); row != A_.end(); ++row) { - const auto row_i = row.index(); - for (auto a_ij = row->begin(); a_ij != row->end(); ++a_ij) { - if (a_ij.index() < row_i) { - A_lower_.value()[row_i][a_ij.index()] = *a_ij; - } else if (a_ij.index() > row_i) { - A_upper_.value()[row_i][a_ij.index()] = *a_ij; - } - } - } for (std::size_t row = 0; row < A_.N(); ++row) { Dinv_[row] = A_[row][row]; @@ -269,6 +257,9 @@ class MultithreadDILU : public PreconditionerWithUpdate // if A_lower[i, j] != 0 and A_upper[j, i] != 0 if (a_ji != A_upper_.value()[col_j].end()) { // Dinv_temp -= A_lower[i, j] * d[j] * A_upper[j, i] + // ensure the values are moved from A_ into A_lower_ and A_upper_ before use + *a_ij = A_[row_i][col_j]; + *a_ji = A_[col_j][row_i]; Dinv_temp -= (*a_ij) * Dune::FieldMatrix(Dinv_[col_j]) * (*a_ji); } }