Skip to content

Commit

Permalink
Merge Pull Request #12198 from trilinos/Trilinos/unpack-and-combine-new
Browse files Browse the repository at this point in the history
Automatically Merged using Trilinos Pull Request AutoTester
PR Title: b'Tpetra: move unpack and combine to device in TAFC'
PR Author: jhux2
  • Loading branch information
trilinos-autotester authored Sep 7, 2023
2 parents 4b9ad2b + f547f71 commit 2e32706
Show file tree
Hide file tree
Showing 5 changed files with 311 additions and 229 deletions.
20 changes: 15 additions & 5 deletions packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ namespace MueLu {
diag = Xpetra::VectorFactory<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Build(rowMap,true);

if(rowMap->lib() == Xpetra::UnderlyingLib::UseTpetra) {
Teuchos::TimeMonitor MM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase::GetLumpedMatrixDiagonal (Kokkos implementation)");
// Implement using Kokkos
using local_vector_type = typename Vector::dual_view_type::t_dev_um;
using local_matrix_type = typename Matrix::local_matrix_type;
Expand All @@ -366,6 +367,8 @@ namespace MueLu {
Kokkos::View<mag_type, execution_space> avgAbsDiagVal_dev("avgAbsDiagVal");
Kokkos::View<int, execution_space> numDiagsEqualToOne_dev("numDiagsEqualToOne");

{
Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: parallel_for (doReciprocal)");
Kokkos::parallel_for("GetLumpedMatrixDiagonal", my_policy,
KOKKOS_LAMBDA(const int rowIdx) {
diag_dev(rowIdx, 0) = KAT_S::zero();
Expand All @@ -387,15 +390,19 @@ namespace MueLu {
}
});

typename Kokkos::View<mag_type, execution_space>::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev);
Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev);
int numDiagsEqualToOne;
Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev);

}
if (useAverageAbsDiagVal) {
Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: useAverageAbsDiagVal");
typename Kokkos::View<mag_type, execution_space>::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev);
Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev);
int numDiagsEqualToOne;
Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev);

tol = TST::magnitude(100 * Teuchos::ScalarTraits<Scalar>::eps()) * (avgAbsDiagVal()-numDiagsEqualToOne) / (rowMap->getLocalNumElements()-numDiagsEqualToOne);
}

{
Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("ComputeLumpedDiagonalInverse: parallel_for (doReciprocal)");
Kokkos::parallel_for("ComputeLumpedDiagonalInverse", my_policy,
KOKKOS_LAMBDA(const int rowIdx) {
if (replaceSingleEntryRowWithZero && nnzPerRow(rowIdx) <= 1) {
Expand All @@ -410,8 +417,10 @@ namespace MueLu {
}
}
});
}

} else {
Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: parallel_for");
Kokkos::parallel_for("GetLumpedMatrixDiagonal", my_policy,
KOKKOS_LAMBDA(const int rowIdx) {
diag_dev(rowIdx, 0) = KAT_S::zero();
Expand All @@ -424,6 +433,7 @@ namespace MueLu {
}
} else {
// Implement using Teuchos
Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase: GetLumpedMatrixDiagonal: (Teuchos implementation)");
ArrayRCP<Scalar> diagVals = diag->getDataNonConst(0);
Teuchos::Array<Scalar> regSum(diag->getLocalLength());
Teuchos::ArrayView<const LocalOrdinal> cols;
Expand Down
126 changes: 46 additions & 80 deletions packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4534,7 +4534,7 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
);
this->checkInternalState ();
}
}
} //fillComplete(domainMap, rangeMap, params)

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
Expand Down Expand Up @@ -7919,12 +7919,12 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
const size_t NumSameIDs = rowTransfer.getNumSameIDs();
ArrayView<const LO> ExportLIDs = reverseMode ?
rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
ArrayView<const LO> RemoteLIDs = reverseMode ?
rowTransfer.getExportLIDs () : rowTransfer.getRemoteLIDs ();
ArrayView<const LO> PermuteToLIDs = reverseMode ?
rowTransfer.getPermuteFromLIDs () : rowTransfer.getPermuteToLIDs ();
ArrayView<const LO> PermuteFromLIDs = reverseMode ?
rowTransfer.getPermuteToLIDs () : rowTransfer.getPermuteFromLIDs ();
auto RemoteLIDs = reverseMode ?
rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv();
auto PermuteToLIDs = reverseMode ?
rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv();
auto PermuteFromLIDs = reverseMode ?
rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv();
Distributor& Distor = rowTransfer.getDistributor ();

// Owning PIDs
Expand Down Expand Up @@ -8119,14 +8119,14 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
#endif
if (constantNumPackets == 0) {
destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (),
RemoteLIDs.size ());
RemoteLIDs.view_host().size ());
}
else {
// There are a constant number of packets per element. We
// already know (from the number of "remote" (incoming)
// elements) how many incoming elements we expect, so we can
// resize the buffer accordingly.
const size_t rbufLen = RemoteLIDs.size() * constantNumPackets;
const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets;
destMat->reallocImportsIfNeeded (rbufLen, false, nullptr);
}
}
Expand Down Expand Up @@ -8450,97 +8450,63 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
}
}

/*********************************************************************/
/**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
/*********************************************************************/

// Backwards compatibility measure. We'll use this again below.
#ifdef HAVE_TPETRA_MMM_TIMINGS
RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize"))));
#endif
destMat->numImportPacketsPerLID_.sync_host ();
Teuchos::ArrayView<const size_t> numImportPacketsPerLID =
getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
destMat->imports_.sync_host ();
Teuchos::ArrayView<const char> hostImports =
getArrayViewFromDualView (destMat->imports_);

if (verbose) {
std::ostringstream os;
os << *verbosePrefix << "Calling unpackAndCombineWithOwningPIDsCount"
<< std::endl;
std::cerr << os.str ();
}
size_t mynnz =
unpackAndCombineWithOwningPIDsCount (*this,
RemoteLIDs,
hostImports,
numImportPacketsPerLID,
constantNumPackets,
INSERT,
NumSameIDs,
PermuteToLIDs,
PermuteFromLIDs);
if (verbose) {
std::ostringstream os;
os << *verbosePrefix << "unpackAndCombineWithOwningPIDsCount returned "
<< mynnz << std::endl;
std::cerr << os.str ();
}
size_t N = BaseRowMap->getLocalNumElements ();
// TODO JHU Need to track down why numImportPacketsPerLID_ has not been corrently marked as modified on host (which it has been)
// TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
// TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
destMat->numImportPacketsPerLID_.modify_host(); //FIXME

// Allocations
ArrayRCP<size_t> CSR_rowptr(N+1);
# ifdef HAVE_TPETRA_MMM_TIMINGS
RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
# endif
ArrayRCP<size_t> CSR_rowptr;
ArrayRCP<GO> CSR_colind_GID;
ArrayRCP<LO> CSR_colind_LID;
ArrayRCP<Scalar> CSR_vals;
CSR_colind_GID.resize (mynnz);
CSR_vals.resize (mynnz);

destMat->imports_.sync_device ();
destMat->numImportPacketsPerLID_.sync_device ();

size_t N = BaseRowMap->getLocalNumElements ();

auto RemoteLIDs_d = RemoteLIDs.view_device();
auto PermuteToLIDs_d = PermuteToLIDs.view_device();
auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();

Details::unpackAndCombineIntoCrsArrays(
*this,
RemoteLIDs_d,
destMat->imports_.view_device(), //hostImports
destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
NumSameIDs,
PermuteToLIDs_d,
PermuteFromLIDs_d,
N,
MyPID,
CSR_rowptr,
CSR_colind_GID,
CSR_vals,
SourcePids(),
TargetPids);

// If LO and GO are the same, we can reuse memory when
// converting the column indices from global to local indices.
if (typeid (LO) == typeid (GO)) {
CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
}
else {
CSR_colind_LID.resize (mynnz);
}
#ifdef HAVE_TPETRA_MMM_TIMINGS
tmCopySPRdata = Teuchos::null;
tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC copy same-perm-remote data"))));
#endif

if (verbose) {
std::ostringstream os;
os << *verbosePrefix << "Calling unpackAndCombineIntoCrsArrays"
<< std::endl;
std::cerr << os.str ();
CSR_colind_LID.resize (CSR_colind_GID.size());
}
// FIXME (mfh 15 May 2014) Why can't we abstract this out as an
// unpackAndCombine method on a "CrsArrays" object? This passing
// in a huge list of arrays is icky. Can't we have a bit of an
// abstraction? Implementing a concrete DistObject subclass only
// takes five methods.
unpackAndCombineIntoCrsArrays (*this,
RemoteLIDs,
hostImports,
numImportPacketsPerLID,
constantNumPackets,
INSERT,
NumSameIDs,
PermuteToLIDs,
PermuteFromLIDs,
N,
mynnz,
MyPID,
CSR_rowptr (),
CSR_colind_GID (),
Teuchos::av_reinterpret_cast<impl_scalar_type> (CSR_vals ()),
SourcePids (),
TargetPids);
CSR_colind_LID.resize (CSR_colind_GID.size());
size_t mynnz = CSR_vals.size();

// On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
// owned entries. Convert them to the actual PID.
// JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for.
for(size_t i=0; i<static_cast<size_t>(TargetPids.size()); i++)
{
if(TargetPids[i] == -1) TargetPids[i] = MyPID;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "Kokkos_DualView.hpp"
#include "Tpetra_CrsMatrix_fwd.hpp"
#include "Tpetra_DistObject_decl.hpp"
#include "Tpetra_Details_DefaultTypes.hpp"

/// \file Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp
/// \brief Declaration of functions for unpacking the entries of a
Expand Down Expand Up @@ -213,10 +214,6 @@ unpackAndCombineWithOwningPIDsCount (

/// \brief unpackAndCombineIntoCrsArrays
///
/// \note You should call unpackAndCombineWithOwningPIDsCount first
/// and allocate all arrays accordingly, before calling this
/// function.
///
/// Note: The SourcePids vector (on input) should contain owning PIDs
/// for each column in the (source) ColMap, as from
/// Tpetra::Import_Util::getPids, with the "-1 for local" option being
Expand All @@ -225,27 +222,43 @@ unpackAndCombineWithOwningPIDsCount (
/// Note: The TargetPids vector (on output) will contain owning PIDs
/// for each entry in the matrix, with the "-1 for local" for locally
/// owned entries.
///
/// Note: This method does the work previously done in unpackAndCombineWithOwningPIDsCount,
/// namely, calculating the local number of nonzeros, and allocates CRS
/// arrays of the correct sizes.

template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
void
unpackAndCombineIntoCrsArrays (
const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
const Teuchos::ArrayView<const char>& imports,
const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
const size_t constantNumPackets,
const CombineMode combineMode,
const Kokkos::View<LocalOrdinal const *,
Kokkos::Device<typename Node::device_type::execution_space,
Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
void, void>,
const Kokkos::View<const char*,
Kokkos::Device<typename Node::device_type::execution_space,
Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
,void, void >,
const Kokkos::View<const size_t*,
Kokkos::Device<typename Node::device_type::execution_space,
Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
,void, void >,
const size_t numSameIDs,
const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
const Kokkos::View<LocalOrdinal const *,
Kokkos::Device<typename Node::device_type::execution_space,
Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
void, void>,
const Kokkos::View<LocalOrdinal const *,
Kokkos::Device<typename Node::device_type::execution_space,
Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
void, void>,
size_t TargetNumRows,
size_t TargetNumNonzeros,
const int MyTargetPID,
const Teuchos::ArrayView<size_t>& CRS_rowptr,
const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
const Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>& CRS_vals,
Teuchos::ArrayRCP<size_t>& CRS_rowptr,
Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
Teuchos::ArrayRCP<Scalar>& CRS_vals,
const Teuchos::ArrayView<const int>& SourcePids,
Teuchos::Array<int>& TargetPids);

} // namespace Details
} // namespace Tpetra

Expand Down
Loading

0 comments on commit 2e32706

Please sign in to comment.