From 2cb954c43174242e14917a2817798bee5350718f Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Fri, 16 Jun 2023 10:55:41 -0600 Subject: [PATCH 1/8] Tpetra: replace device type of resulting view This change makes usage of device type in unpackAndCombineWithOwningPIDsCount consistent with unpackAndCombineIntoCrsArrays for identical input parameters. --- .../src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp index aafdde2d536d..87a21708efa5 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp @@ -1367,7 +1367,6 @@ unpackAndCombineWithOwningPIDsCount ( using Kokkos::MemoryUnmanaged; using Kokkos::View; typedef typename Node::device_type DT; - typedef typename DistObject::buffer_device_type BDT; const char prefix[] = "unpackAndCombineWithOwningPIDsCount: "; TEUCHOS_TEST_FOR_EXCEPTION @@ -1392,12 +1391,12 @@ unpackAndCombineWithOwningPIDsCount ( permuteFromLIDs.size (), true, "permute_from_lids"); auto imports_d = - create_mirror_view_from_raw_host_array (BDT (), + create_mirror_view_from_raw_host_array (DT (), imports.getRawPtr (), imports.size (), true, "imports"); auto num_packets_per_lid_d = - create_mirror_view_from_raw_host_array (BDT (), + create_mirror_view_from_raw_host_array (DT (), numPacketsPerLID.getRawPtr (), numPacketsPerLID.size (), true, "num_packets_per_lid"); From c0cd874c3f4f1954a9ecfe7ed1c1551e170b5bed Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Wed, 21 Jun 2023 07:32:30 -0600 Subject: [PATCH 2/8] MueLu: update lumped diagonal code Add additional timers Optimization: only do deep copies if required by user option --- .../src/Utils/MueLu_UtilitiesBase_def.hpp | 26 +++++++++++++++---- 1 file changed, 21 
insertions(+), 5 deletions(-) diff --git a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp index 1fe2f6df1b7f..4ce2ea8babfe 100644 --- a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp +++ b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp @@ -342,6 +342,7 @@ namespace MueLu { diag = Xpetra::VectorFactory::Build(rowMap,true); if(rowMap->lib() == Xpetra::UnderlyingLib::UseTpetra) { + Teuchos::TimeMonitor MM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase::GetLumpedMatrixDiagonal (Kokkos implementation)"); // Implement using Kokkos using local_vector_type = typename Vector::dual_view_type::t_dev_um; using local_matrix_type = typename Matrix::local_matrix_type; @@ -366,6 +367,8 @@ namespace MueLu { Kokkos::View avgAbsDiagVal_dev("avgAbsDiagVal"); Kokkos::View numDiagsEqualToOne_dev("numDiagsEqualToOne"); + { + Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: parallel_for (doReciprocal)"); Kokkos::parallel_for("GetLumpedMatrixDiagonal", my_policy, KOKKOS_LAMBDA(const int rowIdx) { diag_dev(rowIdx, 0) = KAT_S::zero(); @@ -387,15 +390,25 @@ namespace MueLu { } }); - typename Kokkos::View::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev); - Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev); - int numDiagsEqualToOne; - Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev); - + } if (useAverageAbsDiagVal) { + Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: useAverageAbsDiagVal"); + typename Kokkos::View::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev); + { + Teuchos::TimeMonitor MMMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: deep_copy 1"); + Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev); + } + int numDiagsEqualToOne; + { + Teuchos::TimeMonitor MMMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: deep_copy 2"); + 
Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev); + } + tol = TST::magnitude(100 * Teuchos::ScalarTraits::eps()) * (avgAbsDiagVal()-numDiagsEqualToOne) / (rowMap->getLocalNumElements()-numDiagsEqualToOne); } + { + Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("ComputeLumpedDiagonalInverse: parallel_for (doReciprocal)"); Kokkos::parallel_for("ComputeLumpedDiagonalInverse", my_policy, KOKKOS_LAMBDA(const int rowIdx) { if (replaceSingleEntryRowWithZero && nnzPerRow(rowIdx) <= 1) { @@ -410,8 +423,10 @@ namespace MueLu { } } }); + } } else { + Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: parallel_for"); Kokkos::parallel_for("GetLumpedMatrixDiagonal", my_policy, KOKKOS_LAMBDA(const int rowIdx) { diag_dev(rowIdx, 0) = KAT_S::zero(); @@ -424,6 +439,7 @@ namespace MueLu { } } else { // Implement using Teuchos + Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase: GetLumpedMatrixDiagonal: (Teuchos implementation)"); ArrayRCP diagVals = diag->getDataNonConst(0); Teuchos::Array regSum(diag->getLocalLength()); Teuchos::ArrayView cols; From 51e72c91b50f85a55a0010b0fe98865d277025cc Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Tue, 20 Jun 2023 12:49:39 -0600 Subject: [PATCH 3/8] Tpetra: merge two TAFC methods Merges Tpetra_CrsMatrix methods unpackAndCombineWithOwningPIDsCount and unpackAndCombineIntoCrsArrays. Reduces number of deep copies. Part of larger effort to have TAFC run on device. Temporary change in Tpetra_CrsMatrix_def.hpp: destMat->numImportPacketsPerLID_.modify_host() because numImportPacketsPerLID_ is a Kokkos::DualView and hasn't been properly marked as modified on host Addresses #11693 and #11694. 
--- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 59 +++- ...Details_unpackCrsMatrixAndCombine_decl.hpp | 33 +++ ..._Details_unpackCrsMatrixAndCombine_def.hpp | 271 +++++++++++++++++- .../ImportExport2/ImportExport2_UnitTests.cpp | 41 +++ 4 files changed, 394 insertions(+), 10 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 82ab73577c93..92837a8b1410 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -4534,7 +4534,7 @@ CrsMatrix:: ); this->checkInternalState (); } - } + } //fillComplete(domainMap, rangeMap, params) template void @@ -8450,11 +8450,19 @@ CrsMatrix:: } } - /*********************************************************************/ /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/ /*********************************************************************/ // Backwards compatibility measure. We'll use this again below. + + // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been) + // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits(). + // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device. 
+ destMat->numImportPacketsPerLID_.modify_host(); //FIXME + +#define TPETRA_NEW_TAFC_UNPACK_AND_COMBINE +#ifndef TPETRA_NEW_TAFC_UNPACK_AND_COMBINE + #ifdef HAVE_TPETRA_MMM_TIMINGS RCP tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize")))); #endif @@ -8538,9 +8546,56 @@ CrsMatrix:: Teuchos::av_reinterpret_cast (CSR_vals ()), SourcePids (), TargetPids); +#else +# ifdef HAVE_TPETRA_MMM_TIMINGS + RCP tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data")))); +# endif + ArrayRCP CSR_rowptr; + ArrayRCP CSR_colind_GID; + ArrayRCP CSR_colind_LID; + ArrayRCP CSR_vals; + + destMat->imports_.sync_device (); + destMat->numImportPacketsPerLID_.sync_device (); + + size_t N = BaseRowMap->getLocalNumElements (); + + TEUCHOS_TEST_FOR_EXCEPTION + (destMat->numImportPacketsPerLID_.need_sync_device(), std::logic_error, "The " + "input Kokkos::DualView was most recently modified on host, but TAFC " + "needs the device view of the data to be the most recently modified."); + + Details::unpackAndCombineIntoCrsArrays_new( + *this, + RemoteLIDs, + destMat->imports_.view_device(), //hostImports + destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID + NumSameIDs, + PermuteToLIDs, + PermuteFromLIDs, + N, + MyPID, + CSR_rowptr, + CSR_colind_GID, + CSR_vals, + SourcePids(), + TargetPids); + + // If LO and GO are the same, we can reuse memory when + // converting the column indices from global to local indices. + if (typeid (LO) == typeid (GO)) { + CSR_colind_LID = Teuchos::arcp_reinterpret_cast (CSR_colind_GID); + } + else { + CSR_colind_LID.resize (CSR_colind_GID.size()); + } + CSR_colind_LID.resize (CSR_colind_GID.size()); + size_t mynnz = CSR_vals.size(); +#endif //ifndef TPETRA_NEW_TAFC_UNPACK_AND_COMBINE ... else // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally // owned entries. 
Convert them to the actual PID. + // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays_new with a parallel_for. for(size_t i=0; i(TargetPids.size()); i++) { if(TargetPids[i] == -1) TargetPids[i] = MyPID; diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp index 349a1fa0ca86..9e0269950f67 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp @@ -246,6 +246,39 @@ unpackAndCombineIntoCrsArrays ( const Teuchos::ArrayView& SourcePids, Teuchos::Array& TargetPids); + +/// \brief unpackAndCombineIntoCrsArrays_new +/// +/// Note: The SourcePids vector (on input) should contain owning PIDs +/// for each column in the (source) ColMap, as from +/// Tpetra::Import_Util::getPids, with the "-1 for local" option being +/// used. +/// +/// Note: The TargetPids vector (on output) will contain owning PIDs +/// for each entry in the matrix, with the "-1 for local" for locally +/// owned entries. +/// +/// Note: This method does the work of unpackAndCombineWithOwningPIDsCount, +/// namely, calculating the local number of nonzeros, and allocates CRS +/// arrays of the correct sizes. 
+template +void +unpackAndCombineIntoCrsArrays_new ( + const CrsMatrix & sourceMatrix, + const Teuchos::ArrayView& importLIDs, + const Kokkos::View& imports_d, + const Kokkos::View& num_packets_per_lid_d, + const size_t numSameIDs, + const Teuchos::ArrayView& permuteToLIDs, + const Teuchos::ArrayView& permuteFromLIDs, + size_t TargetNumRows, + const int MyTargetPID, + Teuchos::ArrayRCP& CRS_rowptr, + Teuchos::ArrayRCP& CRS_colind, + Teuchos::ArrayRCP& CRS_vals, + const Teuchos::ArrayView& SourcePids, + Teuchos::Array& TargetPids); + } // namespace Details } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp index 87a21708efa5..9f1597d50d9d 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp @@ -44,6 +44,7 @@ #include "Teuchos_Array.hpp" #include "Teuchos_ArrayView.hpp" #include "Teuchos_OrdinalTraits.hpp" +#include "Teuchos_TimeMonitor.hpp" #include "Tpetra_Details_castAwayConstDualView.hpp" #include "Tpetra_Details_computeOffsets.hpp" #include "Tpetra_Details_createMirrorView.hpp" @@ -166,7 +167,7 @@ unpackRow(const typename PackTraits::output_array_type& gids_out, return 24; // error code } return 0; // no errors -} +} //unpackRow /// \brief Unpacks and combines a single row of the CrsMatrix. /// @@ -419,7 +420,7 @@ struct UnpackCrsMatrixAndCombineFunctor { return error_code_h(); } -}; +}; //UnpackCrsMatrixAndCombineFunctor struct MaxNumEntTag {}; struct TotNumEntTag {}; @@ -489,7 +490,7 @@ class NumEntriesFunctor { tot_num_ent += static_cast (num_ent_LO); } } -}; +}; //NumEntriesFunctor /// \brief Maximum number of entries in any row of the packed matrix. 
/// @@ -739,7 +740,7 @@ unpackAndCombineIntoCrsMatrix( std::runtime_error, prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code ); -} +} //unpackAndCombineIntoCrsMatrix (Kokkos version) template size_t @@ -797,7 +798,7 @@ unpackAndCombineWithOwningPIDsCount( } return count; -} +} //unpackAndCombineWithOwningPIDsCount (Kokkos version) /// \brief Setup row pointers for remotes template @@ -1367,6 +1368,7 @@ unpackAndCombineWithOwningPIDsCount ( using Kokkos::MemoryUnmanaged; using Kokkos::View; typedef typename Node::device_type DT; + typedef typename DT::execution_space execution_space; const char prefix[] = "unpackAndCombineWithOwningPIDsCount: "; TEUCHOS_TEST_FOR_EXCEPTION @@ -1404,7 +1406,7 @@ unpackAndCombineWithOwningPIDsCount ( return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount( local_matrix, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs); -} +} //unpackAndCombineWithOwningPIDsCount (Teuchos::Array version) /// \brief unpackAndCombineIntoCrsArrays /// @@ -1462,6 +1464,8 @@ unpackAndCombineIntoCrsArrays ( const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: "; + std::cout << "PID " << MyTargetPID << ": " << "CRS_rowptr.size() = " << CRS_rowptr.size () << ", TargetNumRows+1 = " << TargetNumRows+1 << std::endl; + TEUCHOS_TEST_FOR_EXCEPTION( TargetNumRows + 1 != static_cast (CRS_rowptr.size ()), std::invalid_argument, prefix << "CRS_rowptr.size() = " << @@ -1613,7 +1617,242 @@ unpackAndCombineIntoCrsArrays ( TargetPids.getRawPtr(), TargetPids.size()); // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR deep_copy(execution_space(), tgt_pids_h, tgt_pids_d); -} +} //unpackAndCombineIntoCrsArrays (Teuchos::Array version) + +template +void +unpackAndCombineIntoCrsArrays_new ( + const CrsMatrix & sourceMatrix, + const Teuchos::ArrayView& importLIDs, + const Kokkos::View& imports_d, + const Kokkos::View& num_packets_per_lid_d, + const size_t numSameIDs, + const Teuchos::ArrayView& 
permuteToLIDs, + const Teuchos::ArrayView& permuteFromLIDs, + size_t TargetNumRows, + const int MyTargetPID, + Teuchos::ArrayRCP& CRS_rowptr, + Teuchos::ArrayRCP& CRS_colind, + Teuchos::ArrayRCP& CRS_vals, + const Teuchos::ArrayView& SourcePids, + Teuchos::Array& TargetPids) +{ + using execution_space = typename Node::execution_space; + using Tpetra::Details::PackTraits; + + using Kokkos::View; + using Kokkos::deep_copy; + + using Teuchos::ArrayView; + using Teuchos::outArg; + using Teuchos::REDUCE_MAX; + using Teuchos::reduceAll; + + typedef LocalOrdinal LO; + + typedef typename Node::device_type DT; + + typedef CrsMatrix matrix_type; + typedef typename matrix_type::impl_scalar_type ST; + typedef typename ArrayView::size_type size_type; + + const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: "; +# ifdef HAVE_TPETRA_MMM_TIMINGS + using Teuchos::TimeMonitor; + Teuchos::RCP tm; +# endif + + using Kokkos::MemoryUnmanaged; + + TEUCHOS_TEST_FOR_EXCEPTION + (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument, + prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != " + "permuteFromLIDs.size() = " << permuteFromLIDs.size() << "."); + // FIXME (mfh 26 Jan 2015) If there are no entries on the calling + // process, then the matrix is neither locally nor globally indexed. + const bool locallyIndexed = sourceMatrix.isLocallyIndexed (); + TEUCHOS_TEST_FOR_EXCEPTION + (! 
locallyIndexed, std::invalid_argument, prefix << "The input " + "CrsMatrix 'sourceMatrix' must be locally indexed."); + TEUCHOS_TEST_FOR_EXCEPTION + (((size_t)importLIDs.size ()) != num_packets_per_lid_d.size (), std::invalid_argument, + prefix << "importLIDs.size() = " << importLIDs.size () << " != " + "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << "."); + + auto local_matrix = sourceMatrix.getLocalMatrixDevice (); + auto permute_from_lids_d = + create_mirror_view_from_raw_host_array (DT (), + permuteFromLIDs.getRawPtr (), + permuteFromLIDs.size (), true, + "permute_from_lids"); + + // TargetNumNonzeros is number of nonzeros in local matrix. +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount")))); +# endif + size_t TargetNumNonzeros = + UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount( + local_matrix, permute_from_lids_d, imports_d, + num_packets_per_lid_d, numSameIDs); +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::null; +# endif + +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers")))); +# endif + CRS_rowptr.resize (TargetNumRows+1); + CRS_colind.resize(TargetNumNonzeros); + CRS_vals.resize(TargetNumNonzeros); + Teuchos::ArrayRCP const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast(CRS_vals); +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::null; +# endif + + TEUCHOS_TEST_FOR_EXCEPTION( + permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument, + prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () + << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."); + + // Preseed TargetPids with -1 for local + if (static_cast (TargetPids.size ()) != TargetNumNonzeros) { + TargetPids.resize (TargetNumNonzeros); + } + TargetPids.assign (TargetNumNonzeros, -1); + + // Grab pointers for 
sourceMatrix + auto local_col_map = sourceMatrix.getColMap()->getLocalMap(); + +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs")))); +# endif + // Convert input arrays to Kokkos::Views + DT outputDevice; + auto import_lids_d = + create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(), + importLIDs.size(), true, "import_lids"); + + auto permute_to_lids_d = + create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(), + permuteToLIDs.size(), true, "permute_to_lids"); + + auto crs_rowptr_d = + create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(), + CRS_rowptr.size(), true, "crs_rowptr"); + + auto crs_colind_d = + create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(), + CRS_colind.size(), true, "crs_colidx"); +#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE + static_assert (! std::is_same< + typename std::remove_const< + typename std::decay< + decltype (CRS_vals_impl_scalar_type) + >::type::value_type + >::type, + std::complex >::value, + "CRS_vals::value_type is std::complex; this should never happen" + ", since std::complex does not work in Kokkos::View objects."); +#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE + + auto crs_vals_d = + create_mirror_view_from_raw_host_array(outputDevice, CRS_vals_impl_scalar_type.getRawPtr(), + CRS_vals_impl_scalar_type.size(), true, "crs_vals"); + +#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE + static_assert (! 
std::is_same< + typename decltype (crs_vals_d)::non_const_value_type, + std::complex >::value, + "crs_vals_d::non_const_value_type is std::complex; this should " + "never happen, since std::complex does not work in Kokkos::View objects."); +#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE + + auto src_pids_d = + create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(), + SourcePids.size(), true, "src_pids"); + + auto tgt_pids_d = + create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(), + TargetPids.size(), true, "tgt_pids"); + +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::null; +# endif + + size_t bytes_per_value = 0; + if (PackTraits::compileTimeSize) { + // assume that ST is default constructible + bytes_per_value = PackTraits::packValueCount(ST()); + } + else { + // Since the packed data come from the source matrix, we can use the source + // matrix to get the number of bytes per Scalar value stored in the matrix. + // This assumes that all Scalar values in the source matrix require the same + // number of bytes. If the source matrix has no entries on the calling + // process, then we hope that some process does have some idea how big + // a Scalar value is. Of course, if no processes have any entries, then no + // values should be packed (though this does assume that in our packing + // scheme, rows with zero entries take zero bytes). + size_t bytes_per_value_l = 0; + if (local_matrix.values.extent(0) > 0) { + const ST& val = local_matrix.values(0); + bytes_per_value_l = PackTraits::packValueCount(val); + } else { + const ST& val = crs_vals_d(0); + bytes_per_value_l = PackTraits::packValueCount(val); + } + Teuchos::reduceAll(*(sourceMatrix.getComm()), + Teuchos::REDUCE_MAX, + bytes_per_value_l, + outArg(bytes_per_value)); + } + +#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE + static_assert (! 
std::is_same< + typename decltype (crs_vals_d)::non_const_value_type, + std::complex >::value, + "crs_vals_d::non_const_value_type is std::complex; this should " + "never happen, since std::complex does not work in Kokkos::View objects."); +#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE + +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays")))); +# endif + UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays( + local_matrix, local_col_map, import_lids_d, imports_d, + num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d, + crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d, + numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID, + bytes_per_value); +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::null; +# endif + + // Copy outputs back to host +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host")))); +# endif + typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h( + CRS_rowptr.getRawPtr(), CRS_rowptr.size()); + // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR + deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d); + + typename decltype(crs_colind_d)::HostMirror crs_colind_h( + CRS_colind.getRawPtr(), CRS_colind.size()); + // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR + deep_copy(execution_space(), crs_colind_h, crs_colind_d); + + typename decltype(crs_vals_d)::HostMirror crs_vals_h( + CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size()); + // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR + deep_copy(execution_space(), crs_vals_h, crs_vals_d); + + typename decltype(tgt_pids_d)::HostMirror tgt_pids_h( + TargetPids.getRawPtr(), TargetPids.size()); + // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR + deep_copy(execution_space(), tgt_pids_h, tgt_pids_d); + +} //unpackAndCombineIntoCrsArrays_new } // namespace Details } // namespace Tpetra @@ -1664,6 +1903,22 @@ 
unpackAndCombineIntoCrsArrays ( CombineMode, \ size_t, \ const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&); + const Teuchos::ArrayView&); \ + template void \ + Details::unpackAndCombineIntoCrsArrays_new ( \ + const CrsMatrix &, \ + const Teuchos::ArrayView&, \ + const Kokkos::View&, \ + const Kokkos::View&, \ + const size_t, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + size_t, \ + const int, \ + Teuchos::ArrayRCP&, \ + Teuchos::ArrayRCP&, \ + Teuchos::ArrayRCP&, \ + const Teuchos::ArrayView&, \ + Teuchos::Array&); #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP diff --git a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp index 21643ed08573..687f55712a09 100644 --- a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp +++ b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp @@ -39,6 +39,8 @@ // ************************************************************************ // @HEADER +#include + #include #include @@ -459,6 +461,20 @@ namespace { } src_mat->fillComplete (); + RCP fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout)); + fos->setOutputToRootOnly(-1); + +#if 0 + fflush(stdout); + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "========\nsrc_mat\n========" << std::endl; + sleep(1); comm->barrier(); + src_mat->describe(*fos,Teuchos::VERB_EXTREME); + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "========\nend of src_mat\n========\n\n" << std::endl; + sleep(1); comm->barrier(); +#endif + // Create the importer Import importer (src_map, tgt_map, getImportParameterList ()); // Do the import, and fill-complete the target matrix. 
@@ -496,6 +512,9 @@ namespace { Teuchos::null, Teuchos::null, rcp(&dummy,false)); + //comm->barrier(); + //TEST_EQUALITY(1,1); + //return; // Make sure that A_tgt2's row Map is the same as tgt_map, and // is also the same as the Import's targetMap. They should have @@ -521,6 +540,25 @@ namespace { as (10) * ScalarTraits::eps (); typedef typename CrsMatrix::nonconst_local_inds_host_view_type lids_type; typedef typename CrsMatrix::nonconst_values_host_view_type vals_type; + +#if 0 + fflush(stdout); + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "tgt_mat\n========" << std::endl; + sleep(1); comm->barrier(); + A_tgt2->describe(*fos,Teuchos::VERB_EXTREME); + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "=======\nend of tgt_mat\n========\n\n" << std::endl; + sleep(1); comm->barrier(); + + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "A_tgt2\n========" << std::endl; + sleep(1); comm->barrier(); + A_tgt2->describe(*fos,Teuchos::VERB_EXTREME); + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "=======\nend of A_tgt2\n========" << std::endl; + sleep(1); comm->barrier(); + #endif lids_type tgtRowInds; vals_type tgtRowVals; @@ -560,6 +598,8 @@ namespace { typedef typename Array::size_type size_type; for (size_type k = 0; k < static_cast (tgtNumEntries); ++k) { TEST_EQUALITY(tgtRowInds[k], tgt2RowInds[k]); + out << "JHU: tgtRowInds[" << k << "]=" << tgtRowInds[k] + << ", tgt2RowInds[" << k << "] = " << tgt2RowInds[k] << std::endl; // The "out" and "success" variables should have been // automatically defined by the unit test framework, in case // you're wondering where they came from. 
@@ -2388,6 +2428,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, } using Tpetra::Details::unpackAndCombineIntoCrsArrays; + //JHU FIXME unpackAndCombineIntoCrsArrays ( *A, Importer->getRemoteLIDs (), From e097239cfafb6bd09d2f5c236be82c3727602907 Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Fri, 14 Jul 2023 11:25:27 -0600 Subject: [PATCH 4/8] Tpetra: remove obsolete TAFC method Replace unpackAndCombineIntoCrsArrays version that accepts only Teuchos Arrays/ArrayRCPs with one that accepts some Kokkos views. (The conversion isn't complete, as upstream/downstream changes are still to be done.) Part of #11693 and #11694. --- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 92 +------- ...Details_unpackCrsMatrixAndCombine_decl.hpp | 40 +--- ..._Details_unpackCrsMatrixAndCombine_def.hpp | 220 +----------------- 3 files changed, 6 insertions(+), 346 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 92837a8b1410..dc02633983dd 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -8460,93 +8460,6 @@ CrsMatrix:: // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device. 
destMat->numImportPacketsPerLID_.modify_host(); //FIXME -#define TPETRA_NEW_TAFC_UNPACK_AND_COMBINE -#ifndef TPETRA_NEW_TAFC_UNPACK_AND_COMBINE - -#ifdef HAVE_TPETRA_MMM_TIMINGS - RCP tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize")))); -#endif - destMat->numImportPacketsPerLID_.sync_host (); - Teuchos::ArrayView numImportPacketsPerLID = - getArrayViewFromDualView (destMat->numImportPacketsPerLID_); - destMat->imports_.sync_host (); - Teuchos::ArrayView hostImports = - getArrayViewFromDualView (destMat->imports_); - - if (verbose) { - std::ostringstream os; - os << *verbosePrefix << "Calling unpackAndCombineWithOwningPIDsCount" - << std::endl; - std::cerr << os.str (); - } - size_t mynnz = - unpackAndCombineWithOwningPIDsCount (*this, - RemoteLIDs, - hostImports, - numImportPacketsPerLID, - constantNumPackets, - INSERT, - NumSameIDs, - PermuteToLIDs, - PermuteFromLIDs); - if (verbose) { - std::ostringstream os; - os << *verbosePrefix << "unpackAndCombineWithOwningPIDsCount returned " - << mynnz << std::endl; - std::cerr << os.str (); - } - size_t N = BaseRowMap->getLocalNumElements (); - - // Allocations - ArrayRCP CSR_rowptr(N+1); - ArrayRCP CSR_colind_GID; - ArrayRCP CSR_colind_LID; - ArrayRCP CSR_vals; - CSR_colind_GID.resize (mynnz); - CSR_vals.resize (mynnz); - - // If LO and GO are the same, we can reuse memory when - // converting the column indices from global to local indices. 
- if (typeid (LO) == typeid (GO)) { - CSR_colind_LID = Teuchos::arcp_reinterpret_cast (CSR_colind_GID); - } - else { - CSR_colind_LID.resize (mynnz); - } -#ifdef HAVE_TPETRA_MMM_TIMINGS - tmCopySPRdata = Teuchos::null; - tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC copy same-perm-remote data")))); -#endif - - if (verbose) { - std::ostringstream os; - os << *verbosePrefix << "Calling unpackAndCombineIntoCrsArrays" - << std::endl; - std::cerr << os.str (); - } - // FIXME (mfh 15 May 2014) Why can't we abstract this out as an - // unpackAndCombine method on a "CrsArrays" object? This passing - // in a huge list of arrays is icky. Can't we have a bit of an - // abstraction? Implementing a concrete DistObject subclass only - // takes five methods. - unpackAndCombineIntoCrsArrays (*this, - RemoteLIDs, - hostImports, - numImportPacketsPerLID, - constantNumPackets, - INSERT, - NumSameIDs, - PermuteToLIDs, - PermuteFromLIDs, - N, - mynnz, - MyPID, - CSR_rowptr (), - CSR_colind_GID (), - Teuchos::av_reinterpret_cast (CSR_vals ()), - SourcePids (), - TargetPids); -#else # ifdef HAVE_TPETRA_MMM_TIMINGS RCP tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data")))); # endif @@ -8565,7 +8478,7 @@ CrsMatrix:: "input Kokkos::DualView was most recently modified on host, but TAFC " "needs the device view of the data to be the most recently modified."); - Details::unpackAndCombineIntoCrsArrays_new( + Details::unpackAndCombineIntoCrsArrays( *this, RemoteLIDs, destMat->imports_.view_device(), //hostImports @@ -8591,11 +8504,10 @@ CrsMatrix:: } CSR_colind_LID.resize (CSR_colind_GID.size()); size_t mynnz = CSR_vals.size(); -#endif //ifndef TPETRA_NEW_TAFC_UNPACK_AND_COMBINE ... else // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally // owned entries. Convert them to the actual PID. 
- // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays_new with a parallel_for. + // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for. for(size_t i=0; i(TargetPids.size()); i++) { if(TargetPids[i] == -1) TargetPids[i] = MyPID; diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp index 9e0269950f67..376a61171dc5 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp @@ -213,42 +213,6 @@ unpackAndCombineWithOwningPIDsCount ( /// \brief unpackAndCombineIntoCrsArrays /// -/// \note You should call unpackAndCombineWithOwningPIDsCount first -/// and allocate all arrays accordingly, before calling this -/// function. -/// -/// Note: The SourcePids vector (on input) should contain owning PIDs -/// for each column in the (source) ColMap, as from -/// Tpetra::Import_Util::getPids, with the "-1 for local" option being -/// used. -/// -/// Note: The TargetPids vector (on output) will contain owning PIDs -/// for each entry in the matrix, with the "-1 for local" for locally -/// owned entries. 
-template -void -unpackAndCombineIntoCrsArrays ( - const CrsMatrix & sourceMatrix, - const Teuchos::ArrayView& importLIDs, - const Teuchos::ArrayView& imports, - const Teuchos::ArrayView& numPacketsPerLID, - const size_t constantNumPackets, - const CombineMode combineMode, - const size_t numSameIDs, - const Teuchos::ArrayView& permuteToLIDs, - const Teuchos::ArrayView& permuteFromLIDs, - size_t TargetNumRows, - size_t TargetNumNonzeros, - const int MyTargetPID, - const Teuchos::ArrayView& CRS_rowptr, - const Teuchos::ArrayView& CRS_colind, - const Teuchos::ArrayView::impl_scalar_type>& CRS_vals, - const Teuchos::ArrayView& SourcePids, - Teuchos::Array& TargetPids); - - -/// \brief unpackAndCombineIntoCrsArrays_new -/// /// Note: The SourcePids vector (on input) should contain owning PIDs /// for each column in the (source) ColMap, as from /// Tpetra::Import_Util::getPids, with the "-1 for local" option being @@ -258,12 +222,12 @@ unpackAndCombineIntoCrsArrays ( /// for each entry in the matrix, with the "-1 for local" for locally /// owned entries. /// -/// Note: This method does the work of unpackAndCombineWithOwningPIDsCount, +/// Note: This method does the work previously done in unpackAndCombineWithOwningPIDsCount, /// namely, calculating the local number of nonzeros, and allocates CRS /// arrays of the correct sizes. 
template void -unpackAndCombineIntoCrsArrays_new ( +unpackAndCombineIntoCrsArrays ( const CrsMatrix & sourceMatrix, const Teuchos::ArrayView& importLIDs, const Kokkos::View& imports_d, diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp index 9f1597d50d9d..bcd983ff1564 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp @@ -1425,203 +1425,6 @@ unpackAndCombineWithOwningPIDsCount ( template void unpackAndCombineIntoCrsArrays ( - const CrsMatrix & sourceMatrix, - const Teuchos::ArrayView& importLIDs, - const Teuchos::ArrayView& imports, - const Teuchos::ArrayView& numPacketsPerLID, - const size_t /* constantNumPackets */, - const CombineMode /* combineMode */, - const size_t numSameIDs, - const Teuchos::ArrayView& permuteToLIDs, - const Teuchos::ArrayView& permuteFromLIDs, - size_t TargetNumRows, - size_t TargetNumNonzeros, - const int MyTargetPID, - const Teuchos::ArrayView& CRS_rowptr, - const Teuchos::ArrayView& CRS_colind, - const Teuchos::ArrayView::impl_scalar_type>& CRS_vals, - const Teuchos::ArrayView& SourcePids, - Teuchos::Array& TargetPids) -{ - using execution_space = typename Node::execution_space; - using Tpetra::Details::PackTraits; - - using Kokkos::View; - using Kokkos::deep_copy; - - using Teuchos::ArrayView; - using Teuchos::outArg; - using Teuchos::REDUCE_MAX; - using Teuchos::reduceAll; - - typedef LocalOrdinal LO; - - typedef typename Node::device_type DT; - - typedef CrsMatrix matrix_type; - typedef typename matrix_type::impl_scalar_type ST; - typedef typename ArrayView::size_type size_type; - - const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: "; - - std::cout << "PID " << MyTargetPID << ": " << "CRS_rowptr.size() = " << CRS_rowptr.size () << ", TargetNumRows+1 = " << TargetNumRows+1 << std::endl; - - 
TEUCHOS_TEST_FOR_EXCEPTION( - TargetNumRows + 1 != static_cast (CRS_rowptr.size ()), - std::invalid_argument, prefix << "CRS_rowptr.size() = " << - CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << "."); - - TEUCHOS_TEST_FOR_EXCEPTION( - permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument, - prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () - << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."); - const size_type numImportLIDs = importLIDs.size (); - - TEUCHOS_TEST_FOR_EXCEPTION( - numImportLIDs != numPacketsPerLID.size (), std::invalid_argument, - prefix << "importLIDs.size() = " << numImportLIDs << " != " - "numPacketsPerLID.size() = " << numPacketsPerLID.size() << "."); - - // Preseed TargetPids with -1 for local - if (static_cast (TargetPids.size ()) != TargetNumNonzeros) { - TargetPids.resize (TargetNumNonzeros); - } - TargetPids.assign (TargetNumNonzeros, -1); - - // Grab pointers for sourceMatrix - auto local_matrix = sourceMatrix.getLocalMatrixDevice(); - auto local_col_map = sourceMatrix.getColMap()->getLocalMap(); - - // Convert input arrays to Kokkos::View - DT outputDevice; - auto import_lids_d = - create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(), - importLIDs.size(), true, "import_lids"); - - auto imports_d = - create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(), - imports.size(), true, "imports"); - - auto num_packets_per_lid_d = - create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(), - numPacketsPerLID.size(), true, "num_packets_per_lid"); - - auto permute_from_lids_d = - create_mirror_view_from_raw_host_array(outputDevice, permuteFromLIDs.getRawPtr(), - permuteFromLIDs.size(), true, "permute_from_lids"); - - auto permute_to_lids_d = - create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(), - permuteToLIDs.size(), true, "permute_to_lids"); - - auto crs_rowptr_d = - 
create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(), - CRS_rowptr.size(), true, "crs_rowptr"); - - auto crs_colind_d = - create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(), - CRS_colind.size(), true, "crs_colidx"); - -#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE - static_assert (! std::is_same< - typename std::remove_const< - typename std::decay< - decltype (CRS_vals) - >::type::value_type - >::type, - std::complex >::value, - "CRS_vals::value_type is std::complex; this should never happen" - ", since std::complex does not work in Kokkos::View objects."); -#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE - - auto crs_vals_d = - create_mirror_view_from_raw_host_array(outputDevice, CRS_vals.getRawPtr(), - CRS_vals.size(), true, "crs_vals"); - -#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE - static_assert (! std::is_same< - typename decltype (crs_vals_d)::non_const_value_type, - std::complex >::value, - "crs_vals_d::non_const_value_type is std::complex; this should " - "never happen, since std::complex does not work in Kokkos::View objects."); -#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE - - auto src_pids_d = - create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(), - SourcePids.size(), true, "src_pids"); - - auto tgt_pids_d = - create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(), - TargetPids.size(), true, "tgt_pids"); - - size_t bytes_per_value = 0; - if (PackTraits::compileTimeSize) { - // assume that ST is default constructible - bytes_per_value = PackTraits::packValueCount(ST()); - } - else { - // Since the packed data come from the source matrix, we can use the source - // matrix to get the number of bytes per Scalar value stored in the matrix. - // This assumes that all Scalar values in the source matrix require the same - // number of bytes. If the source matrix has no entries on the calling - // process, then we hope that some process does have some idea how big - // a Scalar value is. 
Of course, if no processes have any entries, then no - // values should be packed (though this does assume that in our packing - // scheme, rows with zero entries take zero bytes). - size_t bytes_per_value_l = 0; - if (local_matrix.values.extent(0) > 0) { - const ST& val = local_matrix.values(0); - bytes_per_value_l = PackTraits::packValueCount(val); - } else { - const ST& val = crs_vals_d(0); - bytes_per_value_l = PackTraits::packValueCount(val); - } - Teuchos::reduceAll(*(sourceMatrix.getComm()), - Teuchos::REDUCE_MAX, - bytes_per_value_l, - outArg(bytes_per_value)); - } - -#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE - static_assert (! std::is_same< - typename decltype (crs_vals_d)::non_const_value_type, - std::complex >::value, - "crs_vals_d::non_const_value_type is std::complex; this should " - "never happen, since std::complex does not work in Kokkos::View objects."); -#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE - - UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays( - local_matrix, local_col_map, import_lids_d, imports_d, - num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d, - crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d, - numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID, - bytes_per_value); - - // Copy outputs back to host - typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h( - CRS_rowptr.getRawPtr(), CRS_rowptr.size()); - // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR - deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d); - - typename decltype(crs_colind_d)::HostMirror crs_colind_h( - CRS_colind.getRawPtr(), CRS_colind.size()); - // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR - deep_copy(execution_space(), crs_colind_h, crs_colind_d); - - typename decltype(crs_vals_d)::HostMirror crs_vals_h( - CRS_vals.getRawPtr(), CRS_vals.size()); - // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR - deep_copy(execution_space(), crs_vals_h, crs_vals_d); - - typename decltype(tgt_pids_d)::HostMirror tgt_pids_h( - 
TargetPids.getRawPtr(), TargetPids.size()); - // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR - deep_copy(execution_space(), tgt_pids_h, tgt_pids_d); -} //unpackAndCombineIntoCrsArrays (Teuchos::Array version) - -template -void -unpackAndCombineIntoCrsArrays_new ( const CrsMatrix & sourceMatrix, const Teuchos::ArrayView& importLIDs, const Kokkos::View& imports_d, @@ -1852,7 +1655,7 @@ unpackAndCombineIntoCrsArrays_new ( // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR deep_copy(execution_space(), tgt_pids_h, tgt_pids_d); -} //unpackAndCombineIntoCrsArrays_new +} //unpackAndCombineIntoCrsArrays } // namespace Details } // namespace Tpetra @@ -1874,25 +1677,6 @@ unpackAndCombineIntoCrsArrays_new ( const Kokkos::DualView::buffer_device_type>&, \ const size_t, \ const CombineMode); \ - template void \ - Details::unpackAndCombineIntoCrsArrays ( \ - const CrsMatrix &, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&, \ - const size_t, \ - const CombineMode, \ - const size_t, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&, \ - size_t, \ - size_t, \ - const int, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView::impl_scalar_type>&, \ - const Teuchos::ArrayView&, \ - Teuchos::Array&); \ template size_t \ Details::unpackAndCombineWithOwningPIDsCount ( \ const CrsMatrix &, \ @@ -1905,7 +1689,7 @@ unpackAndCombineIntoCrsArrays_new ( const Teuchos::ArrayView&, \ const Teuchos::ArrayView&); \ template void \ - Details::unpackAndCombineIntoCrsArrays_new ( \ + Details::unpackAndCombineIntoCrsArrays ( \ const CrsMatrix &, \ const Teuchos::ArrayView&, \ const Kokkos::View&, \ From 530518a19adbd1a2be527efa8ad989c00f2fbdf6 Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Fri, 14 Jul 2023 14:44:20 -0600 Subject: [PATCH 5/8] Tpetra: fix unit tests, eliminate more deep copies Update unpackAndCombineIntoCrsArrays unit test. 
Leverage the fact that Tpetra::Details::Transfer has methods for returning Kokkos::DualViews for remote, permuteTo, and permuteFrom LIDs. Part of #11693. --- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 26 ++++--- ...Details_unpackCrsMatrixAndCombine_decl.hpp | 11 +-- ..._Details_unpackCrsMatrixAndCombine_def.hpp | 49 ++++++-------- .../ImportExport2/ImportExport2_UnitTests.cpp | 67 ++++++++----------- 4 files changed, 68 insertions(+), 85 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index dc02633983dd..18928daa1fe6 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -7919,12 +7919,12 @@ CrsMatrix:: const size_t NumSameIDs = rowTransfer.getNumSameIDs(); ArrayView ExportLIDs = reverseMode ? rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs (); - ArrayView RemoteLIDs = reverseMode ? - rowTransfer.getExportLIDs () : rowTransfer.getRemoteLIDs (); - ArrayView PermuteToLIDs = reverseMode ? - rowTransfer.getPermuteFromLIDs () : rowTransfer.getPermuteToLIDs (); - ArrayView PermuteFromLIDs = reverseMode ? - rowTransfer.getPermuteToLIDs () : rowTransfer.getPermuteFromLIDs (); + auto RemoteLIDs = reverseMode ? + rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv(); + auto PermuteToLIDs = reverseMode ? + rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv(); + auto PermuteFromLIDs = reverseMode ? + rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv(); Distributor& Distor = rowTransfer.getDistributor (); // Owning PIDs @@ -8119,14 +8119,14 @@ CrsMatrix:: #endif if (constantNumPackets == 0) { destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (), - RemoteLIDs.size ()); + RemoteLIDs.view_host().size ()); } else { // There are a constant number of packets per element. 
We // already know (from the number of "remote" (incoming) // elements) how many incoming elements we expect, so we can // resize the buffer accordingly. - const size_t rbufLen = RemoteLIDs.size() * constantNumPackets; + const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets; destMat->reallocImportsIfNeeded (rbufLen, false, nullptr); } } @@ -8478,14 +8478,18 @@ CrsMatrix:: "input Kokkos::DualView was most recently modified on host, but TAFC " "needs the device view of the data to be the most recently modified."); + const Kokkos::View RemoteLIDs_d = RemoteLIDs.view_device(); + const Kokkos::View PermuteToLIDs_d = PermuteToLIDs.view_device(); + const Kokkos::View PermuteFromLIDs_d = PermuteFromLIDs.view_device(); + //auto PermuteToLIDs_d = PermuteToLIDs.view_device(); //FAILS Details::unpackAndCombineIntoCrsArrays( *this, - RemoteLIDs, + RemoteLIDs_d, destMat->imports_.view_device(), //hostImports destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID NumSameIDs, - PermuteToLIDs, - PermuteFromLIDs, + PermuteToLIDs_d, + PermuteFromLIDs_d, N, MyPID, CSR_rowptr, diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp index 376a61171dc5..a112950e37be 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp @@ -225,16 +225,17 @@ unpackAndCombineWithOwningPIDsCount ( /// Note: This method does the work previously done in unpackAndCombineWithOwningPIDsCount, /// namely, calculating the local number of nonzeros, and allocates CRS /// arrays of the correct sizes. 
+ template void unpackAndCombineIntoCrsArrays ( const CrsMatrix & sourceMatrix, - const Teuchos::ArrayView& importLIDs, - const Kokkos::View& imports_d, - const Kokkos::View& num_packets_per_lid_d, + const Kokkos::View, + const Kokkos::View, + const Kokkos::View, const size_t numSameIDs, - const Teuchos::ArrayView& permuteToLIDs, - const Teuchos::ArrayView& permuteFromLIDs, + const Kokkos::View, + const Kokkos::View, size_t TargetNumRows, const int MyTargetPID, Teuchos::ArrayRCP& CRS_rowptr, diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp index bcd983ff1564..433fce8a2371 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp @@ -1422,16 +1422,17 @@ unpackAndCombineWithOwningPIDsCount ( /// Note: The TargetPids vector (on output) will contain owning PIDs /// for each entry in the matrix, with the "-1 for local" for locally /// owned entries. 
+ template void unpackAndCombineIntoCrsArrays ( const CrsMatrix & sourceMatrix, - const Teuchos::ArrayView& importLIDs, - const Kokkos::View& imports_d, - const Kokkos::View& num_packets_per_lid_d, + const Kokkos::View import_lids_d, + const Kokkos::View imports_d, + const Kokkos::View num_packets_per_lid_d, const size_t numSameIDs, - const Teuchos::ArrayView& permuteToLIDs, - const Teuchos::ArrayView& permuteFromLIDs, + const Kokkos::View permute_to_lids_d, + const Kokkos::View permute_from_lids_d, size_t TargetNumRows, const int MyTargetPID, Teuchos::ArrayRCP& CRS_rowptr, @@ -1468,9 +1469,9 @@ unpackAndCombineIntoCrsArrays ( using Kokkos::MemoryUnmanaged; TEUCHOS_TEST_FOR_EXCEPTION - (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument, - prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != " - "permuteFromLIDs.size() = " << permuteFromLIDs.size() << "."); + (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument, + prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != " + "permute_from_lids_d.size() = " << permute_from_lids_d.size() << "."); // FIXME (mfh 26 Jan 2015) If there are no entries on the calling // process, then the matrix is neither locally nor globally indexed. const bool locallyIndexed = sourceMatrix.isLocallyIndexed (); @@ -1478,16 +1479,11 @@ unpackAndCombineIntoCrsArrays ( (! 
locallyIndexed, std::invalid_argument, prefix << "The input " "CrsMatrix 'sourceMatrix' must be locally indexed."); TEUCHOS_TEST_FOR_EXCEPTION - (((size_t)importLIDs.size ()) != num_packets_per_lid_d.size (), std::invalid_argument, - prefix << "importLIDs.size() = " << importLIDs.size () << " != " + (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument, + prefix << "import_lids_d.size() = " << import_lids_d.size () << " != " "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << "."); auto local_matrix = sourceMatrix.getLocalMatrixDevice (); - auto permute_from_lids_d = - create_mirror_view_from_raw_host_array (DT (), - permuteFromLIDs.getRawPtr (), - permuteFromLIDs.size (), true, - "permute_from_lids"); // TargetNumNonzeros is number of nonzeros in local matrix. # ifdef HAVE_TPETRA_MMM_TIMINGS @@ -1513,9 +1509,9 @@ unpackAndCombineIntoCrsArrays ( # endif TEUCHOS_TEST_FOR_EXCEPTION( - permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument, - prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () - << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."); + permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument, + prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size () + << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << "."); // Preseed TargetPids with -1 for local if (static_cast (TargetPids.size ()) != TargetNumNonzeros) { @@ -1531,13 +1527,6 @@ unpackAndCombineIntoCrsArrays ( # endif // Convert input arrays to Kokkos::Views DT outputDevice; - auto import_lids_d = - create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(), - importLIDs.size(), true, "import_lids"); - - auto permute_to_lids_d = - create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(), - permuteToLIDs.size(), true, "permute_to_lids"); auto crs_rowptr_d = create_mirror_view_from_raw_host_array(outputDevice, 
CRS_rowptr.getRawPtr(), @@ -1691,12 +1680,12 @@ unpackAndCombineIntoCrsArrays ( template void \ Details::unpackAndCombineIntoCrsArrays ( \ const CrsMatrix &, \ - const Teuchos::ArrayView&, \ - const Kokkos::View&, \ - const Kokkos::View&, \ + const Kokkos::View, \ + const Kokkos::View, \ + const Kokkos::View, \ const size_t, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&, \ + const Kokkos::View, \ + const Kokkos::View, \ size_t, \ const int, \ Teuchos::ArrayRCP&, \ diff --git a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp index 687f55712a09..ae9fec7ec3a0 100644 --- a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp +++ b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp @@ -598,8 +598,6 @@ namespace { typedef typename Array::size_type size_type; for (size_type k = 0; k < static_cast (tgtNumEntries); ++k) { TEST_EQUALITY(tgtRowInds[k], tgt2RowInds[k]); - out << "JHU: tgtRowInds[" << k << "]=" << tgtRowInds[k] - << ", tgt2RowInds[" << k << "] = " << tgt2RowInds[k] << std::endl; // The "out" and "success" variables should have been // automatically defined by the unit test framework, in case // you're wondering where they came from. @@ -2385,6 +2383,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, } Kokkos::View importsView(imports.data(), imports.size()); distor.doPostsAndWaits(exports.view_host(),numExportPackets(),importsView,numImportPackets()); + auto importsView_d = Kokkos::create_mirror_view(Node::device_type::memory_space(), importsView); + deep_copy(importsView_d,importsView); if (verbose) { std::ostringstream os; os << *prefix << "Done with 4-arg doPostsAndWaits" << std::endl; @@ -2393,33 +2393,13 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, ::Tpetra::Details::Behavior::enable_verbose_behavior (); - // Run the count... 
which should get the same NNZ as the traditional import - using Tpetra::Details::unpackAndCombineWithOwningPIDsCount; - size_t nnz2 = - unpackAndCombineWithOwningPIDsCount (*A, Importer->getRemoteLIDs (), - imports (), numImportPackets (), - constantNumPackets, - Tpetra::INSERT, - Importer->getNumSameIDs (), - Importer->getPermuteToLIDs (), - Importer->getPermuteFromLIDs ()); - if (verbose) { - std::ostringstream os; - os << *prefix << "Done with unpackAndCombineWithOwningPIDsCount; " - "nnz1=" << nnz1 << ", nnz2=" << nnz2 << std::endl; - std::cerr << os.str (); - } - - if(nnz1!=nnz2) test_err++; - total_err+=test_err; - ///////////////////////////////////////////////////////// // Test #2: Actual combine test ///////////////////////////////////////////////////////// - Teuchos::Array rowptr (MapTarget->getLocalNumElements () + 1); - Teuchos::Array colind (nnz2); - Teuchos::Array vals (nnz2); - Teuchos::Array TargetPids; + Teuchos::ArrayRCP rowptr; + Teuchos::ArrayRCP colind; + Teuchos::ArrayRCP vals; + Teuchos::Array TargetPids; if (verbose) { std::ostringstream os; @@ -2427,30 +2407,39 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, std::cerr << os.str (); } + auto numImportPacketsView_d = Kokkos::create_mirror_view(Node::device_type::memory_space(),numImportPacketsView); + deep_copy(numImportPacketsView_d,numImportPacketsView); + + + const Kokkos::View RemoteLIDs_d = Importer->getRemoteLIDs_dv().view_device(); + const Kokkos::View PermuteToLIDs_d = Importer->getPermuteToLIDs_dv().view_device(); + const Kokkos::View PermuteFromLIDs_d = Importer->getPermuteFromLIDs_dv().view_device(); + using Tpetra::Details::unpackAndCombineIntoCrsArrays; - //JHU FIXME unpackAndCombineIntoCrsArrays ( *A, - Importer->getRemoteLIDs (), - imports (), - numImportPackets (), - constantNumPackets, - Tpetra::INSERT, + RemoteLIDs_d, + importsView_d, + numImportPacketsView_d, Importer->getNumSameIDs (), - Importer->getPermuteToLIDs (), - 
Importer->getPermuteFromLIDs (), + PermuteToLIDs_d, + PermuteFromLIDs_d, MapTarget->getLocalNumElements (), - nnz2, MyPID, - rowptr (), - colind (), - Teuchos::av_reinterpret_cast (vals ()), + rowptr, + colind, + vals, SourcePids (), TargetPids); + size_t nnz2 = vals.size(); + if(nnz1!=nnz2) test_err++; + total_err+=test_err; + if (verbose) { std::ostringstream os; - os << *prefix << "Done with unpackAndCombineIntoCrsArrays" << std::endl; + os << *prefix << "Done with unpackAndCombineIntoCrsArrays; " + "nnz1=" << nnz1 << ", nnz2=" << nnz2 << std::endl; std::cerr << os.str (); } From 10d7dadd81af7d1ff54fe99920d4013489f88e14 Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Fri, 4 Aug 2023 17:35:48 -0600 Subject: [PATCH 6/8] MueLu: remove deep_copy timers In response to review of PR #12036. --- packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp index 4ce2ea8babfe..59d75dd1e796 100644 --- a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp +++ b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp @@ -394,15 +394,9 @@ namespace MueLu { if (useAverageAbsDiagVal) { Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: useAverageAbsDiagVal"); typename Kokkos::View::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev); - { - Teuchos::TimeMonitor MMMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: deep_copy 1"); Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev); - } int numDiagsEqualToOne; - { - Teuchos::TimeMonitor MMMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: deep_copy 2"); Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev); - } tol = TST::magnitude(100 * Teuchos::ScalarTraits::eps()) * (avgAbsDiagVal()-numDiagsEqualToOne) / (rowMap->getLocalNumElements()-numDiagsEqualToOne); } From 
35cb1253c2131b7041b2884899f658c0401c3a53 Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Fri, 4 Aug 2023 17:58:32 -0600 Subject: [PATCH 7/8] Tpetra: remove unnecessary exception test In response to review comment for PR #12036. --- packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 18928daa1fe6..00f45e2e3382 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -8473,11 +8473,6 @@ CrsMatrix:: size_t N = BaseRowMap->getLocalNumElements (); - TEUCHOS_TEST_FOR_EXCEPTION - (destMat->numImportPacketsPerLID_.need_sync_device(), std::logic_error, "The " - "input Kokkos::DualView was most recently modified on host, but TAFC " - "needs the device view of the data to be the most recently modified."); - const Kokkos::View RemoteLIDs_d = RemoteLIDs.view_device(); const Kokkos::View PermuteToLIDs_d = PermuteToLIDs.view_device(); const Kokkos::View PermuteFromLIDs_d = PermuteFromLIDs.view_device(); From f547f714261694f0dc04106b6b6817fa777ceb9a Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Fri, 1 Sep 2023 16:31:28 -0600 Subject: [PATCH 8/8] Tpetra: fix TAFC changes for UVM enabled --- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 8 +- ...Details_unpackCrsMatrixAndCombine_decl.hpp | 27 +++++-- ..._Details_unpackCrsMatrixAndCombine_def.hpp | 81 ++++++++++++++----- .../ImportExport2/ImportExport2_UnitTests.cpp | 7 +- 4 files changed, 87 insertions(+), 36 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 00f45e2e3382..3649b7dd6a1e 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -8473,10 +8473,10 @@ CrsMatrix:: size_t N = BaseRowMap->getLocalNumElements (); - const Kokkos::View RemoteLIDs_d = 
RemoteLIDs.view_device(); - const Kokkos::View PermuteToLIDs_d = PermuteToLIDs.view_device(); - const Kokkos::View PermuteFromLIDs_d = PermuteFromLIDs.view_device(); - //auto PermuteToLIDs_d = PermuteToLIDs.view_device(); //FAILS + auto RemoteLIDs_d = RemoteLIDs.view_device(); + auto PermuteToLIDs_d = PermuteToLIDs.view_device(); + auto PermuteFromLIDs_d = PermuteFromLIDs.view_device(); + Details::unpackAndCombineIntoCrsArrays( *this, RemoteLIDs_d, diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp index a112950e37be..31f48c464e32 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp @@ -45,6 +45,7 @@ #include "Kokkos_DualView.hpp" #include "Tpetra_CrsMatrix_fwd.hpp" #include "Tpetra_DistObject_decl.hpp" +#include "Tpetra_Details_DefaultTypes.hpp" /// \file Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp /// \brief Declaration of functions for unpacking the entries of a @@ -230,12 +231,27 @@ template & sourceMatrix, - const Kokkos::View, - const Kokkos::View, - const Kokkos::View, + const Kokkos::View>, + void, void>, + const Kokkos::View> + ,void, void >, + const Kokkos::View> + ,void, void >, const size_t numSameIDs, - const Kokkos::View, - const Kokkos::View, + const Kokkos::View>, + void, void>, + const Kokkos::View>, + void, void>, size_t TargetNumRows, const int MyTargetPID, Teuchos::ArrayRCP& CRS_rowptr, @@ -243,7 +259,6 @@ unpackAndCombineIntoCrsArrays ( Teuchos::ArrayRCP& CRS_vals, const Teuchos::ArrayView& SourcePids, Teuchos::Array& TargetPids); - } // namespace Details } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp index 433fce8a2371..d9a9591ee71e 100644 --- 
a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp @@ -40,7 +40,10 @@ #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP +#include +#include #include "TpetraCore_config.h" +#include "Kokkos_Core.hpp" #include "Teuchos_Array.hpp" #include "Teuchos_ArrayView.hpp" #include "Teuchos_OrdinalTraits.hpp" @@ -52,9 +55,7 @@ #include "Tpetra_Details_PackTraits.hpp" #include "Tpetra_CrsMatrix_decl.hpp" #include "Tpetra_Details_getEntryOnHost.hpp" -#include "Kokkos_Core.hpp" -#include -#include +#include "Tpetra_Details_DefaultTypes.hpp" /// \file Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp /// \brief Definition of functions for unpacking the entries of a @@ -747,8 +748,8 @@ size_t unpackAndCombineWithOwningPIDsCount( const LocalMatrix& local_matrix, const typename PackTraits::input_array_type permute_from_lids, - const Kokkos::View& imports, - const Kokkos::View& num_packets_per_lid, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid, const size_t num_same_ids) { using Kokkos::parallel_reduce; @@ -961,8 +962,8 @@ unpackAndCombineIntoCrsArrays2( const Kokkos::View& new_start_row, const typename PackTraits::input_array_type& offsets, const typename PackTraits::input_array_type& import_lids, - const Kokkos::View& imports, - const Kokkos::View& num_packets_per_lid, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid, const LocalMatrix& /* local_matrix */, const LocalMap /*& local_col_map*/, const int my_pid, @@ -1036,8 +1037,8 @@ unpackAndCombineIntoCrsArrays( const LocalMatrix & local_matrix, const LocalMap & local_col_map, const typename PackTraits::input_array_type& import_lids, - const Kokkos::View& imports, - const Kokkos::View& num_packets_per_lid, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid, const typename 
PackTraits::input_array_type& permute_to_lids, const typename PackTraits::input_array_type& permute_from_lids, const typename PackTraits::output_array_type& tgt_rowptr, @@ -1387,17 +1388,23 @@ unpackAndCombineWithOwningPIDsCount ( "numPacketsPerLID.size() = " << numPacketsPerLID.size () << "."); auto local_matrix = sourceMatrix.getLocalMatrixDevice (); - auto permute_from_lids_d = + + using kokkos_device_type = Kokkos::Device>; + + Kokkos::View permute_from_lids_d = create_mirror_view_from_raw_host_array (DT (), permuteFromLIDs.getRawPtr (), permuteFromLIDs.size (), true, "permute_from_lids"); - auto imports_d = + + Kokkos::View imports_d = create_mirror_view_from_raw_host_array (DT (), imports.getRawPtr (), imports.size (), true, "imports"); - auto num_packets_per_lid_d = + + Kokkos::View num_packets_per_lid_d = create_mirror_view_from_raw_host_array (DT (), numPacketsPerLID.getRawPtr (), numPacketsPerLID.size (), true, @@ -1427,12 +1434,27 @@ template & sourceMatrix, - const Kokkos::View import_lids_d, - const Kokkos::View imports_d, - const Kokkos::View num_packets_per_lid_d, + const Kokkos::View>, + void, void > import_lids_d, + const Kokkos::View>, + void, void > imports_d, + const Kokkos::View>, + void, void > num_packets_per_lid_d, const size_t numSameIDs, - const Kokkos::View permute_to_lids_d, - const Kokkos::View permute_from_lids_d, + const Kokkos::View>, + void, void > permute_to_lids_d, + const Kokkos::View>, + void, void > permute_from_lids_d, size_t TargetNumRows, const int MyTargetPID, Teuchos::ArrayRCP& CRS_rowptr, @@ -1680,12 +1702,27 @@ unpackAndCombineIntoCrsArrays ( template void \ Details::unpackAndCombineIntoCrsArrays ( \ const CrsMatrix &, \ - const Kokkos::View, \ - const Kokkos::View, \ - const Kokkos::View, \ + const Kokkos::View>,\ + void, void >, \ + const Kokkos::View>, \ + void, void >, \ + const Kokkos::View>, \ + void, void >, \ const size_t, \ - const Kokkos::View, \ - const Kokkos::View, \ + const Kokkos::View>, \ + void, void >, 
\ + const Kokkos::View>, \ + void, void >, \ size_t, \ const int, \ Teuchos::ArrayRCP&, \ diff --git a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp index ae9fec7ec3a0..7896c21f7a71 100644 --- a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp +++ b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp @@ -2410,10 +2410,9 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, auto numImportPacketsView_d = Kokkos::create_mirror_view(Node::device_type::memory_space(),numImportPacketsView); deep_copy(numImportPacketsView_d,numImportPacketsView); - - const Kokkos::View RemoteLIDs_d = Importer->getRemoteLIDs_dv().view_device(); - const Kokkos::View PermuteToLIDs_d = Importer->getPermuteToLIDs_dv().view_device(); - const Kokkos::View PermuteFromLIDs_d = Importer->getPermuteFromLIDs_dv().view_device(); + auto RemoteLIDs_d = Importer->getRemoteLIDs_dv().view_device(); + auto PermuteToLIDs_d = Importer->getPermuteToLIDs_dv().view_device(); + auto PermuteFromLIDs_d = Importer->getPermuteFromLIDs_dv().view_device(); using Tpetra::Details::unpackAndCombineIntoCrsArrays; unpackAndCombineIntoCrsArrays (