diff --git a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp index 1fe2f6df1b7f..59d75dd1e796 100644 --- a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp +++ b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp @@ -342,6 +342,7 @@ namespace MueLu { diag = Xpetra::VectorFactory::Build(rowMap,true); if(rowMap->lib() == Xpetra::UnderlyingLib::UseTpetra) { + Teuchos::TimeMonitor MM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase::GetLumpedMatrixDiagonal (Kokkos implementation)"); // Implement using Kokkos using local_vector_type = typename Vector::dual_view_type::t_dev_um; using local_matrix_type = typename Matrix::local_matrix_type; @@ -366,6 +367,8 @@ namespace MueLu { Kokkos::View avgAbsDiagVal_dev("avgAbsDiagVal"); Kokkos::View numDiagsEqualToOne_dev("numDiagsEqualToOne"); + { + Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: parallel_for (doReciprocal)"); Kokkos::parallel_for("GetLumpedMatrixDiagonal", my_policy, KOKKOS_LAMBDA(const int rowIdx) { diag_dev(rowIdx, 0) = KAT_S::zero(); @@ -387,15 +390,19 @@ namespace MueLu { } }); - typename Kokkos::View::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev); - Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev); - int numDiagsEqualToOne; - Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev); - + } if (useAverageAbsDiagVal) { + Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: useAverageAbsDiagVal"); + typename Kokkos::View::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev); + Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev); + int numDiagsEqualToOne; + Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev); + tol = TST::magnitude(100 * Teuchos::ScalarTraits::eps()) * (avgAbsDiagVal()-numDiagsEqualToOne) / (rowMap->getLocalNumElements()-numDiagsEqualToOne); } + { + Teuchos::TimeMonitor MMM = 
*Teuchos::TimeMonitor::getNewTimer("ComputeLumpedDiagonalInverse: parallel_for (doReciprocal)"); Kokkos::parallel_for("ComputeLumpedDiagonalInverse", my_policy, KOKKOS_LAMBDA(const int rowIdx) { if (replaceSingleEntryRowWithZero && nnzPerRow(rowIdx) <= 1) { @@ -410,8 +417,10 @@ namespace MueLu { } } }); + } } else { + Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: parallel_for"); Kokkos::parallel_for("GetLumpedMatrixDiagonal", my_policy, KOKKOS_LAMBDA(const int rowIdx) { diag_dev(rowIdx, 0) = KAT_S::zero(); @@ -424,6 +433,7 @@ namespace MueLu { } } else { // Implement using Teuchos + Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase: GetLumpedMatrixDiagonal: (Teuchos implementation)"); ArrayRCP diagVals = diag->getDataNonConst(0); Teuchos::Array regSum(diag->getLocalLength()); Teuchos::ArrayView cols; diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 82ab73577c93..3649b7dd6a1e 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -4534,7 +4534,7 @@ CrsMatrix:: ); this->checkInternalState (); } - } + } //fillComplete(domainMap, rangeMap, params) template void @@ -7919,12 +7919,12 @@ CrsMatrix:: const size_t NumSameIDs = rowTransfer.getNumSameIDs(); ArrayView ExportLIDs = reverseMode ? rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs (); - ArrayView RemoteLIDs = reverseMode ? - rowTransfer.getExportLIDs () : rowTransfer.getRemoteLIDs (); - ArrayView PermuteToLIDs = reverseMode ? - rowTransfer.getPermuteFromLIDs () : rowTransfer.getPermuteToLIDs (); - ArrayView PermuteFromLIDs = reverseMode ? - rowTransfer.getPermuteToLIDs () : rowTransfer.getPermuteFromLIDs (); + auto RemoteLIDs = reverseMode ? + rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv(); + auto PermuteToLIDs = reverseMode ? 
+ rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv(); + auto PermuteFromLIDs = reverseMode ? + rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv(); Distributor& Distor = rowTransfer.getDistributor (); // Owning PIDs @@ -8119,14 +8119,14 @@ CrsMatrix:: #endif if (constantNumPackets == 0) { destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (), - RemoteLIDs.size ()); + RemoteLIDs.view_host().size ()); } else { // There are a constant number of packets per element. We // already know (from the number of "remote" (incoming) // elements) how many incoming elements we expect, so we can // resize the buffer accordingly. - const size_t rbufLen = RemoteLIDs.size() * constantNumPackets; + const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets; destMat->reallocImportsIfNeeded (rbufLen, false, nullptr); } } @@ -8450,52 +8450,48 @@ CrsMatrix:: } } - /*********************************************************************/ /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/ /*********************************************************************/ // Backwards compatibility measure. We'll use this again below. 
-#ifdef HAVE_TPETRA_MMM_TIMINGS - RCP tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize")))); -#endif - destMat->numImportPacketsPerLID_.sync_host (); - Teuchos::ArrayView numImportPacketsPerLID = - getArrayViewFromDualView (destMat->numImportPacketsPerLID_); - destMat->imports_.sync_host (); - Teuchos::ArrayView hostImports = - getArrayViewFromDualView (destMat->imports_); - if (verbose) { - std::ostringstream os; - os << *verbosePrefix << "Calling unpackAndCombineWithOwningPIDsCount" - << std::endl; - std::cerr << os.str (); - } - size_t mynnz = - unpackAndCombineWithOwningPIDsCount (*this, - RemoteLIDs, - hostImports, - numImportPacketsPerLID, - constantNumPackets, - INSERT, - NumSameIDs, - PermuteToLIDs, - PermuteFromLIDs); - if (verbose) { - std::ostringstream os; - os << *verbosePrefix << "unpackAndCombineWithOwningPIDsCount returned " - << mynnz << std::endl; - std::cerr << os.str (); - } - size_t N = BaseRowMap->getLocalNumElements (); + // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been) + // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits(). + // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
+ destMat->numImportPacketsPerLID_.modify_host(); //FIXME - // Allocations - ArrayRCP CSR_rowptr(N+1); +# ifdef HAVE_TPETRA_MMM_TIMINGS + RCP tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data")))); +# endif + ArrayRCP CSR_rowptr; ArrayRCP CSR_colind_GID; ArrayRCP CSR_colind_LID; ArrayRCP CSR_vals; - CSR_colind_GID.resize (mynnz); - CSR_vals.resize (mynnz); + + destMat->imports_.sync_device (); + destMat->numImportPacketsPerLID_.sync_device (); + + size_t N = BaseRowMap->getLocalNumElements (); + + auto RemoteLIDs_d = RemoteLIDs.view_device(); + auto PermuteToLIDs_d = PermuteToLIDs.view_device(); + auto PermuteFromLIDs_d = PermuteFromLIDs.view_device(); + + Details::unpackAndCombineIntoCrsArrays( + *this, + RemoteLIDs_d, + destMat->imports_.view_device(), //hostImports + destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID + NumSameIDs, + PermuteToLIDs_d, + PermuteFromLIDs_d, + N, + MyPID, + CSR_rowptr, + CSR_colind_GID, + CSR_vals, + SourcePids(), + TargetPids); // If LO and GO are the same, we can reuse memory when // converting the column indices from global to local indices. @@ -8503,44 +8499,14 @@ CrsMatrix:: CSR_colind_LID = Teuchos::arcp_reinterpret_cast (CSR_colind_GID); } else { - CSR_colind_LID.resize (mynnz); - } -#ifdef HAVE_TPETRA_MMM_TIMINGS - tmCopySPRdata = Teuchos::null; - tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC copy same-perm-remote data")))); -#endif - - if (verbose) { - std::ostringstream os; - os << *verbosePrefix << "Calling unpackAndCombineIntoCrsArrays" - << std::endl; - std::cerr << os.str (); + CSR_colind_LID.resize (CSR_colind_GID.size()); } - // FIXME (mfh 15 May 2014) Why can't we abstract this out as an - // unpackAndCombine method on a "CrsArrays" object? This passing - // in a huge list of arrays is icky. Can't we have a bit of an - // abstraction? 
Implementing a concrete DistObject subclass only - // takes five methods. - unpackAndCombineIntoCrsArrays (*this, - RemoteLIDs, - hostImports, - numImportPacketsPerLID, - constantNumPackets, - INSERT, - NumSameIDs, - PermuteToLIDs, - PermuteFromLIDs, - N, - mynnz, - MyPID, - CSR_rowptr (), - CSR_colind_GID (), - Teuchos::av_reinterpret_cast (CSR_vals ()), - SourcePids (), - TargetPids); + CSR_colind_LID.resize (CSR_colind_GID.size()); + size_t mynnz = CSR_vals.size(); // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally // owned entries. Convert them to the actual PID. + // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for. for(size_t i=0; i(TargetPids.size()); i++) { if(TargetPids[i] == -1) TargetPids[i] = MyPID; diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp index 349a1fa0ca86..31f48c464e32 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp @@ -45,6 +45,7 @@ #include "Kokkos_DualView.hpp" #include "Tpetra_CrsMatrix_fwd.hpp" #include "Tpetra_DistObject_decl.hpp" +#include "Tpetra_Details_DefaultTypes.hpp" /// \file Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp /// \brief Declaration of functions for unpacking the entries of a @@ -213,10 +214,6 @@ unpackAndCombineWithOwningPIDsCount ( /// \brief unpackAndCombineIntoCrsArrays /// -/// \note You should call unpackAndCombineWithOwningPIDsCount first -/// and allocate all arrays accordingly, before calling this -/// function. 
-/// /// Note: The SourcePids vector (on input) should contain owning PIDs /// for each column in the (source) ColMap, as from /// Tpetra::Import_Util::getPids, with the "-1 for local" option being @@ -225,27 +222,43 @@ unpackAndCombineWithOwningPIDsCount ( /// Note: The TargetPids vector (on output) will contain owning PIDs /// for each entry in the matrix, with the "-1 for local" for locally /// owned entries. +/// +/// Note: This method does the work previously done in unpackAndCombineWithOwningPIDsCount, +/// namely, calculating the local number of nonzeros, and allocates CRS +/// arrays of the correct sizes. + template void unpackAndCombineIntoCrsArrays ( const CrsMatrix & sourceMatrix, - const Teuchos::ArrayView& importLIDs, - const Teuchos::ArrayView& imports, - const Teuchos::ArrayView& numPacketsPerLID, - const size_t constantNumPackets, - const CombineMode combineMode, + const Kokkos::View>, + void, void>, + const Kokkos::View> + ,void, void >, + const Kokkos::View> + ,void, void >, const size_t numSameIDs, - const Teuchos::ArrayView& permuteToLIDs, - const Teuchos::ArrayView& permuteFromLIDs, + const Kokkos::View>, + void, void>, + const Kokkos::View>, + void, void>, size_t TargetNumRows, - size_t TargetNumNonzeros, const int MyTargetPID, - const Teuchos::ArrayView& CRS_rowptr, - const Teuchos::ArrayView& CRS_colind, - const Teuchos::ArrayView::impl_scalar_type>& CRS_vals, + Teuchos::ArrayRCP& CRS_rowptr, + Teuchos::ArrayRCP& CRS_colind, + Teuchos::ArrayRCP& CRS_vals, const Teuchos::ArrayView& SourcePids, Teuchos::Array& TargetPids); - } // namespace Details } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp index aafdde2d536d..d9a9591ee71e 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp @@ -40,10 +40,14 
@@ #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP +#include +#include #include "TpetraCore_config.h" +#include "Kokkos_Core.hpp" #include "Teuchos_Array.hpp" #include "Teuchos_ArrayView.hpp" #include "Teuchos_OrdinalTraits.hpp" +#include "Teuchos_TimeMonitor.hpp" #include "Tpetra_Details_castAwayConstDualView.hpp" #include "Tpetra_Details_computeOffsets.hpp" #include "Tpetra_Details_createMirrorView.hpp" @@ -51,9 +55,7 @@ #include "Tpetra_Details_PackTraits.hpp" #include "Tpetra_CrsMatrix_decl.hpp" #include "Tpetra_Details_getEntryOnHost.hpp" -#include "Kokkos_Core.hpp" -#include -#include +#include "Tpetra_Details_DefaultTypes.hpp" /// \file Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp /// \brief Definition of functions for unpacking the entries of a @@ -166,7 +168,7 @@ unpackRow(const typename PackTraits::output_array_type& gids_out, return 24; // error code } return 0; // no errors -} +} //unpackRow /// \brief Unpacks and combines a single row of the CrsMatrix. /// @@ -419,7 +421,7 @@ struct UnpackCrsMatrixAndCombineFunctor { return error_code_h(); } -}; +}; //UnpackCrsMatrixAndCombineFunctor struct MaxNumEntTag {}; struct TotNumEntTag {}; @@ -489,7 +491,7 @@ class NumEntriesFunctor { tot_num_ent += static_cast (num_ent_LO); } } -}; +}; //NumEntriesFunctor /// \brief Maximum number of entries in any row of the packed matrix. 
/// @@ -739,15 +741,15 @@ unpackAndCombineIntoCrsMatrix( std::runtime_error, prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code ); -} +} //unpackAndCombineIntoCrsMatrix (Kokkos version) template size_t unpackAndCombineWithOwningPIDsCount( const LocalMatrix& local_matrix, const typename PackTraits::input_array_type permute_from_lids, - const Kokkos::View& imports, - const Kokkos::View& num_packets_per_lid, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid, const size_t num_same_ids) { using Kokkos::parallel_reduce; @@ -797,7 +799,7 @@ unpackAndCombineWithOwningPIDsCount( } return count; -} +} //unpackAndCombineWithOwningPIDsCount (Kokkos version) /// \brief Setup row pointers for remotes template @@ -960,8 +962,8 @@ unpackAndCombineIntoCrsArrays2( const Kokkos::View& new_start_row, const typename PackTraits::input_array_type& offsets, const typename PackTraits::input_array_type& import_lids, - const Kokkos::View& imports, - const Kokkos::View& num_packets_per_lid, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid, const LocalMatrix& /* local_matrix */, const LocalMap /*& local_col_map*/, const int my_pid, @@ -1035,8 +1037,8 @@ unpackAndCombineIntoCrsArrays( const LocalMatrix & local_matrix, const LocalMap & local_col_map, const typename PackTraits::input_array_type& import_lids, - const Kokkos::View& imports, - const Kokkos::View& num_packets_per_lid, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid, const typename PackTraits::input_array_type& permute_to_lids, const typename PackTraits::input_array_type& permute_from_lids, const typename PackTraits::output_array_type& tgt_rowptr, @@ -1367,7 +1369,7 @@ unpackAndCombineWithOwningPIDsCount ( using Kokkos::MemoryUnmanaged; using Kokkos::View; typedef typename Node::device_type DT; - typedef typename DistObject::buffer_device_type BDT; + typedef typename DT::execution_space execution_space; const char prefix[] = 
"unpackAndCombineWithOwningPIDsCount: "; TEUCHOS_TEST_FOR_EXCEPTION @@ -1386,18 +1388,24 @@ unpackAndCombineWithOwningPIDsCount ( "numPacketsPerLID.size() = " << numPacketsPerLID.size () << "."); auto local_matrix = sourceMatrix.getLocalMatrixDevice (); - auto permute_from_lids_d = + + using kokkos_device_type = Kokkos::Device>; + + Kokkos::View permute_from_lids_d = create_mirror_view_from_raw_host_array (DT (), permuteFromLIDs.getRawPtr (), permuteFromLIDs.size (), true, "permute_from_lids"); - auto imports_d = - create_mirror_view_from_raw_host_array (BDT (), + + Kokkos::View imports_d = + create_mirror_view_from_raw_host_array (DT (), imports.getRawPtr (), imports.size (), true, "imports"); - auto num_packets_per_lid_d = - create_mirror_view_from_raw_host_array (BDT (), + + Kokkos::View num_packets_per_lid_d = + create_mirror_view_from_raw_host_array (DT (), numPacketsPerLID.getRawPtr (), numPacketsPerLID.size (), true, "num_packets_per_lid"); @@ -1405,7 +1413,7 @@ unpackAndCombineWithOwningPIDsCount ( return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount( local_matrix, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs); -} +} //unpackAndCombineWithOwningPIDsCount (Teuchos::Array version) /// \brief unpackAndCombineIntoCrsArrays /// @@ -1421,24 +1429,37 @@ unpackAndCombineWithOwningPIDsCount ( /// Note: The TargetPids vector (on output) will contain owning PIDs /// for each entry in the matrix, with the "-1 for local" for locally /// owned entries. 
+ template void unpackAndCombineIntoCrsArrays ( const CrsMatrix & sourceMatrix, - const Teuchos::ArrayView& importLIDs, - const Teuchos::ArrayView& imports, - const Teuchos::ArrayView& numPacketsPerLID, - const size_t /* constantNumPackets */, - const CombineMode /* combineMode */, + const Kokkos::View>, + void, void > import_lids_d, + const Kokkos::View>, + void, void > imports_d, + const Kokkos::View>, + void, void > num_packets_per_lid_d, const size_t numSameIDs, - const Teuchos::ArrayView& permuteToLIDs, - const Teuchos::ArrayView& permuteFromLIDs, + const Kokkos::View>, + void, void > permute_to_lids_d, + const Kokkos::View>, + void, void > permute_from_lids_d, size_t TargetNumRows, - size_t TargetNumNonzeros, const int MyTargetPID, - const Teuchos::ArrayView& CRS_rowptr, - const Teuchos::ArrayView& CRS_colind, - const Teuchos::ArrayView::impl_scalar_type>& CRS_vals, + Teuchos::ArrayRCP& CRS_rowptr, + Teuchos::ArrayRCP& CRS_colind, + Teuchos::ArrayRCP& CRS_vals, const Teuchos::ArrayView& SourcePids, Teuchos::Array& TargetPids) { @@ -1461,23 +1482,58 @@ unpackAndCombineIntoCrsArrays ( typedef typename matrix_type::impl_scalar_type ST; typedef typename ArrayView::size_type size_type; - const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: "; + const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: "; +# ifdef HAVE_TPETRA_MMM_TIMINGS + using Teuchos::TimeMonitor; + Teuchos::RCP tm; +# endif - TEUCHOS_TEST_FOR_EXCEPTION( - TargetNumRows + 1 != static_cast (CRS_rowptr.size ()), - std::invalid_argument, prefix << "CRS_rowptr.size() = " << - CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << "."); + using Kokkos::MemoryUnmanaged; - TEUCHOS_TEST_FOR_EXCEPTION( - permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument, - prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () - << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."); - const size_type numImportLIDs = importLIDs.size 
(); + TEUCHOS_TEST_FOR_EXCEPTION + (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument, + prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != " + "permute_from_lids_d.size() = " << permute_from_lids_d.size() << "."); + // FIXME (mfh 26 Jan 2015) If there are no entries on the calling + // process, then the matrix is neither locally nor globally indexed. + const bool locallyIndexed = sourceMatrix.isLocallyIndexed (); + TEUCHOS_TEST_FOR_EXCEPTION + (! locallyIndexed, std::invalid_argument, prefix << "The input " + "CrsMatrix 'sourceMatrix' must be locally indexed."); + TEUCHOS_TEST_FOR_EXCEPTION + (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument, + prefix << "import_lids_d.size() = " << import_lids_d.size () << " != " + "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << "."); + + auto local_matrix = sourceMatrix.getLocalMatrixDevice (); + + // TargetNumNonzeros is number of nonzeros in local matrix. 
+# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount")))); +# endif + size_t TargetNumNonzeros = + UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount( + local_matrix, permute_from_lids_d, imports_d, + num_packets_per_lid_d, numSameIDs); +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::null; +# endif + +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers")))); +# endif + CRS_rowptr.resize (TargetNumRows+1); + CRS_colind.resize(TargetNumNonzeros); + CRS_vals.resize(TargetNumNonzeros); + Teuchos::ArrayRCP const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast(CRS_vals); +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::null; +# endif TEUCHOS_TEST_FOR_EXCEPTION( - numImportLIDs != numPacketsPerLID.size (), std::invalid_argument, - prefix << "importLIDs.size() = " << numImportLIDs << " != " - "numPacketsPerLID.size() = " << numPacketsPerLID.size() << "."); + permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument, + prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size () + << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << "."); // Preseed TargetPids with -1 for local if (static_cast (TargetPids.size ()) != TargetNumNonzeros) { @@ -1486,30 +1542,13 @@ unpackAndCombineIntoCrsArrays ( TargetPids.assign (TargetNumNonzeros, -1); // Grab pointers for sourceMatrix - auto local_matrix = sourceMatrix.getLocalMatrixDevice(); auto local_col_map = sourceMatrix.getColMap()->getLocalMap(); - // Convert input arrays to Kokkos::View +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs")))); +# endif + // Convert input arrays to Kokkos::Views DT outputDevice; - auto import_lids_d = - 
create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(), - importLIDs.size(), true, "import_lids"); - - auto imports_d = - create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(), - imports.size(), true, "imports"); - - auto num_packets_per_lid_d = - create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(), - numPacketsPerLID.size(), true, "num_packets_per_lid"); - - auto permute_from_lids_d = - create_mirror_view_from_raw_host_array(outputDevice, permuteFromLIDs.getRawPtr(), - permuteFromLIDs.size(), true, "permute_from_lids"); - - auto permute_to_lids_d = - create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(), - permuteToLIDs.size(), true, "permute_to_lids"); auto crs_rowptr_d = create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(), @@ -1518,12 +1557,11 @@ unpackAndCombineIntoCrsArrays ( auto crs_colind_d = create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(), CRS_colind.size(), true, "crs_colidx"); - #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE static_assert (! std::is_same< typename std::remove_const< typename std::decay< - decltype (CRS_vals) + decltype (CRS_vals_impl_scalar_type) >::type::value_type >::type, std::complex >::value, @@ -1532,8 +1570,8 @@ unpackAndCombineIntoCrsArrays ( #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE auto crs_vals_d = - create_mirror_view_from_raw_host_array(outputDevice, CRS_vals.getRawPtr(), - CRS_vals.size(), true, "crs_vals"); + create_mirror_view_from_raw_host_array(outputDevice, CRS_vals_impl_scalar_type.getRawPtr(), + CRS_vals_impl_scalar_type.size(), true, "crs_vals"); #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE static_assert (! 
std::is_same< @@ -1551,6 +1589,10 @@ unpackAndCombineIntoCrsArrays ( create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(), TargetPids.size(), true, "tgt_pids"); +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::null; +# endif + size_t bytes_per_value = 0; if (PackTraits::compileTimeSize) { // assume that ST is default constructible @@ -1587,14 +1629,23 @@ unpackAndCombineIntoCrsArrays ( "never happen, since std::complex does not work in Kokkos::View objects."); #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays")))); +# endif UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays( local_matrix, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID, bytes_per_value); +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::null; +# endif // Copy outputs back to host +# ifdef HAVE_TPETRA_MMM_TIMINGS + tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host")))); +# endif typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h( CRS_rowptr.getRawPtr(), CRS_rowptr.size()); // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR @@ -1606,7 +1657,7 @@ unpackAndCombineIntoCrsArrays ( deep_copy(execution_space(), crs_colind_h, crs_colind_d); typename decltype(crs_vals_d)::HostMirror crs_vals_h( - CRS_vals.getRawPtr(), CRS_vals.size()); + CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size()); // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR deep_copy(execution_space(), crs_vals_h, crs_vals_d); @@ -1614,7 +1665,8 @@ unpackAndCombineIntoCrsArrays ( TargetPids.getRawPtr(), TargetPids.size()); // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR deep_copy(execution_space(), tgt_pids_h, tgt_pids_d); -} + +} 
//unpackAndCombineIntoCrsArrays } // namespace Details } // namespace Tpetra @@ -1636,25 +1688,6 @@ unpackAndCombineIntoCrsArrays ( const Kokkos::DualView::buffer_device_type>&, \ const size_t, \ const CombineMode); \ - template void \ - Details::unpackAndCombineIntoCrsArrays ( \ - const CrsMatrix &, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&, \ - const size_t, \ - const CombineMode, \ - const size_t, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&, \ - size_t, \ - size_t, \ - const int, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&, \ - const Teuchos::ArrayView::impl_scalar_type>&, \ - const Teuchos::ArrayView&, \ - Teuchos::Array&); \ template size_t \ Details::unpackAndCombineWithOwningPIDsCount ( \ const CrsMatrix &, \ @@ -1665,6 +1698,37 @@ unpackAndCombineIntoCrsArrays ( CombineMode, \ size_t, \ const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&); + const Teuchos::ArrayView&); \ + template void \ + Details::unpackAndCombineIntoCrsArrays ( \ + const CrsMatrix &, \ + const Kokkos::View>,\ + void, void >, \ + const Kokkos::View>, \ + void, void >, \ + const Kokkos::View>, \ + void, void >, \ + const size_t, \ + const Kokkos::View>, \ + void, void >, \ + const Kokkos::View>, \ + void, void >, \ + size_t, \ + const int, \ + Teuchos::ArrayRCP&, \ + Teuchos::ArrayRCP&, \ + Teuchos::ArrayRCP&, \ + const Teuchos::ArrayView&, \ + Teuchos::Array&); #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP diff --git a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp index 21643ed08573..7896c21f7a71 100644 --- a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp +++ b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp @@ -39,6 +39,8 @@ // ************************************************************************ // @HEADER +#include + #include #include @@ -459,6 +461,20 
@@ namespace { } src_mat->fillComplete (); + RCP fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout)); + fos->setOutputToRootOnly(-1); + +#if 0 + fflush(stdout); + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "========\nsrc_mat\n========" << std::endl; + sleep(1); comm->barrier(); + src_mat->describe(*fos,Teuchos::VERB_EXTREME); + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "========\nend of src_mat\n========\n\n" << std::endl; + sleep(1); comm->barrier(); +#endif + // Create the importer Import importer (src_map, tgt_map, getImportParameterList ()); // Do the import, and fill-complete the target matrix. @@ -496,6 +512,9 @@ namespace { Teuchos::null, Teuchos::null, rcp(&dummy,false)); + //comm->barrier(); + //TEST_EQUALITY(1,1); + //return; // Make sure that A_tgt2's row Map is the same as tgt_map, and // is also the same as the Import's targetMap. They should have @@ -521,6 +540,25 @@ namespace { as (10) * ScalarTraits::eps (); typedef typename CrsMatrix::nonconst_local_inds_host_view_type lids_type; typedef typename CrsMatrix::nonconst_values_host_view_type vals_type; + +#if 0 + fflush(stdout); + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "tgt_mat\n========" << std::endl; + sleep(1); comm->barrier(); + A_tgt2->describe(*fos,Teuchos::VERB_EXTREME); + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "=======\nend of tgt_mat\n========\n\n" << std::endl; + sleep(1); comm->barrier(); + + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "A_tgt2\n========" << std::endl; + sleep(1); comm->barrier(); + A_tgt2->describe(*fos,Teuchos::VERB_EXTREME); + sleep(1); comm->barrier(); + if (comm->getRank() == 0) std::cout << "=======\nend of A_tgt2\n========" << std::endl; + sleep(1); comm->barrier(); + #endif lids_type tgtRowInds; vals_type tgtRowVals; @@ -2345,6 +2383,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, } 
Kokkos::View importsView(imports.data(), imports.size()); distor.doPostsAndWaits(exports.view_host(),numExportPackets(),importsView,numImportPackets()); + auto importsView_d = Kokkos::create_mirror_view(Node::device_type::memory_space(), importsView); + deep_copy(importsView_d,importsView); if (verbose) { std::ostringstream os; os << *prefix << "Done with 4-arg doPostsAndWaits" << std::endl; @@ -2353,33 +2393,13 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, ::Tpetra::Details::Behavior::enable_verbose_behavior (); - // Run the count... which should get the same NNZ as the traditional import - using Tpetra::Details::unpackAndCombineWithOwningPIDsCount; - size_t nnz2 = - unpackAndCombineWithOwningPIDsCount (*A, Importer->getRemoteLIDs (), - imports (), numImportPackets (), - constantNumPackets, - Tpetra::INSERT, - Importer->getNumSameIDs (), - Importer->getPermuteToLIDs (), - Importer->getPermuteFromLIDs ()); - if (verbose) { - std::ostringstream os; - os << *prefix << "Done with unpackAndCombineWithOwningPIDsCount; " - "nnz1=" << nnz1 << ", nnz2=" << nnz2 << std::endl; - std::cerr << os.str (); - } - - if(nnz1!=nnz2) test_err++; - total_err+=test_err; - ///////////////////////////////////////////////////////// // Test #2: Actual combine test ///////////////////////////////////////////////////////// - Teuchos::Array rowptr (MapTarget->getLocalNumElements () + 1); - Teuchos::Array colind (nnz2); - Teuchos::Array vals (nnz2); - Teuchos::Array TargetPids; + Teuchos::ArrayRCP rowptr; + Teuchos::ArrayRCP colind; + Teuchos::ArrayRCP vals; + Teuchos::Array TargetPids; if (verbose) { std::ostringstream os; @@ -2387,29 +2407,38 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, std::cerr << os.str (); } + auto numImportPacketsView_d = Kokkos::create_mirror_view(Node::device_type::memory_space(),numImportPacketsView); + deep_copy(numImportPacketsView_d,numImportPacketsView); + + auto RemoteLIDs_d = 
Importer->getRemoteLIDs_dv().view_device(); + auto PermuteToLIDs_d = Importer->getPermuteToLIDs_dv().view_device(); + auto PermuteFromLIDs_d = Importer->getPermuteFromLIDs_dv().view_device(); + using Tpetra::Details::unpackAndCombineIntoCrsArrays; unpackAndCombineIntoCrsArrays ( *A, - Importer->getRemoteLIDs (), - imports (), - numImportPackets (), - constantNumPackets, - Tpetra::INSERT, + RemoteLIDs_d, + importsView_d, + numImportPacketsView_d, Importer->getNumSameIDs (), - Importer->getPermuteToLIDs (), - Importer->getPermuteFromLIDs (), + PermuteToLIDs_d, + PermuteFromLIDs_d, MapTarget->getLocalNumElements (), - nnz2, MyPID, - rowptr (), - colind (), - Teuchos::av_reinterpret_cast (vals ()), + rowptr, + colind, + vals, SourcePids (), TargetPids); + size_t nnz2 = vals.size(); + if(nnz1!=nnz2) test_err++; + total_err+=test_err; + if (verbose) { std::ostringstream os; - os << *prefix << "Done with unpackAndCombineIntoCrsArrays" << std::endl; + os << *prefix << "Done with unpackAndCombineIntoCrsArrays; " + "nnz1=" << nnz1 << ", nnz2=" << nnz2 << std::endl; std::cerr << os.str (); }