From e67173ac2cec396bf91655f5b3b9f7cec7e4a16f Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Mon, 15 Jul 2024 17:15:36 -0600 Subject: [PATCH 1/7] MueLu: Cut Drop Converted to Use Kokkos Original code within ORIGINAL ifdef. New code within NEW ifdef. DropTol structure marked with KOKKOS_INLINE_FUNCTION and default values are hard coded. Default Algorithm and Cut Drop Algorithm split into separate for loops in NEW code. Cut Drop converted to use Kokkos nested parallel loops. Timers placed in new code and are commented out. Code passes current unit tests. Saw a speedup of about 1.5x with Cuda and 1.2x with Serial when running unit tests with 10,000,000 rows. Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_decl.hpp | 1 + .../MueLu_CoalesceDropFactory_def.hpp | 1663 ++++++++++++++++- 2 files changed, 1657 insertions(+), 7 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp index 96b5e778f6bc..db5e9a291313 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp @@ -160,6 +160,7 @@ class CoalesceDropFactory : public SingleLevelFactoryBase { //@} void Build(Level& currentLevel) const; // Build + void BuildKokkos(Level& currentLevel) const; private: // pre-drop function diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 2c421c477bde..a8befaea592b 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -61,6 +61,8 @@ #include +#include //NEW +#include //NEW #include "MueLu_CoalesceDropFactory_decl.hpp" #include "MueLu_AmalgamationFactory.hpp" @@ -92,22 +94,30 @@ namespace MueLu { namespace Details { template struct DropTol { + KOKKOS_INLINE_FUNCTION //NEW DropTol() = default; + KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol const&) = default; + KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol&&) = default; DropTol& operator=(DropTol const&) = default; DropTol& operator=(DropTol&&) = default; + KOKKOS_INLINE_FUNCTION //NEW DropTol(real_type val_, real_type diag_, LO col_, bool drop_) : val{val_} , diag{diag_} , col{col_} , drop{drop_} {} - real_type val{Teuchos::ScalarTraits::zero()}; - real_type diag{Teuchos::ScalarTraits::zero()}; - LO col{Teuchos::OrdinalTraits::invalid()}; + real_type val{0}; + real_type diag{0}; + LO col{-1}; + //NEW Can't run these host functions on device + //real_type val{Teuchos::ScalarTraits::zero()}; + //real_type diag{Teuchos::ScalarTraits::zero()}; + //LO col{Teuchos::OrdinalTraits::invalid()}; bool drop{true}; // CMS: Auxillary information for debugging info @@ -414,6 +424,1645 @@ void CoalesceDropFactory::Build(Level TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); + /************************** RS or SA-style Classical Dropping (and variants) **************************/ + if (algo == "classical") { + if (predrop_ == null) { + // ap: this is a hack: had to declare predrop_ as mutable + predrop_ = rcp(new PreDropFunctionConstVal(threshold)); + } + + if (predrop_ != null) { + RCP predropConstVal = rcp_dynamic_cast(predrop_); + TEUCHOS_TEST_FOR_EXCEPTION(predropConstVal == Teuchos::null, Exceptions::BadCast, + "MueLu::CoalesceFactory::Build: cast to PreDropFunctionConstVal failed."); + // If a user provided a predrop function, it overwrites the XML threshold parameter + SC newt = predropConstVal->GetThreshold(); + if (newt != threshold) { + GetOStream(Warnings0) << "switching threshold parameter from " << threshold << " (list) to " << newt << " (user function" << std::endl; + threshold = newt; + } + } + // At this points we either have + // (predrop_ != null) + // Therefore, it is sufficient to check only threshold + if (BlockSize == 1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) { + // Case 1: scalar problem, no dropping => just use matrix graph + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + // Detect and record rows that correspond to Dirichlet boundary conditions + auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); + + graph->SetBoundaryNodeMap(boundaryNodes); + numTotal = A->getLocalNumEntries(); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < boundaryNodes.size(); ++i) + if (boundaryNodes[i]) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "DofsPerNode", 1); + Set(currentLevel, "Graph", graph); + + } else if ((BlockSize == 1 && threshold != STS::zero()) || + (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) || + (BlockSize == 1 && useSignedClassicalRS) || + (BlockSize == 1 && useSignedClassicalSA)) { + // Case 2: scalar problem with dropping => record the column indices of undropped entries, but still use original + // graph's map information, e.g., whether index is local + // OR a matrix without a CrsGraph + + // allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", A->getLocalNumRows() + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + + using MT = typename STS::magnitudeType; + RCP ghostedDiag; + ArrayRCP ghostedDiagVals; + ArrayRCP negMaxOffDiagonal; + // RS style needs the max negative off-diagonal, SA style needs the diagonal + if (useSignedClassicalRS) { + if (ghostedBlockNumber.is_null()) { + negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A); + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Calculated max point off-diagonal" << std::endl; + } else { + negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A, *ghostedBlockNumber); + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl; + } + } else { + ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); + ghostedDiagVals = ghostedDiag->getData(0); + } + auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) { + if (ghostedBlockNumber.is_null()) { + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl; + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); + } else { + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Applying block row sum criterion." << std::endl; + Utilities::ApplyRowSumCriterionHost(*A, *ghostedBlockNumber, rowSumTol, boundaryNodes); + } + } + + LO realnnz = 0; + rows(0) = 0; +#define NEW +#ifdef ORIGINAL + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + size_t nnz = A->getNumEntriesInLocalRow(row); + bool rowIsDirichlet = boundaryNodes[row]; + ArrayView indices; + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + if (classicalAlgo == defaultAlgo) { + // FIXME the current predrop function uses the following + // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) + // FIXME but the threshold doesn't take into account the rows' diagonal entries + // FIXME For now, hardwiring the dropping in here + + LO rownnz = 0; + if (useSignedClassicalRS) { + // Signed classical RS style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); + MT neg_aij = -STS::real(vals[colID]); + /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], + g_block_id.is_null() ? -1 : g_block_id[row], + g_block_id.is_null() ? -1 : g_block_id[col], + neg_aij, max_neg_aik);*/ + if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { + columns[realnnz++] = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } else if (useSignedClassicalSA) { + // Signed classical SA style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + + bool is_nonpositive = STS::real(vals[colID]) <= 0; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 + /* + if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], + vals[colID],aij, aiiajj); + */ + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows[row + 1] = realnnz; + } else { + // Standard abs classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } + } else { + /* Cut Algorithm */ + // CMS + using DropTol = Details::DropTol; + std::vector drop_vec; + drop_vec.reserve(nnz); + const real_type zero = Teuchos::ScalarTraits::zero(); + const real_type one = Teuchos::ScalarTraits::one(); + LO rownnz = 0; + // NOTE: This probably needs to be fixed for rowsum + + // find magnitudes + for (LO colID = 0; colID < (LO)nnz; colID++) { + LO col = indices[colID]; + if (row == col) { + drop_vec.emplace_back(zero, one, colID, false); + continue; + } + + // Don't aggregate boundaries + if (boundaryNodes[colID]) continue; + typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + typename STS::magnitudeType aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + drop_vec.emplace_back(aij, aiiajj, colID, false); + } + + const size_t n = drop_vec.size(); + + if (classicalAlgo == unscaled_cut) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val > b.val; + }); + + bool drop = false; + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val; + auto b = y.val; + if (a > realThreshold * b) { + drop = true; +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + } + drop_vec[i].drop = drop; + } + } else if (classicalAlgo == scaled_cut) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val / a.diag > b.val / b.diag; + }); + bool drop = false; + // printf("[%d] Scaled Cut: ",(int)row); + // printf("%3d(%4s) ",indices[drop_vec[0].col],"keep"); + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val / x.diag; + auto b = y.val / y.diag; + if (a > realThreshold * b) { + drop = true; + +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + // printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep"); + } + drop_vec[i].drop = drop; + } + // printf("\n"); + } + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.col < b.col; + }); + + for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { + LO col = indices[drop_vec[idxID].col]; + // don't drop diagonal + if (row == col) { + columns[realnnz++] = col; + rownnz++; + continue; + } + + if (!drop_vec[idxID].drop) { + columns[realnnz++] = col; + rownnz++; + } else { + numDropped++; + } + } + // CMS + rows[row + 1] = realnnz; + } + } // end for row +#endif + +#ifdef NEW + if(classicalAlgo == defaultAlgo) { + SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + size_t nnz = A->getNumEntriesInLocalRow(row); + bool rowIsDirichlet = boundaryNodes[row]; + ArrayView indices; + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + // FIXME the current predrop function uses the following + // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) + // FIXME but the threshold doesn't take into account the rows' diagonal entries + // FIXME For now, hardwiring the dropping in here + + LO rownnz = 0; + if (useSignedClassicalRS) { + // Signed classical RS style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); + MT neg_aij = -STS::real(vals[colID]); + /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], + g_block_id.is_null() ? -1 : g_block_id[row], + g_block_id.is_null() ? -1 : g_block_id[col], + neg_aij, max_neg_aik);*/ + if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { + columns[realnnz++] = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } else if (useSignedClassicalSA) { + // Signed classical SA style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + + bool is_nonpositive = STS::real(vals[colID]) <= 0; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 + /* + if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], + vals[colID],aij, aiiajj); + */ + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows[row + 1] = realnnz; + } else { + // Standard abs classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } + } // end for row + } + else { //NEW START + //auto stackedTimer = rcp(new Teuchos::StackedTimer("timer")); + //Teuchos::TimeMonitor::setStackedTimer(stackedTimer); + //stackedTimer->start("init"); + SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); + using ExecSpace = typename Node::execution_space; + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + using DropTol = Details::DropTol; + + //move from host to device + ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); + Kokkos::View ghostedDiagValsView = Kokkos::Compat::getKokkosViewDeepCopy(ghostedDiagValsArrayView); + auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); + + auto At = Utilities::Op2TpetraCrs(A); + auto A_device = At->getLocalMatrixDevice(); + + int algorithm = classicalAlgo; + Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + //stackedTimer->stop("init"); + + //stackedTimer->start("loop"); + Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { + LO row = teamMember.league_rank(); + auto rowView = A_device.row(row); + size_t nnz = rowView.length; + + size_t n = 0; + auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + //find magnitudes + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) { + LO col = rowView.colidx(colID); + if(row == col) { + drop_view(colID) = DropTol(0, 1, colID, false); + count++; + } + //Don't aggregate boundaries + else if(!boundaryNodesDevice(colID)) { + typename STS::magnitudeType aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(col) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + typename STS::magnitudeType aij = static_cast(std::fabs(static_cast(rowView.value(colID) * rowView.value(colID)))); // |a_i j|^2 + drop_view(colID) = DropTol(aij, aiiajj, colID, false); + count++; + } + }, n); + if (algorithm == unscaled_cut) { + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { + return a.val > b.val; + }); + + //find index where dropping starts + size_t dropStart; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = drop_view(i - 1); + auto const& y = drop_view(i); + auto a = x.val; + auto b = y.val; + if(a > realThreshold * b) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(i).drop = true; + }); + } + } else if (algorithm == scaled_cut) { + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { + return a.val / a.diag > b.val / b.diag; + }); + + //find index where dropping starts + size_t dropStart; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = drop_view(i - 1); + auto const& y = drop_view(i); + auto a = x.val / x.diag; + auto b = y.val / y.diag; + if(a > realThreshold * b) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(i).drop = true; + }); + } + } + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { + return a.col < b.col; + }); + + LO rownnz = 0; + GO rowDropped = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { + LO col = rowView.colidx(idxID); + //don't drop diagonal + if(row == col || !drop_view(idxID).drop) { + keep++; + } + else { + rowView.colidx(idxID) = -1; + drop++; + } + }, rownnz, rowDropped); + globalnnz += rownnz; + totalDropped += rowDropped; + rownnzView(row) = rownnz; + }, realnnz, numDropped); + //stackedTimer->stop("loop"); + + //stackedTimer->start("remove"); + + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); + Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); + Kokkos::deep_copy(columns, columnsDevice); + + //stackedTimer->stop("remove"); + + //update row indices + //stackedTimer->start("scan"); + auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); + Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { + partial_sum += rownnzView(i); + if(is_final) rowsDevice(i+1) = partial_sum; + }); + Kokkos::deep_copy(rows, rowsDevice); + //stackedTimer->stop("scan"); + + //stackedTimer->stop("timer"); + //stackedTimer->report(std::cout, Teuchos::DefaultComm::getComm()); + } //NEW END +#endif + + numTotal = A->getLocalNumEntries(); + + if (aggregationMayCreateDirichlet) { + // If the only element remaining after filtering is diagonal, mark node as boundary + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + if (rows[row + 1] - rows[row] <= 1) + boundaryNodes[row] = true; + } + } + + RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), A->getRowMap(), A->getColMap(), "thresholded graph of A")); + graph->SetBoundaryNodeMap(boundaryNodes); + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < boundaryNodes.size(); ++i) + if (boundaryNodes(i)) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", 1); + + // If we're doing signed classical, we might want to block-diagonalize *after* the dropping + if (generateColoringGraph) { + RCP colorGraph; + RCP importer = A->getCrsGraph()->getImporter(); + BlockDiagonalizeGraph(graph, ghostedBlockNumber, colorGraph, importer); + Set(currentLevel, "Coloring Graph", colorGraph); + // #define CMS_DUMP +#ifdef CMS_DUMP + { + Xpetra::IO::Write("m_regular_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(graph)->GetCrsGraph()); + Xpetra::IO::Write("m_color_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(colorGraph)->GetCrsGraph()); + // int rank = graph->GetDomainMap()->getComm()->getRank(); + // { + // std::ofstream ofs(std::string("m_color_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); + // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); + // colorGraph->print(*fancy,Debug); + // } + // { + // std::ofstream ofs(std::string("m_regular_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); + // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); + // graph->print(*fancy,Debug); + // } + } +#endif + } // end generateColoringGraph + } else if (BlockSize > 1 && threshold == STS::zero()) { + // Case 3: Multiple DOF/node problem without dropping + const RCP rowMap = A->getRowMap(); + const RCP colMap = A->getColMap(); + + graphType = "amalgamated"; + + // build node row map (uniqueMap) and node column map (nonUniqueMap) + // the arrays rowTranslation and colTranslation contain the local node id + // given a local dof id. The data is calculated by the AmalgamationFactory and + // stored in the variable container "UnAmalgamationInfo" + RCP uniqueMap = amalInfo->getNodeRowMap(); + RCP nonUniqueMap = amalInfo->getNodeColMap(); + Array rowTranslation = *(amalInfo->getRowTranslation()); + Array colTranslation = *(amalInfo->getColTranslation()); + + // get number of local nodes + LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); + + // Allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); + Kokkos::deep_copy(amalgBoundaryNodes, false); + + // Detect and record rows that correspond to Dirichlet boundary conditions + // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size + // TODO the array one bigger than the number of local rows, and the last entry can + // TODO hold the actual number of boundary nodes. Clever, huh? + ArrayRCP pointBoundaryNodes; + pointBoundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes); + + // extract striding information + LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) + LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) + LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map + if (A->IsView("stridedMaps") == true) { + Teuchos::RCP myMap = A->getRowMap("stridedMaps"); + Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); + blkSize = Teuchos::as(strMap->getFixedBlockSize()); + blkId = strMap->getStridedBlockId(); + if (blkId > -1) + blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); + } + + // loop over all local nodes + LO realnnz = 0; + rows(0) = 0; + Array indicesExtra; + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + indicesExtra.resize(0); + + // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet + // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). + // Therefore, looping over all dofs is fine here. We use blkPartSize as we work + // with local ids. + // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) + // node. + bool isBoundary = false; + if (pL.get("aggregation: greedy Dirichlet") == true) { + for (LO j = 0; j < blkPartSize; j++) { + if (pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = true; + break; + } + } + } else { + isBoundary = true; + for (LO j = 0; j < blkPartSize; j++) { + if (!pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = false; + break; + } + } + } + + // Merge rows of A + // The array indicesExtra contains local column node ids for the current local node "row" + if (!isBoundary) + MergeRows(*A, row, indicesExtra, colTranslation); + else + indicesExtra.push_back(row); + indices = indicesExtra; + numTotal += indices.size(); + + // add the local column node ids to the full columns array which + // contains the local column node ids for all local node rows + LO nnz = indices.size(), rownnz = 0; + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + columns(realnnz++) = col; + rownnz++; + } + + if (rownnz == 1) { + // If the only element remaining after filtering is diagonal, mark node as boundary + // FIXME: this should really be replaced by the following + // if (indices.size() == 1 && indices[0] == row) + // boundaryNodes[row] = true; + // We do not do it this way now because there is no framework for distinguishing isolated + // and boundary nodes in the aggregation algorithms + amalgBoundaryNodes[row] = true; + } + rows(row + 1) = realnnz; + } // for (LO row = 0; row < numRows; row++) + + RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes + << " agglomerated Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", blkSize); // full block size + + } else if (BlockSize > 1 && threshold != STS::zero()) { + // Case 4: Multiple DOF/node problem with dropping + const RCP rowMap = A->getRowMap(); + const RCP colMap = A->getColMap(); + graphType = "amalgamated"; + + // build node row map (uniqueMap) and node column map (nonUniqueMap) + // the arrays rowTranslation and colTranslation contain the local node id + // given a local dof id. The data is calculated by the AmalgamationFactory and + // stored in the variable container "UnAmalgamationInfo" + RCP uniqueMap = amalInfo->getNodeRowMap(); + RCP nonUniqueMap = amalInfo->getNodeColMap(); + Array rowTranslation = *(amalInfo->getRowTranslation()); + Array colTranslation = *(amalInfo->getColTranslation()); + + // get number of local nodes + LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); + + // Allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); + Kokkos::deep_copy(amalgBoundaryNodes, false); + + // Detect and record rows that correspond to Dirichlet boundary conditions + // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size + // TODO the array one bigger than the number of local rows, and the last entry can + // TODO hold the actual number of boundary nodes. Clever, huh? + auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); + + // extract striding information + LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) + LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) + LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map + if (A->IsView("stridedMaps") == true) { + Teuchos::RCP myMap = A->getRowMap("stridedMaps"); + Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); + blkSize = Teuchos::as(strMap->getFixedBlockSize()); + blkId = strMap->getStridedBlockId(); + if (blkId > -1) + blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); + } + + // extract diagonal data for dropping strategy + RCP ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); + const ArrayRCP ghostedDiagVals = ghostedDiag->getData(0); + + // loop over all local nodes + LO realnnz = 0; + rows[0] = 0; + Array indicesExtra; + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + indicesExtra.resize(0); + + // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet + // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). + // Therefore, looping over all dofs is fine here. We use blkPartSize as we work + // with local ids. + // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) + // node. + bool isBoundary = false; + if (pL.get("aggregation: greedy Dirichlet") == true) { + for (LO j = 0; j < blkPartSize; j++) { + if (pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = true; + break; + } + } + } else { + isBoundary = true; + for (LO j = 0; j < blkPartSize; j++) { + if (!pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = false; + break; + } + } + } + + // Merge rows of A + // The array indicesExtra contains local column node ids for the current local node "row" + if (!isBoundary) + MergeRowsWithDropping(*A, row, ghostedDiagVals, threshold, indicesExtra, colTranslation); + else + indicesExtra.push_back(row); + indices = indicesExtra; + numTotal += indices.size(); + + // add the local column node ids to the full columns array which + // contains the local column node ids for all local node rows + LO nnz = indices.size(), rownnz = 0; + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + columns[realnnz++] = col; + rownnz++; + } + + if (rownnz == 1) { + // If the only element remaining after filtering is diagonal, mark node as boundary + // FIXME: this should really be replaced by the following + // if (indices.size() == 1 && indices[0] == row) + // boundaryNodes[row] = true; + // We do not do it this way now because there is no framework for distinguishing isolated + // and boundary nodes in the aggregation algorithms + amalgBoundaryNodes[row] = true; + } + rows[row + 1] = realnnz; + } // for (LO row = 0; row < numRows; row++) + // columns.resize(realnnz); + + RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes + << " agglomerated Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", blkSize); // full block size + } + + } else if (algo == "distance laplacian") { + LO blkSize = A->GetFixedBlockSize(); + GO indexBase = A->getRowMap()->getIndexBase(); + // [*0*] : FIXME + // ap: somehow, if I move this line to [*1*], Belos throws an error + // I'm not sure what's going on. Do we always have to Get data, if we did + // DeclareInput for it? + // RCP Coords = Get< RCP >(currentLevel, "Coordinates"); + + // Detect and record rows that correspond to Dirichlet boundary conditions + // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size + // TODO the array one bigger than the number of local rows, and the last entry can + // TODO hold the actual number of boundary nodes. Clever, huh? + auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); + + if ((blkSize == 1) && (threshold == STS::zero())) { + // Trivial case: scalar problem, no dropping. Can return original graph + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + graph->SetBoundaryNodeMap(pointBoundaryNodes); + graphType = "unamalgamated"; + numTotal = A->getLocalNumEntries(); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < pointBoundaryNodes.size(); ++i) + if (pointBoundaryNodes(i)) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "DofsPerNode", blkSize); + Set(currentLevel, "Graph", graph); + + } else { + // ap: We make quite a few assumptions here; general case may be a lot different, + // but much much harder to implement. We assume that: + // 1) all maps are standard maps, not strided maps + // 2) global indices of dofs in A are related to dofs in coordinates in a simple arithmetic + // way: rows i*blkSize, i*blkSize+1, ..., i*blkSize + (blkSize-1) correspond to node i + // + // NOTE: Potentially, some of the code below could be simplified with UnAmalgamationInfo, + // but as I totally don't understand that code, here is my solution + + // [*1*]: see [*0*] + + // Check that the number of local coordinates is consistent with the #rows in A + TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() / blkSize != Coords->getLocalLength(), Exceptions::Incompatible, + "Coordinate vector length (" << Coords->getLocalLength() << ") is incompatible with number of rows in A (" << A->getRowMap()->getLocalNumElements() << ") by modulo block size (" << blkSize << ")."); + + const RCP colMap = A->getColMap(); + RCP uniqueMap, nonUniqueMap; + Array colTranslation; + if (blkSize == 1) { + uniqueMap = A->getRowMap(); + nonUniqueMap = A->getColMap(); + graphType = "unamalgamated"; + + } else { + uniqueMap = Coords->getMap(); + TEUCHOS_TEST_FOR_EXCEPTION(uniqueMap->getIndexBase() != indexBase, Exceptions::Incompatible, + "Different index bases for matrix and coordinates"); + + AmalgamationFactory::AmalgamateMap(*(A->getColMap()), *A, nonUniqueMap, colTranslation); + + graphType = "amalgamated"; + } + LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); + + RCP ghostedCoords; + RCP ghostedLaplDiag; + Teuchos::ArrayRCP ghostedLaplDiagData; + if (threshold != STS::zero()) { + // Get ghost coordinates + RCP importer; + { + SubFactoryMonitor m1(*this, "Import construction", currentLevel); + if (blkSize == 1 && realA->getCrsGraph()->getImporter() != Teuchos::null) { + GetOStream(Warnings1) << "Using existing importer from matrix graph" << std::endl; + importer = realA->getCrsGraph()->getImporter(); + } else { + GetOStream(Warnings0) << "Constructing new importer instance" << std::endl; + importer = ImportFactory::Build(uniqueMap, nonUniqueMap); + } + } // subtimer + ghostedCoords = Xpetra::MultiVectorFactory::Build(nonUniqueMap, Coords->getNumVectors()); + { + SubFactoryMonitor m1(*this, "Coordinate import", currentLevel); + ghostedCoords->doImport(*Coords, *importer, Xpetra::INSERT); + } // subtimer + + // Construct Distance Laplacian diagonal + RCP localLaplDiag = VectorFactory::Build(uniqueMap); + Array indicesExtra; + Teuchos::Array> coordData; + if (threshold != STS::zero()) { + const size_t numVectors = ghostedCoords->getNumVectors(); + coordData.reserve(numVectors); + for (size_t j = 0; j < numVectors; j++) { + Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); + coordData.push_back(tmpData); + } + } + { + SubFactoryMonitor m1(*this, "Laplacian local diagonal", currentLevel); + ArrayRCP localLaplDiagData = localLaplDiag->getDataNonConst(0); + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + + if (blkSize == 1) { + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + } else { + // Merge rows of A + indicesExtra.resize(0); + MergeRows(*A, row, indicesExtra, colTranslation); + indices = indicesExtra; + } + + LO nnz = indices.size(); + bool haveAddedToDiag = false; + for (LO colID = 0; colID < nnz; colID++) { + const LO col = indices[colID]; + + if (row != col) { + if (use_dlap_weights == SINGLE_WEIGHTS) { + /*printf("[%d,%d] Unweighted Distance = %6.4e Weighted Distance = %6.4e\n",row,col, + MueLu::Utilities::Distance2(coordData, row, col), + MueLu::Utilities::Distance2(dlap_weights(),coordData, row, col));*/ + localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); + } else if (use_dlap_weights == BLOCK_WEIGHTS) { + int block_id = row % interleaved_blocksize; + int block_start = block_id * interleaved_blocksize; + localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); + } else { + // printf("[%d,%d] Unweighted Distance = %6.4e\n",row,col,MueLu::Utilities::Distance2(coordData, row, col)); + localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(coordData, row, col); + } + haveAddedToDiag = true; + } + } + // Deal with the situation where boundary conditions have only been enforced on rows, but not on columns. + // We enforce dropping of these entries by assigning a very large number to the diagonal entries corresponding to BCs. + if (!haveAddedToDiag) + localLaplDiagData[row] = STS::rmax(); + } + } // subtimer + { + SubFactoryMonitor m1(*this, "Laplacian distributed diagonal", currentLevel); + ghostedLaplDiag = VectorFactory::Build(nonUniqueMap); + ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT); + ghostedLaplDiagData = ghostedLaplDiag->getDataNonConst(0); + } // subtimer + + } else { + GetOStream(Runtime0) << "Skipping distance laplacian construction due to 0 threshold" << std::endl; + } + + // NOTE: ghostedLaplDiagData might be zero if we don't actually calculate the laplacian + + // allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + +#ifdef HAVE_MUELU_DEBUG + // DEBUGGING + for (LO i = 0; i < (LO)columns.size(); i++) columns[i] = -666; +#endif + + // Extra array for if we're allowing symmetrization with cutting + ArrayRCP rows_stop; + bool use_stop_array = threshold != STS::zero() && distanceLaplacianAlgo == scaled_cut_symmetric; + if (use_stop_array) + // rows_stop = typename LWGraph::row_type::non_const_type("rows_stop", numRows); + rows_stop.resize(numRows); + + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); + Kokkos::deep_copy(amalgBoundaryNodes, false); + + LO realnnz = 0; + rows(0) = 0; + + Array indicesExtra; + { + SubFactoryMonitor m1(*this, "Laplacian dropping", currentLevel); + Teuchos::Array> coordData; + if (threshold != STS::zero()) { + const size_t numVectors = ghostedCoords->getNumVectors(); + coordData.reserve(numVectors); + for (size_t j = 0; j < numVectors; j++) { + Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); + coordData.push_back(tmpData); + } + } + + ArrayView vals; // CMS hackery + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + indicesExtra.resize(0); + bool isBoundary = false; + + if (blkSize == 1) { + // ArrayView vals;//CMS uncomment + A->getLocalRowView(row, indices, vals); + isBoundary = pointBoundaryNodes[row]; + } else { + // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet + for (LO j = 0; j < blkSize; j++) { + if (!pointBoundaryNodes[row * blkSize + j]) { + isBoundary = false; + break; + } + } + + // Merge rows of A + if (!isBoundary) + MergeRows(*A, row, indicesExtra, colTranslation); + else + indicesExtra.push_back(row); + indices = indicesExtra; + } + numTotal += indices.size(); + + LO nnz = indices.size(), rownnz = 0; + + if (use_stop_array) { + rows(row + 1) = rows(row) + nnz; + realnnz = rows(row); + } + + if (threshold != STS::zero()) { + // default + if (distanceLaplacianAlgo == defaultAlgo) { + /* Standard Distance Laplacian */ + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + + if (row == col) { + columns(realnnz++) = col; + rownnz++; + continue; + } + + // We do not want the distance Laplacian aggregating boundary nodes + if (isBoundary) continue; + + SC laplVal; + if (use_dlap_weights == SINGLE_WEIGHTS) { + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); + } else if (use_dlap_weights == BLOCK_WEIGHTS) { + int block_id = row % interleaved_blocksize; + int block_start = block_id * interleaved_blocksize; + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); + } else { + laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); + } + real_type aiiajj = STS::magnitude(realThreshold * realThreshold * ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); + real_type aij = STS::magnitude(laplVal * laplVal); + + if (aij > aiiajj) { + columns(realnnz++) = col; + rownnz++; + } else { + numDropped++; + } + } + } else { + /* Cut Algorithm */ + using DropTol = Details::DropTol; + std::vector drop_vec; + drop_vec.reserve(nnz); + const real_type zero = Teuchos::ScalarTraits::zero(); + const real_type one = Teuchos::ScalarTraits::one(); + + // find magnitudes + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + + if (row == col) { + drop_vec.emplace_back(zero, one, colID, false); + continue; + } + // We do not want the distance Laplacian aggregating boundary nodes + if (isBoundary) continue; + + SC laplVal; + if (use_dlap_weights == SINGLE_WEIGHTS) { + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); + } else if (use_dlap_weights == BLOCK_WEIGHTS) { + int block_id = row % interleaved_blocksize; + int block_start = block_id * interleaved_blocksize; + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); + } else { + laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); + } + + real_type aiiajj = STS::magnitude(ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); + real_type aij = STS::magnitude(laplVal * laplVal); + + drop_vec.emplace_back(aij, aiiajj, colID, false); + } + + const size_t n = drop_vec.size(); + + if (distanceLaplacianAlgo == unscaled_cut) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val > b.val; + }); + + bool drop = false; + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val; + auto b = y.val; + if (a > realThreshold * b) { + drop = true; +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + } + drop_vec[i].drop = drop; + } + } else if (distanceLaplacianAlgo == scaled_cut || distanceLaplacianAlgo == scaled_cut_symmetric) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val / a.diag > b.val / b.diag; + }); + + bool drop = false; + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val / x.diag; + auto b = y.val / y.diag; + if (a > realThreshold * b) { + drop = true; +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + } + drop_vec[i].drop = drop; + } + } + + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.col < b.col; + }); + + for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { + LO col = indices[drop_vec[idxID].col]; + + // don't drop diagonal + if (row == col) { + columns(realnnz++) = col; + rownnz++; + // printf("(%d,%d) KEEP %13s matrix = %6.4e\n",row,row,"DIAGONAL",drop_vec[idxID].aux_val); + continue; + } + + if (!drop_vec[idxID].drop) { + columns(realnnz++) = col; + // printf("(%d,%d) KEEP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); + rownnz++; + } else { + // printf("(%d,%d) DROP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); + numDropped++; + } + } + } + } else { + // Skip laplace calculation and threshold comparison for zero threshold + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + columns(realnnz++) = col; + rownnz++; + } + } + + if (rownnz == 1) { + // If the only element remaining after filtering is diagonal, mark node as boundary + // FIXME: this should really be replaced by the following + // if (indices.size() == 1 && indices[0] == row) + // boundaryNodes[row] = true; + // We do not do it this way now because there is no framework for distinguishing isolated + // and boundary nodes in the aggregation algorithms + amalgBoundaryNodes[row] = true; + } + + if (use_stop_array) + rows_stop[row] = rownnz + rows[row]; + else + rows[row + 1] = realnnz; + } // for (LO row = 0; row < numRows; row++) + + } // subtimer + + if (use_stop_array) { + // Do symmetrization of the cut matrix + // NOTE: We assume nested row/column maps here + for (LO row = 0; row < numRows; row++) { + for (LO colidx = rows[row]; colidx < rows_stop[row]; colidx++) { + LO col = columns[colidx]; + if (col >= numRows) continue; + + bool found = false; + for (LO t_col = rows(col); !found && t_col < rows_stop[col]; t_col++) { + if (columns[t_col] == row) + found = true; + } + // We didn't find the transpose buddy, so let's symmetrize, unless we'd be symmetrizing + // into a Dirichlet unknown. In that case don't. + if (!found && !pointBoundaryNodes[col] && Teuchos::as(rows_stop[col]) < rows[col + 1]) { + LO new_idx = rows_stop[col]; + // printf("(%d,%d) SYMADD entry\n",col,row); + columns[new_idx] = row; + rows_stop[col]++; + numDropped--; + } + } + } + + // Condense everything down + LO current_start = 0; + for (LO row = 0; row < numRows; row++) { + LO old_start = current_start; + for (LO col = rows(row); col < rows_stop[row]; col++) { + if (current_start != col) { + columns(current_start) = columns(col); + } + current_start++; + } + rows[row] = old_start; + } + rows(numRows) = realnnz = current_start; + } + + RCP graph; + { + SubFactoryMonitor m1(*this, "Build amalgamated graph", currentLevel); + graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + } // subtimer + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " agglomerated Dirichlet nodes" + << " using threshold " << dirichletThreshold << std::endl; + } + + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", blkSize); + } + } + + if ((GetVerbLevel() & Statistics1) && !(A->GetFixedBlockSize() > 1 && threshold != STS::zero())) { + RCP> comm = A->getRowMap()->getComm(); + GO numGlobalTotal, numGlobalDropped; + MueLu_sumAll(comm, numTotal, numGlobalTotal); + MueLu_sumAll(comm, numDropped, numGlobalDropped); + GetOStream(Statistics1) << "Number of dropped entries in " << graphType << " matrix graph: " << numGlobalDropped << "/" << numGlobalTotal; + if (numGlobalTotal != 0) + GetOStream(Statistics1) << " (" << 100 * Teuchos::as(numGlobalDropped) / Teuchos::as(numGlobalTotal) << "%)"; + GetOStream(Statistics1) << std::endl; + } + + } else { + // what Tobias has implemented + + SC threshold = as(pL.get("aggregation: drop tol")); + // GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + GetOStream(Runtime0) << "algorithm = \"" + << "failsafe" + << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + Set(currentLevel, "Filtering", (threshold != STS::zero())); + + RCP rowMap = A->getRowMap(); + RCP colMap = A->getColMap(); + + LO blockdim = 1; // block dim for fixed size blocks + GO indexBase = rowMap->getIndexBase(); // index base of maps + GO offset = 0; + + // 1) check for blocking/striding information + if (A->IsView("stridedMaps") && + Teuchos::rcp_dynamic_cast(A->getRowMap("stridedMaps")) != Teuchos::null) { + Xpetra::viewLabel_t oldView = A->SwitchToView("stridedMaps"); // note: "stridedMaps are always non-overlapping (correspond to range and domain maps!) + RCP strMap = Teuchos::rcp_dynamic_cast(A->getRowMap()); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null, Exceptions::BadCast, "MueLu::CoalesceFactory::Build: cast to strided row map failed."); + blockdim = strMap->getFixedBlockSize(); + offset = strMap->getOffset(); + oldView = A->SwitchToView(oldView); + GetOStream(Statistics1) << "CoalesceDropFactory::Build():" + << " found blockdim=" << blockdim << " from strided maps. offset=" << offset << std::endl; + } else + GetOStream(Statistics1) << "CoalesceDropFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl; + + // 2) get row map for amalgamated matrix (graph of A) + // with same distribution over all procs as row map of A + RCP nodeMap = amalInfo->getNodeRowMap(); + GetOStream(Statistics1) << "CoalesceDropFactory: nodeMap " << nodeMap->getLocalNumElements() << "/" << nodeMap->getGlobalNumElements() << " elements" << std::endl; + + // 3) create graph of amalgamated matrix + RCP crsGraph = CrsGraphFactory::Build(nodeMap, A->getLocalMaxNumRowEntries() * blockdim); + + LO numRows = A->getRowMap()->getLocalNumElements(); + LO numNodes = nodeMap->getLocalNumElements(); + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numNodes); + Kokkos::deep_copy(amalgBoundaryNodes, false); + const ArrayRCP numberDirichletRowsPerNode(numNodes, 0); // helper array counting the number of Dirichlet nodes associated with node + bool bIsDiagonalEntry = false; // boolean flag stating that grid==gcid + + // 4) do amalgamation. generate graph of amalgamated matrix + // Note, this code is much more inefficient than the leightwight implementation + // Most of the work has already been done in the AmalgamationFactory + for (LO row = 0; row < numRows; row++) { + // get global DOF id + GO grid = rowMap->getGlobalElement(row); + + // reinitialize boolean helper variable + bIsDiagonalEntry = false; + + // translate grid to nodeid + GO nodeId = AmalgamationFactory::DOFGid2NodeId(grid, blockdim, offset, indexBase); + + size_t nnz = A->getNumEntriesInLocalRow(row); + Teuchos::ArrayView indices; + Teuchos::ArrayView vals; + A->getLocalRowView(row, indices, vals); + + RCP> cnodeIds = Teuchos::rcp(new std::vector); // global column block ids + LO realnnz = 0; + for (LO col = 0; col < Teuchos::as(nnz); col++) { + GO gcid = colMap->getGlobalElement(indices[col]); // global column id + + if (vals[col] != STS::zero()) { + GO cnodeId = AmalgamationFactory::DOFGid2NodeId(gcid, blockdim, offset, indexBase); + cnodeIds->push_back(cnodeId); + realnnz++; // increment number of nnz in matrix row + if (grid == gcid) bIsDiagonalEntry = true; + } + } + + if (realnnz == 1 && bIsDiagonalEntry == true) { + LO lNodeId = nodeMap->getLocalElement(nodeId); + numberDirichletRowsPerNode[lNodeId] += 1; // increment Dirichlet row counter associated with lNodeId + if (numberDirichletRowsPerNode[lNodeId] == blockdim) // mark full Dirichlet nodes + amalgBoundaryNodes[lNodeId] = true; + } + + Teuchos::ArrayRCP arr_cnodeIds = Teuchos::arcp(cnodeIds); + + if (arr_cnodeIds.size() > 0) + crsGraph->insertGlobalIndices(nodeId, arr_cnodeIds()); + } + // fill matrix graph + crsGraph->fillComplete(nodeMap, nodeMap); + + // 5) create MueLu Graph object + RCP graph = rcp(new LWGraph(crsGraph, "amalgamated graph of A")); + + // Detect and record rows that correspond to Dirichlet boundary conditions + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + + // 6) store results in Level + // graph->SetBoundaryNodeMap(gBoundaryNodeMap); + Set(currentLevel, "DofsPerNode", blockdim); + Set(currentLevel, "Graph", graph); + + } // if (doExperimentalWrap) ... else ... + +} // Build + +template +void CoalesceDropFactory::BuildKokkos(Level& currentLevel) const { + FactoryMonitor m(*this, "BuildKokkos", currentLevel); + + typedef Teuchos::ScalarTraits STS; + typedef typename STS::magnitudeType real_type; + typedef Xpetra::MultiVector RealValuedMultiVector; + typedef Xpetra::MultiVectorFactory RealValuedMultiVectorFactory; + + if (predrop_ != Teuchos::null) + GetOStream(Parameters0) << predrop_->description(); + + RCP realA = Get>(currentLevel, "A"); + RCP amalInfo = Get>(currentLevel, "UnAmalgamationInfo"); + const ParameterList& pL = GetParameterList(); + bool doExperimentalWrap = pL.get("lightweight wrap"); + + GetOStream(Parameters0) << "lightweight wrap = " << doExperimentalWrap << std::endl; + std::string algo = pL.get("aggregation: drop scheme"); + const bool aggregationMayCreateDirichlet = pL.get("aggregation: dropping may create Dirichlet"); + + RCP Coords; + RCP A; + + bool use_block_algorithm = false; + LO interleaved_blocksize = as(pL.get("aggregation: block diagonal: interleaved blocksize")); + bool useSignedClassicalRS = false; + bool useSignedClassicalSA = false; + bool generateColoringGraph = false; + + // NOTE: If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it + // in the block diagonalization). So we'll clobber the rowSumTol with -1.0 in this case + typename STS::magnitudeType rowSumTol = as(pL.get("aggregation: row sum drop tol")); + + RCP ghostedBlockNumber; + ArrayRCP g_block_id; + + if (algo == "distance laplacian") { + // Grab the coordinates for distance laplacian + Coords = Get>(currentLevel, "Coordinates"); + A = realA; + } else if (algo == "signed classical sa") { + useSignedClassicalSA = true; + algo = "classical"; + A = realA; + } else if (algo == "signed classical" || algo == "block diagonal colored signed classical" || algo == "block diagonal signed classical") { + useSignedClassicalRS = true; + // if(realA->GetFixedBlockSize() > 1) { + RCP BlockNumber = Get>(currentLevel, "BlockNumber"); + // Ghost the column block numbers if we need to + RCP importer = realA->getCrsGraph()->getImporter(); + if (!importer.is_null()) { + SubFactoryMonitor m1(*this, "Block Number import", currentLevel); + ghostedBlockNumber = Xpetra::VectorFactory::Build(importer->getTargetMap()); + ghostedBlockNumber->doImport(*BlockNumber, *importer, Xpetra::INSERT); + } else { + ghostedBlockNumber = BlockNumber; + } + g_block_id = ghostedBlockNumber->getData(0); + // } + if (algo == "block diagonal colored signed classical") + generateColoringGraph = true; + algo = "classical"; + A = realA; + + } else if (algo == "block diagonal") { + // Handle the "block diagonal" filtering and then leave + BlockDiagonalize(currentLevel, realA, false); + return; + } else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian") { + // Handle the "block diagonal" filtering, and then continue onward + use_block_algorithm = true; + RCP filteredMatrix = BlockDiagonalize(currentLevel, realA, true); + if (algo == "block diagonal distance laplacian") { + // We now need to expand the coordinates by the interleaved blocksize + RCP OldCoords = Get>(currentLevel, "Coordinates"); + if (OldCoords->getLocalLength() != realA->getLocalNumRows()) { + LO dim = (LO)OldCoords->getNumVectors(); + Coords = RealValuedMultiVectorFactory::Build(realA->getRowMap(), dim); + for (LO k = 0; k < dim; k++) { + ArrayRCP old_vec = OldCoords->getData(k); + ArrayRCP new_vec = Coords->getDataNonConst(k); + for (LO i = 0; i < (LO)OldCoords->getLocalLength(); i++) { + LO new_base = i * dim; + for (LO j = 0; j < interleaved_blocksize; j++) + new_vec[new_base + j] = old_vec[i]; + } + } + } else { + Coords = OldCoords; + } + algo = "distance laplacian"; + } else if (algo == "block diagonal classical") { + algo = "classical"; + } + // All cases + A = filteredMatrix; + rowSumTol = -1.0; + } else { + A = realA; + } + + // Distance Laplacian weights + Array dlap_weights = pL.get>("aggregation: distance laplacian directional weights"); + enum { NO_WEIGHTS = 0, + SINGLE_WEIGHTS, + BLOCK_WEIGHTS }; + int use_dlap_weights = NO_WEIGHTS; + if (algo == "distance laplacian") { + LO dim = (LO)Coords->getNumVectors(); + // If anything isn't 1.0 we need to turn on the weighting + bool non_unity = false; + for (LO i = 0; !non_unity && i < (LO)dlap_weights.size(); i++) { + if (dlap_weights[i] != 1.0) { + non_unity = true; + } + } + if (non_unity) { + LO blocksize = use_block_algorithm ? as(pL.get("aggregation: block diagonal: interleaved blocksize")) : 1; + if ((LO)dlap_weights.size() == dim) + use_dlap_weights = SINGLE_WEIGHTS; + else if ((LO)dlap_weights.size() == blocksize * dim) + use_dlap_weights = BLOCK_WEIGHTS; + else { + TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, + "length of 'aggregation: distance laplacian directional weights' must equal the coordinate dimension OR the coordinate dimension times the blocksize"); + } + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Using distance laplacian weights: " << dlap_weights << std::endl; + } + } + + // decide wether to use the fast-track code path for standard maps or the somewhat slower + // code path for non-standard maps + /*bool bNonStandardMaps = false; + if (A->IsView("stridedMaps") == true) { + Teuchos::RCP myMap = A->getRowMap("stridedMaps"); + Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); + if (strMap->getStridedBlockId() != -1 || strMap->getOffset() > 0) + bNonStandardMaps = true; + }*/ + + if (doExperimentalWrap) { + TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm"); + TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)"); + + SC threshold; + // If we're doing the ML-style halving of the drop tol at each level, we do that here. + if (pL.get("aggregation: use ml scaling of drop tol")) + threshold = pL.get("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID()); + else + threshold = as(pL.get("aggregation: drop tol")); + + std::string distanceLaplacianAlgoStr = pL.get("aggregation: distance laplacian algo"); + std::string classicalAlgoStr = pL.get("aggregation: classical algo"); + real_type realThreshold = STS::magnitude(threshold); // CMS: Rename this to "magnitude threshold" sometime + + //////////////////////////////////////////////////// + // Remove this bit once we are confident that cut-based dropping works. +#ifdef HAVE_MUELU_DEBUG + int distanceLaplacianCutVerbose = 0; +#endif +#ifdef DJS_READ_ENV_VARIABLES + if (getenv("MUELU_DROP_TOLERANCE_MODE")) { + distanceLaplacianAlgoStr = std::string(getenv("MUELU_DROP_TOLERANCE_MODE")); + } + + if (getenv("MUELU_DROP_TOLERANCE_THRESHOLD")) { + auto tmp = atoi(getenv("MUELU_DROP_TOLERANCE_THRESHOLD")); + realThreshold = 1e-4 * tmp; + } + +#ifdef HAVE_MUELU_DEBUG + if (getenv("MUELU_DROP_TOLERANCE_VERBOSE")) { + distanceLaplacianCutVerbose = atoi(getenv("MUELU_DROP_TOLERANCE_VERBOSE")); + } +#endif +#endif + //////////////////////////////////////////////////// + + enum decisionAlgoType { defaultAlgo, + unscaled_cut, + scaled_cut, + scaled_cut_symmetric }; + + decisionAlgoType distanceLaplacianAlgo = defaultAlgo; + decisionAlgoType classicalAlgo = defaultAlgo; + if (algo == "distance laplacian") { + if (distanceLaplacianAlgoStr == "default") + distanceLaplacianAlgo = defaultAlgo; + else if (distanceLaplacianAlgoStr == "unscaled cut") + distanceLaplacianAlgo = unscaled_cut; + else if (distanceLaplacianAlgoStr == "scaled cut") + distanceLaplacianAlgo = scaled_cut; + else if (distanceLaplacianAlgoStr == "scaled cut symmetric") + distanceLaplacianAlgo = scaled_cut_symmetric; + else + TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" << distanceLaplacianAlgoStr << "\""); + GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + } else if (algo == "classical") { + if (classicalAlgoStr == "default") + classicalAlgo = defaultAlgo; + else if (classicalAlgoStr == "unscaled cut") + classicalAlgo = unscaled_cut; + else if (classicalAlgoStr == "scaled cut") + classicalAlgo = scaled_cut; + else + TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: classical algo\" must be one of (default|unscaled cut|scaled cut), not \"" << classicalAlgoStr << "\""); + GetOStream(Runtime0) << "algorithm = \"" << algo << "\" classical algorithm = \"" << classicalAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + + } else + GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + Set(currentLevel, "Filtering", (threshold != STS::zero())); + + const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as(pL.get("aggregation: Dirichlet threshold"))); + + // NOTE: We don't support signed classical RS or SA with cut drop at present + TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalRS && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical aggregation"); + TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalSA && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical sa aggregation"); + + GO numDropped = 0, numTotal = 0; + std::string graphType = "unamalgamated"; // for description purposes only + + /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme. + BlockSize is the number of storage blocks that must kept together during the amalgamation process. + + Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold: + + numPDEs = BlockSize * storageblocksize. + + If numPDEs==1 + Matrix is point storage (classical CRS storage). storageblocksize=1 and BlockSize=1 + No other values makes sense. + + If numPDEs>1 + If matrix uses point storage, then storageblocksize=1 and BlockSize=numPDEs. + If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n. + Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested. + */ + TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); + const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); + /************************** RS or SA-style Classical Dropping (and variants) **************************/ if (algo == "classical") { if (predrop_ == null) { @@ -506,7 +2155,7 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { size_t nnz = A->getNumEntriesInLocalRow(row); bool rowIsDirichlet = boundaryNodes[row]; ArrayView indices; @@ -573,11 +2222,11 @@ void CoalesceDropFactory::Build(Level rows(row + 1) = realnnz; } } else { - /* Cut Algorithm */ + /* Cut Algorithm */ // CMS using DropTol = Details::DropTol; std::vector drop_vec; - drop_vec.reserve(nnz); + drop_vec.reserve(nnz); const real_type zero = Teuchos::ScalarTraits::zero(); const real_type one = Teuchos::ScalarTraits::one(); LO rownnz = 0; @@ -1594,7 +3243,7 @@ void CoalesceDropFactory::Build(Level } // if (doExperimentalWrap) ... else ... -} // Build +} // BuildKokkos template void CoalesceDropFactory::MergeRows(const Matrix& A, const LO row, Array& cols, const Array& translation) const { From 105e33e71f5ab985b6559fcc047b4b596336efb1 Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Mon, 22 Jul 2024 12:19:34 -0600 Subject: [PATCH 2/7] MueLu: Cut Drop Memory Optimization DropTol structure in algorithm replaced with new, smaller DropTolKokkos structure. Computations are now done on the fly. Code passes current unit tests. No significant change in speed. Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 130 ++++++++++++------ 1 file changed, 86 insertions(+), 44 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index a8befaea592b..eeb3f91dbfd6 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -94,35 +94,48 @@ namespace MueLu { namespace Details { template struct DropTol { - KOKKOS_INLINE_FUNCTION //NEW DropTol() = default; - KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol const&) = default; - KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol&&) = default; DropTol& operator=(DropTol const&) = default; DropTol& operator=(DropTol&&) = default; - KOKKOS_INLINE_FUNCTION //NEW DropTol(real_type val_, real_type diag_, LO col_, bool drop_) : val{val_} , diag{diag_} , col{col_} , drop{drop_} {} - real_type val{0}; - real_type diag{0}; - LO col{-1}; - //NEW Can't run these host functions on device - //real_type val{Teuchos::ScalarTraits::zero()}; - //real_type diag{Teuchos::ScalarTraits::zero()}; - //LO col{Teuchos::OrdinalTraits::invalid()}; + real_type val{Teuchos::ScalarTraits::zero()}; + real_type diag{Teuchos::ScalarTraits::zero()}; + LO col{Teuchos::OrdinalTraits::invalid()}; bool drop{true}; // CMS: Auxillary information for debugging info // real_type aux_val {Teuchos::ScalarTraits::nan()}; }; + +template +struct DropTolKokkos { + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos() = default; + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos(DropTolKokkos const&) = default; + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos(DropTolKokkos&&) = default; + + DropTolKokkos& operator=(DropTolKokkos const&) = default; + DropTolKokkos& operator=(DropTolKokkos&&) = default; + + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos(LO col_, bool drop_) + : col{col_} + , drop{drop_} {} + + LO col{-1}; + LO drop{true}; +}; } // namespace Details template @@ -767,7 +780,7 @@ void CoalesceDropFactory::Build(Level using ExecSpace = typename Node::execution_space; using TeamPol = Kokkos::TeamPolicy; using TeamMem = typename TeamPol::member_type; - using DropTol = Details::DropTol; + using DropTolKokkos = Details::DropTolKokkos; //move from host to device ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); @@ -779,7 +792,7 @@ void CoalesceDropFactory::Build(Level int algorithm = classicalAlgo; Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); //stackedTimer->stop("init"); //stackedTimer->start("loop"); @@ -790,74 +803,103 @@ void CoalesceDropFactory::Build(Level size_t n = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + //find magnitudes Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) { LO col = rowView.colidx(colID); if(row == col) { - drop_view(colID) = DropTol(0, 1, colID, false); + drop_view(colID) = DropTolKokkos(colID, true); count++; } //Don't aggregate boundaries else if(!boundaryNodesDevice(colID)) { - typename STS::magnitudeType aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(col) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| - typename STS::magnitudeType aij = static_cast(std::fabs(static_cast(rowView.value(colID) * rowView.value(colID)))); // |a_i j|^2 - drop_view(colID) = DropTol(aij, aiiajj, colID, false); + drop_view(colID) = DropTolKokkos(colID, false); count++; } }, n); + + size_t dropStart = n; if (algorithm == unscaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { - return a.val > b.val; + Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { + if(x.drop || y.drop) { + return x.drop < y.drop; + } + else { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + return x_aij > y_aij; + } }); //find index where dropping starts - size_t dropStart; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { auto const& x = drop_view(i - 1); auto const& y = drop_view(i); - auto a = x.val; - auto b = y.val; - if(a > realThreshold * b) { + typename STS::magnitudeType x_aij = 0; + typename STS::magnitudeType y_aij = 0; + if(!x.drop) { + x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + } + if(!y.drop) { + y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + } + + if(x_aij > realThreshold * y_aij) { if(i < min) { min = i; } } }, Kokkos::Min(dropStart)); - - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(i).drop = true; - }); - } } else if (algorithm == scaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; + Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { + if(x.drop || y.drop) { + return x.drop < y.drop; + } + else { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + return x_aij / x_aiiajj > y_aij / y_aiiajj; + } }); + //find index where dropping starts - size_t dropStart; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { auto const& x = drop_view(i - 1); auto const& y = drop_view(i); - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if(a > realThreshold * b) { + typename STS::magnitudeType x_val = 0; + typename STS::magnitudeType y_val = 0; + if(!x.drop) { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + x_val = x_aij / x_aiiajj; + } + if(!y.drop) { + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + y_val = y_aij / y_aiiajj; + } + + if(x_val > realThreshold * y_val) { if(i < min) { min = i; } } }, Kokkos::Min(dropStart)); - - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(i).drop = true; - }); - } } - Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { - return a.col < b.col; + + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(i).drop = true; + }); + } + + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTolKokkos const& a, DropTolKokkos const& b) { + return a.col < b.col; }); - + LO rownnz = 0; GO rowDropped = 0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { From cdee728089dcf993f67cf194dbc51575a4a766e5 Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Mon, 22 Jul 2024 19:00:22 -0600 Subject: [PATCH 3/7] MueLu: Sorting Now Resembles numpy.argsort Per Christian's request. DropTolKokkos structure removed and replaced with view indices and view of drop flags. ORIGINAL code removed. BuildKokkos removed. Removed commented out timers. Added comments. Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_decl.hpp | 1 - .../MueLu_CoalesceDropFactory_def.hpp | 1736 +---------------- 2 files changed, 53 insertions(+), 1684 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp index db5e9a291313..96b5e778f6bc 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp @@ -160,7 +160,6 @@ class CoalesceDropFactory : public SingleLevelFactoryBase { //@} void Build(Level& currentLevel) const; // Build - void BuildKokkos(Level& currentLevel) const; private: // pre-drop function diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index eeb3f91dbfd6..da606ab20ff6 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -61,8 +61,8 @@ #include -#include //NEW -#include //NEW +#include +#include #include "MueLu_CoalesceDropFactory_decl.hpp" #include "MueLu_AmalgamationFactory.hpp" @@ -115,27 +115,6 @@ struct DropTol { // CMS: Auxillary information for debugging info // real_type aux_val {Teuchos::ScalarTraits::nan()}; }; - -template -struct DropTolKokkos { - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos() = default; - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos(DropTolKokkos const&) = default; - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos(DropTolKokkos&&) = default; - - DropTolKokkos& operator=(DropTolKokkos const&) = default; - DropTolKokkos& operator=(DropTolKokkos&&) = default; - - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos(LO col_, bool drop_) - : col{col_} - , drop{drop_} {} - - LO col{-1}; - LO drop{true}; -}; } // namespace Details template @@ -529,180 +508,6 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; -#define NEW -#ifdef ORIGINAL - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - size_t nnz = A->getNumEntriesInLocalRow(row); - bool rowIsDirichlet = boundaryNodes[row]; - ArrayView indices; - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - if (classicalAlgo == defaultAlgo) { - // FIXME the current predrop function uses the following - // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) - // FIXME but the threshold doesn't take into account the rows' diagonal entries - // FIXME For now, hardwiring the dropping in here - - LO rownnz = 0; - if (useSignedClassicalRS) { - // Signed classical RS style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); - MT neg_aij = -STS::real(vals[colID]); - /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], - g_block_id.is_null() ? -1 : g_block_id[row], - g_block_id.is_null() ? -1 : g_block_id[col], - neg_aij, max_neg_aik);*/ - if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { - columns[realnnz++] = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } else if (useSignedClassicalSA) { - // Signed classical SA style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - - bool is_nonpositive = STS::real(vals[colID]) <= 0; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 - /* - if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], - vals[colID],aij, aiiajj); - */ - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows[row + 1] = realnnz; - } else { - // Standard abs classical - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } - } else { - /* Cut Algorithm */ - // CMS - using DropTol = Details::DropTol; - std::vector drop_vec; - drop_vec.reserve(nnz); - const real_type zero = Teuchos::ScalarTraits::zero(); - const real_type one = Teuchos::ScalarTraits::one(); - LO rownnz = 0; - // NOTE: This probably needs to be fixed for rowsum - - // find magnitudes - for (LO colID = 0; colID < (LO)nnz; colID++) { - LO col = indices[colID]; - if (row == col) { - drop_vec.emplace_back(zero, one, colID, false); - continue; - } - - // Don't aggregate boundaries - if (boundaryNodes[colID]) continue; - typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - typename STS::magnitudeType aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - drop_vec.emplace_back(aij, aiiajj, colID, false); - } - - const size_t n = drop_vec.size(); - - if (classicalAlgo == unscaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val > b.val; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val; - auto b = y.val; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } else if (classicalAlgo == scaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; - }); - bool drop = false; - // printf("[%d] Scaled Cut: ",(int)row); - // printf("%3d(%4s) ",indices[drop_vec[0].col],"keep"); - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if (a > realThreshold * b) { - drop = true; - -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - // printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep"); - } - drop_vec[i].drop = drop; - } - // printf("\n"); - } - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.col < b.col; - }); - - for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { - LO col = indices[drop_vec[idxID].col]; - // don't drop diagonal - if (row == col) { - columns[realnnz++] = col; - rownnz++; - continue; - } - - if (!drop_vec[idxID].drop) { - columns[realnnz++] = col; - rownnz++; - } else { - numDropped++; - } - } - // CMS - rows[row + 1] = realnnz; - } - } // end for row -#endif - -#ifdef NEW if(classicalAlgo == defaultAlgo) { SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { @@ -772,15 +577,11 @@ void CoalesceDropFactory::Build(Level } } // end for row } - else { //NEW START - //auto stackedTimer = rcp(new Teuchos::StackedTimer("timer")); - //Teuchos::TimeMonitor::setStackedTimer(stackedTimer); - //stackedTimer->start("init"); + else { SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); using ExecSpace = typename Node::execution_space; using TeamPol = Kokkos::TeamPolicy; using TeamMem = typename TeamPol::member_type; - using DropTolKokkos = Details::DropTolKokkos; //move from host to device ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); @@ -792,10 +593,9 @@ void CoalesceDropFactory::Build(Level int algorithm = classicalAlgo; Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); - //stackedTimer->stop("init"); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + auto index_views = Kokkos::View("index_views", A_device.nnz()); - //stackedTimer->start("loop"); Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { LO row = teamMember.league_rank(); auto rowView = A_device.row(row); @@ -803,45 +603,52 @@ void CoalesceDropFactory::Build(Level size_t n = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - + auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + //find magnitudes - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { + index_view(colID) = colID; LO col = rowView.colidx(colID); + //ignore diagonals for now, they are checked again later if(row == col) { - drop_view(colID) = DropTolKokkos(colID, true); + drop_view(colID) = true; count++; } //Don't aggregate boundaries - else if(!boundaryNodesDevice(colID)) { - drop_view(colID) = DropTolKokkos(colID, false); + else if(boundaryNodesDevice(colID)) { + drop_view(colID) = true; + } + else { + drop_view(colID) = false; count++; } }, n); size_t dropStart = n; if (algorithm == unscaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { - if(x.drop || y.drop) { - return x.drop < y.drop; + //push diagonals and boundaries to the right, sort everything else by aij on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); } else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); return x_aij > y_aij; } }); //find index where dropping starts Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = drop_view(i - 1); - auto const& y = drop_view(i); + auto const& x = index_view(i - 1); + auto const& y = index_view(i); typename STS::magnitudeType x_aij = 0; typename STS::magnitudeType y_aij = 0; - if(!x.drop) { - x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + if(!drop_view(x)) { + x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); } - if(!y.drop) { - y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + if(!drop_view(y)) { + y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); } if(x_aij > realThreshold * y_aij) { @@ -851,34 +658,34 @@ void CoalesceDropFactory::Build(Level } }, Kokkos::Min(dropStart)); } else if (algorithm == scaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { - if(x.drop || y.drop) { - return x.drop < y.drop; + //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); } else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); return x_aij / x_aiiajj > y_aij / y_aiiajj; } }); - //find index where dropping starts Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = drop_view(i - 1); - auto const& y = drop_view(i); + auto const& x = index_view(i - 1); + auto const& y = index_view(i); typename STS::magnitudeType x_val = 0; typename STS::magnitudeType y_val = 0; - if(!x.drop) { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + if(!drop_view(x)) { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); x_val = x_aij / x_aiiajj; } - if(!y.drop) { - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + if(!drop_view(y)) { + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); y_val = y_aij / y_aiiajj; } @@ -890,22 +697,19 @@ void CoalesceDropFactory::Build(Level }, Kokkos::Min(dropStart)); } + //drop everything to the right of where values stop passing threshold if(dropStart < n) { Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(i).drop = true; + drop_view(index_view(i)) = true; }); } - Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTolKokkos const& a, DropTolKokkos const& b) { - return a.col < b.col; - }); - LO rownnz = 0; GO rowDropped = 0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { LO col = rowView.colidx(idxID); //don't drop diagonal - if(row == col || !drop_view(idxID).drop) { + if(row == col || !drop_view(idxID)) { keep++; } else { @@ -913,1459 +717,25 @@ void CoalesceDropFactory::Build(Level drop++; } }, rownnz, rowDropped); + globalnnz += rownnz; totalDropped += rowDropped; rownnzView(row) = rownnz; }, realnnz, numDropped); - //stackedTimer->stop("loop"); - - //stackedTimer->start("remove"); - + + //update column indices so that kept indices are aligned to the left for subview that happens later on auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); Kokkos::deep_copy(columns, columnsDevice); - //stackedTimer->stop("remove"); - - //update row indices - //stackedTimer->start("scan"); + //update row indices by adding up new # of nnz in each row auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { partial_sum += rownnzView(i); if(is_final) rowsDevice(i+1) = partial_sum; }); Kokkos::deep_copy(rows, rowsDevice); - //stackedTimer->stop("scan"); - - //stackedTimer->stop("timer"); - //stackedTimer->report(std::cout, Teuchos::DefaultComm::getComm()); - } //NEW END -#endif - - numTotal = A->getLocalNumEntries(); - - if (aggregationMayCreateDirichlet) { - // If the only element remaining after filtering is diagonal, mark node as boundary - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - if (rows[row + 1] - rows[row] <= 1) - boundaryNodes[row] = true; - } - } - - RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), A->getRowMap(), A->getColMap(), "thresholded graph of A")); - graph->SetBoundaryNodeMap(boundaryNodes); - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < boundaryNodes.size(); ++i) - if (boundaryNodes(i)) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", 1); - - // If we're doing signed classical, we might want to block-diagonalize *after* the dropping - if (generateColoringGraph) { - RCP colorGraph; - RCP importer = A->getCrsGraph()->getImporter(); - BlockDiagonalizeGraph(graph, ghostedBlockNumber, colorGraph, importer); - Set(currentLevel, "Coloring Graph", colorGraph); - // #define CMS_DUMP -#ifdef CMS_DUMP - { - Xpetra::IO::Write("m_regular_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(graph)->GetCrsGraph()); - Xpetra::IO::Write("m_color_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(colorGraph)->GetCrsGraph()); - // int rank = graph->GetDomainMap()->getComm()->getRank(); - // { - // std::ofstream ofs(std::string("m_color_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); - // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); - // colorGraph->print(*fancy,Debug); - // } - // { - // std::ofstream ofs(std::string("m_regular_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); - // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); - // graph->print(*fancy,Debug); - // } - } -#endif - } // end generateColoringGraph - } else if (BlockSize > 1 && threshold == STS::zero()) { - // Case 3: Multiple DOF/node problem without dropping - const RCP rowMap = A->getRowMap(); - const RCP colMap = A->getColMap(); - - graphType = "amalgamated"; - - // build node row map (uniqueMap) and node column map (nonUniqueMap) - // the arrays rowTranslation and colTranslation contain the local node id - // given a local dof id. The data is calculated by the AmalgamationFactory and - // stored in the variable container "UnAmalgamationInfo" - RCP uniqueMap = amalInfo->getNodeRowMap(); - RCP nonUniqueMap = amalInfo->getNodeColMap(); - Array rowTranslation = *(amalInfo->getRowTranslation()); - Array colTranslation = *(amalInfo->getColTranslation()); - - // get number of local nodes - LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); - - // Allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); - Kokkos::deep_copy(amalgBoundaryNodes, false); - - // Detect and record rows that correspond to Dirichlet boundary conditions - // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size - // TODO the array one bigger than the number of local rows, and the last entry can - // TODO hold the actual number of boundary nodes. Clever, huh? - ArrayRCP pointBoundaryNodes; - pointBoundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); - if (rowSumTol > 0.) - Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes); - - // extract striding information - LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) - LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) - LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map - if (A->IsView("stridedMaps") == true) { - Teuchos::RCP myMap = A->getRowMap("stridedMaps"); - Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); - blkSize = Teuchos::as(strMap->getFixedBlockSize()); - blkId = strMap->getStridedBlockId(); - if (blkId > -1) - blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); - } - - // loop over all local nodes - LO realnnz = 0; - rows(0) = 0; - Array indicesExtra; - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - indicesExtra.resize(0); - - // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet - // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). - // Therefore, looping over all dofs is fine here. We use blkPartSize as we work - // with local ids. - // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) - // node. - bool isBoundary = false; - if (pL.get("aggregation: greedy Dirichlet") == true) { - for (LO j = 0; j < blkPartSize; j++) { - if (pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = true; - break; - } - } - } else { - isBoundary = true; - for (LO j = 0; j < blkPartSize; j++) { - if (!pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = false; - break; - } - } - } - - // Merge rows of A - // The array indicesExtra contains local column node ids for the current local node "row" - if (!isBoundary) - MergeRows(*A, row, indicesExtra, colTranslation); - else - indicesExtra.push_back(row); - indices = indicesExtra; - numTotal += indices.size(); - - // add the local column node ids to the full columns array which - // contains the local column node ids for all local node rows - LO nnz = indices.size(), rownnz = 0; - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - columns(realnnz++) = col; - rownnz++; - } - - if (rownnz == 1) { - // If the only element remaining after filtering is diagonal, mark node as boundary - // FIXME: this should really be replaced by the following - // if (indices.size() == 1 && indices[0] == row) - // boundaryNodes[row] = true; - // We do not do it this way now because there is no framework for distinguishing isolated - // and boundary nodes in the aggregation algorithms - amalgBoundaryNodes[row] = true; - } - rows(row + 1) = realnnz; - } // for (LO row = 0; row < numRows; row++) - - RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes - << " agglomerated Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", blkSize); // full block size - - } else if (BlockSize > 1 && threshold != STS::zero()) { - // Case 4: Multiple DOF/node problem with dropping - const RCP rowMap = A->getRowMap(); - const RCP colMap = A->getColMap(); - graphType = "amalgamated"; - - // build node row map (uniqueMap) and node column map (nonUniqueMap) - // the arrays rowTranslation and colTranslation contain the local node id - // given a local dof id. The data is calculated by the AmalgamationFactory and - // stored in the variable container "UnAmalgamationInfo" - RCP uniqueMap = amalInfo->getNodeRowMap(); - RCP nonUniqueMap = amalInfo->getNodeColMap(); - Array rowTranslation = *(amalInfo->getRowTranslation()); - Array colTranslation = *(amalInfo->getColTranslation()); - - // get number of local nodes - LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); - - // Allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); - Kokkos::deep_copy(amalgBoundaryNodes, false); - - // Detect and record rows that correspond to Dirichlet boundary conditions - // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size - // TODO the array one bigger than the number of local rows, and the last entry can - // TODO hold the actual number of boundary nodes. Clever, huh? - auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); - - // extract striding information - LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) - LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) - LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map - if (A->IsView("stridedMaps") == true) { - Teuchos::RCP myMap = A->getRowMap("stridedMaps"); - Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); - blkSize = Teuchos::as(strMap->getFixedBlockSize()); - blkId = strMap->getStridedBlockId(); - if (blkId > -1) - blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); - } - - // extract diagonal data for dropping strategy - RCP ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - const ArrayRCP ghostedDiagVals = ghostedDiag->getData(0); - - // loop over all local nodes - LO realnnz = 0; - rows[0] = 0; - Array indicesExtra; - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - indicesExtra.resize(0); - - // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet - // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). - // Therefore, looping over all dofs is fine here. We use blkPartSize as we work - // with local ids. - // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) - // node. - bool isBoundary = false; - if (pL.get("aggregation: greedy Dirichlet") == true) { - for (LO j = 0; j < blkPartSize; j++) { - if (pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = true; - break; - } - } - } else { - isBoundary = true; - for (LO j = 0; j < blkPartSize; j++) { - if (!pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = false; - break; - } - } - } - - // Merge rows of A - // The array indicesExtra contains local column node ids for the current local node "row" - if (!isBoundary) - MergeRowsWithDropping(*A, row, ghostedDiagVals, threshold, indicesExtra, colTranslation); - else - indicesExtra.push_back(row); - indices = indicesExtra; - numTotal += indices.size(); - - // add the local column node ids to the full columns array which - // contains the local column node ids for all local node rows - LO nnz = indices.size(), rownnz = 0; - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - columns[realnnz++] = col; - rownnz++; - } - - if (rownnz == 1) { - // If the only element remaining after filtering is diagonal, mark node as boundary - // FIXME: this should really be replaced by the following - // if (indices.size() == 1 && indices[0] == row) - // boundaryNodes[row] = true; - // We do not do it this way now because there is no framework for distinguishing isolated - // and boundary nodes in the aggregation algorithms - amalgBoundaryNodes[row] = true; - } - rows[row + 1] = realnnz; - } // for (LO row = 0; row < numRows; row++) - // columns.resize(realnnz); - - RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes - << " agglomerated Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", blkSize); // full block size - } - - } else if (algo == "distance laplacian") { - LO blkSize = A->GetFixedBlockSize(); - GO indexBase = A->getRowMap()->getIndexBase(); - // [*0*] : FIXME - // ap: somehow, if I move this line to [*1*], Belos throws an error - // I'm not sure what's going on. Do we always have to Get data, if we did - // DeclareInput for it? - // RCP Coords = Get< RCP >(currentLevel, "Coordinates"); - - // Detect and record rows that correspond to Dirichlet boundary conditions - // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size - // TODO the array one bigger than the number of local rows, and the last entry can - // TODO hold the actual number of boundary nodes. Clever, huh? - auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); - - if ((blkSize == 1) && (threshold == STS::zero())) { - // Trivial case: scalar problem, no dropping. Can return original graph - RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); - graph->SetBoundaryNodeMap(pointBoundaryNodes); - graphType = "unamalgamated"; - numTotal = A->getLocalNumEntries(); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < pointBoundaryNodes.size(); ++i) - if (pointBoundaryNodes(i)) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "DofsPerNode", blkSize); - Set(currentLevel, "Graph", graph); - - } else { - // ap: We make quite a few assumptions here; general case may be a lot different, - // but much much harder to implement. We assume that: - // 1) all maps are standard maps, not strided maps - // 2) global indices of dofs in A are related to dofs in coordinates in a simple arithmetic - // way: rows i*blkSize, i*blkSize+1, ..., i*blkSize + (blkSize-1) correspond to node i - // - // NOTE: Potentially, some of the code below could be simplified with UnAmalgamationInfo, - // but as I totally don't understand that code, here is my solution - - // [*1*]: see [*0*] - - // Check that the number of local coordinates is consistent with the #rows in A - TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() / blkSize != Coords->getLocalLength(), Exceptions::Incompatible, - "Coordinate vector length (" << Coords->getLocalLength() << ") is incompatible with number of rows in A (" << A->getRowMap()->getLocalNumElements() << ") by modulo block size (" << blkSize << ")."); - - const RCP colMap = A->getColMap(); - RCP uniqueMap, nonUniqueMap; - Array colTranslation; - if (blkSize == 1) { - uniqueMap = A->getRowMap(); - nonUniqueMap = A->getColMap(); - graphType = "unamalgamated"; - - } else { - uniqueMap = Coords->getMap(); - TEUCHOS_TEST_FOR_EXCEPTION(uniqueMap->getIndexBase() != indexBase, Exceptions::Incompatible, - "Different index bases for matrix and coordinates"); - - AmalgamationFactory::AmalgamateMap(*(A->getColMap()), *A, nonUniqueMap, colTranslation); - - graphType = "amalgamated"; - } - LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); - - RCP ghostedCoords; - RCP ghostedLaplDiag; - Teuchos::ArrayRCP ghostedLaplDiagData; - if (threshold != STS::zero()) { - // Get ghost coordinates - RCP importer; - { - SubFactoryMonitor m1(*this, "Import construction", currentLevel); - if (blkSize == 1 && realA->getCrsGraph()->getImporter() != Teuchos::null) { - GetOStream(Warnings1) << "Using existing importer from matrix graph" << std::endl; - importer = realA->getCrsGraph()->getImporter(); - } else { - GetOStream(Warnings0) << "Constructing new importer instance" << std::endl; - importer = ImportFactory::Build(uniqueMap, nonUniqueMap); - } - } // subtimer - ghostedCoords = Xpetra::MultiVectorFactory::Build(nonUniqueMap, Coords->getNumVectors()); - { - SubFactoryMonitor m1(*this, "Coordinate import", currentLevel); - ghostedCoords->doImport(*Coords, *importer, Xpetra::INSERT); - } // subtimer - - // Construct Distance Laplacian diagonal - RCP localLaplDiag = VectorFactory::Build(uniqueMap); - Array indicesExtra; - Teuchos::Array> coordData; - if (threshold != STS::zero()) { - const size_t numVectors = ghostedCoords->getNumVectors(); - coordData.reserve(numVectors); - for (size_t j = 0; j < numVectors; j++) { - Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); - coordData.push_back(tmpData); - } - } - { - SubFactoryMonitor m1(*this, "Laplacian local diagonal", currentLevel); - ArrayRCP localLaplDiagData = localLaplDiag->getDataNonConst(0); - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - - if (blkSize == 1) { - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - } else { - // Merge rows of A - indicesExtra.resize(0); - MergeRows(*A, row, indicesExtra, colTranslation); - indices = indicesExtra; - } - - LO nnz = indices.size(); - bool haveAddedToDiag = false; - for (LO colID = 0; colID < nnz; colID++) { - const LO col = indices[colID]; - - if (row != col) { - if (use_dlap_weights == SINGLE_WEIGHTS) { - /*printf("[%d,%d] Unweighted Distance = %6.4e Weighted Distance = %6.4e\n",row,col, - MueLu::Utilities::Distance2(coordData, row, col), - MueLu::Utilities::Distance2(dlap_weights(),coordData, row, col));*/ - localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); - } else if (use_dlap_weights == BLOCK_WEIGHTS) { - int block_id = row % interleaved_blocksize; - int block_start = block_id * interleaved_blocksize; - localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); - } else { - // printf("[%d,%d] Unweighted Distance = %6.4e\n",row,col,MueLu::Utilities::Distance2(coordData, row, col)); - localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(coordData, row, col); - } - haveAddedToDiag = true; - } - } - // Deal with the situation where boundary conditions have only been enforced on rows, but not on columns. - // We enforce dropping of these entries by assigning a very large number to the diagonal entries corresponding to BCs. - if (!haveAddedToDiag) - localLaplDiagData[row] = STS::rmax(); - } - } // subtimer - { - SubFactoryMonitor m1(*this, "Laplacian distributed diagonal", currentLevel); - ghostedLaplDiag = VectorFactory::Build(nonUniqueMap); - ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT); - ghostedLaplDiagData = ghostedLaplDiag->getDataNonConst(0); - } // subtimer - - } else { - GetOStream(Runtime0) << "Skipping distance laplacian construction due to 0 threshold" << std::endl; - } - - // NOTE: ghostedLaplDiagData might be zero if we don't actually calculate the laplacian - - // allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - -#ifdef HAVE_MUELU_DEBUG - // DEBUGGING - for (LO i = 0; i < (LO)columns.size(); i++) columns[i] = -666; -#endif - - // Extra array for if we're allowing symmetrization with cutting - ArrayRCP rows_stop; - bool use_stop_array = threshold != STS::zero() && distanceLaplacianAlgo == scaled_cut_symmetric; - if (use_stop_array) - // rows_stop = typename LWGraph::row_type::non_const_type("rows_stop", numRows); - rows_stop.resize(numRows); - - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); - Kokkos::deep_copy(amalgBoundaryNodes, false); - - LO realnnz = 0; - rows(0) = 0; - - Array indicesExtra; - { - SubFactoryMonitor m1(*this, "Laplacian dropping", currentLevel); - Teuchos::Array> coordData; - if (threshold != STS::zero()) { - const size_t numVectors = ghostedCoords->getNumVectors(); - coordData.reserve(numVectors); - for (size_t j = 0; j < numVectors; j++) { - Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); - coordData.push_back(tmpData); - } - } - - ArrayView vals; // CMS hackery - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - indicesExtra.resize(0); - bool isBoundary = false; - - if (blkSize == 1) { - // ArrayView vals;//CMS uncomment - A->getLocalRowView(row, indices, vals); - isBoundary = pointBoundaryNodes[row]; - } else { - // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet - for (LO j = 0; j < blkSize; j++) { - if (!pointBoundaryNodes[row * blkSize + j]) { - isBoundary = false; - break; - } - } - - // Merge rows of A - if (!isBoundary) - MergeRows(*A, row, indicesExtra, colTranslation); - else - indicesExtra.push_back(row); - indices = indicesExtra; - } - numTotal += indices.size(); - - LO nnz = indices.size(), rownnz = 0; - - if (use_stop_array) { - rows(row + 1) = rows(row) + nnz; - realnnz = rows(row); - } - - if (threshold != STS::zero()) { - // default - if (distanceLaplacianAlgo == defaultAlgo) { - /* Standard Distance Laplacian */ - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - - if (row == col) { - columns(realnnz++) = col; - rownnz++; - continue; - } - - // We do not want the distance Laplacian aggregating boundary nodes - if (isBoundary) continue; - - SC laplVal; - if (use_dlap_weights == SINGLE_WEIGHTS) { - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); - } else if (use_dlap_weights == BLOCK_WEIGHTS) { - int block_id = row % interleaved_blocksize; - int block_start = block_id * interleaved_blocksize; - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); - } else { - laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); - } - real_type aiiajj = STS::magnitude(realThreshold * realThreshold * ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); - real_type aij = STS::magnitude(laplVal * laplVal); - - if (aij > aiiajj) { - columns(realnnz++) = col; - rownnz++; - } else { - numDropped++; - } - } - } else { - /* Cut Algorithm */ - using DropTol = Details::DropTol; - std::vector drop_vec; - drop_vec.reserve(nnz); - const real_type zero = Teuchos::ScalarTraits::zero(); - const real_type one = Teuchos::ScalarTraits::one(); - - // find magnitudes - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - - if (row == col) { - drop_vec.emplace_back(zero, one, colID, false); - continue; - } - // We do not want the distance Laplacian aggregating boundary nodes - if (isBoundary) continue; - - SC laplVal; - if (use_dlap_weights == SINGLE_WEIGHTS) { - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); - } else if (use_dlap_weights == BLOCK_WEIGHTS) { - int block_id = row % interleaved_blocksize; - int block_start = block_id * interleaved_blocksize; - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); - } else { - laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); - } - - real_type aiiajj = STS::magnitude(ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); - real_type aij = STS::magnitude(laplVal * laplVal); - - drop_vec.emplace_back(aij, aiiajj, colID, false); - } - - const size_t n = drop_vec.size(); - - if (distanceLaplacianAlgo == unscaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val > b.val; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val; - auto b = y.val; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } else if (distanceLaplacianAlgo == scaled_cut || distanceLaplacianAlgo == scaled_cut_symmetric) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } - - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.col < b.col; - }); - - for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { - LO col = indices[drop_vec[idxID].col]; - - // don't drop diagonal - if (row == col) { - columns(realnnz++) = col; - rownnz++; - // printf("(%d,%d) KEEP %13s matrix = %6.4e\n",row,row,"DIAGONAL",drop_vec[idxID].aux_val); - continue; - } - - if (!drop_vec[idxID].drop) { - columns(realnnz++) = col; - // printf("(%d,%d) KEEP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); - rownnz++; - } else { - // printf("(%d,%d) DROP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); - numDropped++; - } - } - } - } else { - // Skip laplace calculation and threshold comparison for zero threshold - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - columns(realnnz++) = col; - rownnz++; - } - } - - if (rownnz == 1) { - // If the only element remaining after filtering is diagonal, mark node as boundary - // FIXME: this should really be replaced by the following - // if (indices.size() == 1 && indices[0] == row) - // boundaryNodes[row] = true; - // We do not do it this way now because there is no framework for distinguishing isolated - // and boundary nodes in the aggregation algorithms - amalgBoundaryNodes[row] = true; - } - - if (use_stop_array) - rows_stop[row] = rownnz + rows[row]; - else - rows[row + 1] = realnnz; - } // for (LO row = 0; row < numRows; row++) - - } // subtimer - - if (use_stop_array) { - // Do symmetrization of the cut matrix - // NOTE: We assume nested row/column maps here - for (LO row = 0; row < numRows; row++) { - for (LO colidx = rows[row]; colidx < rows_stop[row]; colidx++) { - LO col = columns[colidx]; - if (col >= numRows) continue; - - bool found = false; - for (LO t_col = rows(col); !found && t_col < rows_stop[col]; t_col++) { - if (columns[t_col] == row) - found = true; - } - // We didn't find the transpose buddy, so let's symmetrize, unless we'd be symmetrizing - // into a Dirichlet unknown. In that case don't. - if (!found && !pointBoundaryNodes[col] && Teuchos::as(rows_stop[col]) < rows[col + 1]) { - LO new_idx = rows_stop[col]; - // printf("(%d,%d) SYMADD entry\n",col,row); - columns[new_idx] = row; - rows_stop[col]++; - numDropped--; - } - } - } - - // Condense everything down - LO current_start = 0; - for (LO row = 0; row < numRows; row++) { - LO old_start = current_start; - for (LO col = rows(row); col < rows_stop[row]; col++) { - if (current_start != col) { - columns(current_start) = columns(col); - } - current_start++; - } - rows[row] = old_start; - } - rows(numRows) = realnnz = current_start; - } - - RCP graph; - { - SubFactoryMonitor m1(*this, "Build amalgamated graph", currentLevel); - graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - } // subtimer - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " agglomerated Dirichlet nodes" - << " using threshold " << dirichletThreshold << std::endl; - } - - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", blkSize); - } - } - - if ((GetVerbLevel() & Statistics1) && !(A->GetFixedBlockSize() > 1 && threshold != STS::zero())) { - RCP> comm = A->getRowMap()->getComm(); - GO numGlobalTotal, numGlobalDropped; - MueLu_sumAll(comm, numTotal, numGlobalTotal); - MueLu_sumAll(comm, numDropped, numGlobalDropped); - GetOStream(Statistics1) << "Number of dropped entries in " << graphType << " matrix graph: " << numGlobalDropped << "/" << numGlobalTotal; - if (numGlobalTotal != 0) - GetOStream(Statistics1) << " (" << 100 * Teuchos::as(numGlobalDropped) / Teuchos::as(numGlobalTotal) << "%)"; - GetOStream(Statistics1) << std::endl; - } - - } else { - // what Tobias has implemented - - SC threshold = as(pL.get("aggregation: drop tol")); - // GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - GetOStream(Runtime0) << "algorithm = \"" - << "failsafe" - << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - Set(currentLevel, "Filtering", (threshold != STS::zero())); - - RCP rowMap = A->getRowMap(); - RCP colMap = A->getColMap(); - - LO blockdim = 1; // block dim for fixed size blocks - GO indexBase = rowMap->getIndexBase(); // index base of maps - GO offset = 0; - - // 1) check for blocking/striding information - if (A->IsView("stridedMaps") && - Teuchos::rcp_dynamic_cast(A->getRowMap("stridedMaps")) != Teuchos::null) { - Xpetra::viewLabel_t oldView = A->SwitchToView("stridedMaps"); // note: "stridedMaps are always non-overlapping (correspond to range and domain maps!) - RCP strMap = Teuchos::rcp_dynamic_cast(A->getRowMap()); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null, Exceptions::BadCast, "MueLu::CoalesceFactory::Build: cast to strided row map failed."); - blockdim = strMap->getFixedBlockSize(); - offset = strMap->getOffset(); - oldView = A->SwitchToView(oldView); - GetOStream(Statistics1) << "CoalesceDropFactory::Build():" - << " found blockdim=" << blockdim << " from strided maps. offset=" << offset << std::endl; - } else - GetOStream(Statistics1) << "CoalesceDropFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl; - - // 2) get row map for amalgamated matrix (graph of A) - // with same distribution over all procs as row map of A - RCP nodeMap = amalInfo->getNodeRowMap(); - GetOStream(Statistics1) << "CoalesceDropFactory: nodeMap " << nodeMap->getLocalNumElements() << "/" << nodeMap->getGlobalNumElements() << " elements" << std::endl; - - // 3) create graph of amalgamated matrix - RCP crsGraph = CrsGraphFactory::Build(nodeMap, A->getLocalMaxNumRowEntries() * blockdim); - - LO numRows = A->getRowMap()->getLocalNumElements(); - LO numNodes = nodeMap->getLocalNumElements(); - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numNodes); - Kokkos::deep_copy(amalgBoundaryNodes, false); - const ArrayRCP numberDirichletRowsPerNode(numNodes, 0); // helper array counting the number of Dirichlet nodes associated with node - bool bIsDiagonalEntry = false; // boolean flag stating that grid==gcid - - // 4) do amalgamation. generate graph of amalgamated matrix - // Note, this code is much more inefficient than the leightwight implementation - // Most of the work has already been done in the AmalgamationFactory - for (LO row = 0; row < numRows; row++) { - // get global DOF id - GO grid = rowMap->getGlobalElement(row); - - // reinitialize boolean helper variable - bIsDiagonalEntry = false; - - // translate grid to nodeid - GO nodeId = AmalgamationFactory::DOFGid2NodeId(grid, blockdim, offset, indexBase); - - size_t nnz = A->getNumEntriesInLocalRow(row); - Teuchos::ArrayView indices; - Teuchos::ArrayView vals; - A->getLocalRowView(row, indices, vals); - - RCP> cnodeIds = Teuchos::rcp(new std::vector); // global column block ids - LO realnnz = 0; - for (LO col = 0; col < Teuchos::as(nnz); col++) { - GO gcid = colMap->getGlobalElement(indices[col]); // global column id - - if (vals[col] != STS::zero()) { - GO cnodeId = AmalgamationFactory::DOFGid2NodeId(gcid, blockdim, offset, indexBase); - cnodeIds->push_back(cnodeId); - realnnz++; // increment number of nnz in matrix row - if (grid == gcid) bIsDiagonalEntry = true; - } - } - - if (realnnz == 1 && bIsDiagonalEntry == true) { - LO lNodeId = nodeMap->getLocalElement(nodeId); - numberDirichletRowsPerNode[lNodeId] += 1; // increment Dirichlet row counter associated with lNodeId - if (numberDirichletRowsPerNode[lNodeId] == blockdim) // mark full Dirichlet nodes - amalgBoundaryNodes[lNodeId] = true; - } - - Teuchos::ArrayRCP arr_cnodeIds = Teuchos::arcp(cnodeIds); - - if (arr_cnodeIds.size() > 0) - crsGraph->insertGlobalIndices(nodeId, arr_cnodeIds()); - } - // fill matrix graph - crsGraph->fillComplete(nodeMap, nodeMap); - - // 5) create MueLu Graph object - RCP graph = rcp(new LWGraph(crsGraph, "amalgamated graph of A")); - - // Detect and record rows that correspond to Dirichlet boundary conditions - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - - // 6) store results in Level - // graph->SetBoundaryNodeMap(gBoundaryNodeMap); - Set(currentLevel, "DofsPerNode", blockdim); - Set(currentLevel, "Graph", graph); - - } // if (doExperimentalWrap) ... else ... - -} // Build - -template -void CoalesceDropFactory::BuildKokkos(Level& currentLevel) const { - FactoryMonitor m(*this, "BuildKokkos", currentLevel); - - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType real_type; - typedef Xpetra::MultiVector RealValuedMultiVector; - typedef Xpetra::MultiVectorFactory RealValuedMultiVectorFactory; - - if (predrop_ != Teuchos::null) - GetOStream(Parameters0) << predrop_->description(); - - RCP realA = Get>(currentLevel, "A"); - RCP amalInfo = Get>(currentLevel, "UnAmalgamationInfo"); - const ParameterList& pL = GetParameterList(); - bool doExperimentalWrap = pL.get("lightweight wrap"); - - GetOStream(Parameters0) << "lightweight wrap = " << doExperimentalWrap << std::endl; - std::string algo = pL.get("aggregation: drop scheme"); - const bool aggregationMayCreateDirichlet = pL.get("aggregation: dropping may create Dirichlet"); - - RCP Coords; - RCP A; - - bool use_block_algorithm = false; - LO interleaved_blocksize = as(pL.get("aggregation: block diagonal: interleaved blocksize")); - bool useSignedClassicalRS = false; - bool useSignedClassicalSA = false; - bool generateColoringGraph = false; - - // NOTE: If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it - // in the block diagonalization). So we'll clobber the rowSumTol with -1.0 in this case - typename STS::magnitudeType rowSumTol = as(pL.get("aggregation: row sum drop tol")); - - RCP ghostedBlockNumber; - ArrayRCP g_block_id; - - if (algo == "distance laplacian") { - // Grab the coordinates for distance laplacian - Coords = Get>(currentLevel, "Coordinates"); - A = realA; - } else if (algo == "signed classical sa") { - useSignedClassicalSA = true; - algo = "classical"; - A = realA; - } else if (algo == "signed classical" || algo == "block diagonal colored signed classical" || algo == "block diagonal signed classical") { - useSignedClassicalRS = true; - // if(realA->GetFixedBlockSize() > 1) { - RCP BlockNumber = Get>(currentLevel, "BlockNumber"); - // Ghost the column block numbers if we need to - RCP importer = realA->getCrsGraph()->getImporter(); - if (!importer.is_null()) { - SubFactoryMonitor m1(*this, "Block Number import", currentLevel); - ghostedBlockNumber = Xpetra::VectorFactory::Build(importer->getTargetMap()); - ghostedBlockNumber->doImport(*BlockNumber, *importer, Xpetra::INSERT); - } else { - ghostedBlockNumber = BlockNumber; - } - g_block_id = ghostedBlockNumber->getData(0); - // } - if (algo == "block diagonal colored signed classical") - generateColoringGraph = true; - algo = "classical"; - A = realA; - - } else if (algo == "block diagonal") { - // Handle the "block diagonal" filtering and then leave - BlockDiagonalize(currentLevel, realA, false); - return; - } else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian") { - // Handle the "block diagonal" filtering, and then continue onward - use_block_algorithm = true; - RCP filteredMatrix = BlockDiagonalize(currentLevel, realA, true); - if (algo == "block diagonal distance laplacian") { - // We now need to expand the coordinates by the interleaved blocksize - RCP OldCoords = Get>(currentLevel, "Coordinates"); - if (OldCoords->getLocalLength() != realA->getLocalNumRows()) { - LO dim = (LO)OldCoords->getNumVectors(); - Coords = RealValuedMultiVectorFactory::Build(realA->getRowMap(), dim); - for (LO k = 0; k < dim; k++) { - ArrayRCP old_vec = OldCoords->getData(k); - ArrayRCP new_vec = Coords->getDataNonConst(k); - for (LO i = 0; i < (LO)OldCoords->getLocalLength(); i++) { - LO new_base = i * dim; - for (LO j = 0; j < interleaved_blocksize; j++) - new_vec[new_base + j] = old_vec[i]; - } - } - } else { - Coords = OldCoords; - } - algo = "distance laplacian"; - } else if (algo == "block diagonal classical") { - algo = "classical"; - } - // All cases - A = filteredMatrix; - rowSumTol = -1.0; - } else { - A = realA; - } - - // Distance Laplacian weights - Array dlap_weights = pL.get>("aggregation: distance laplacian directional weights"); - enum { NO_WEIGHTS = 0, - SINGLE_WEIGHTS, - BLOCK_WEIGHTS }; - int use_dlap_weights = NO_WEIGHTS; - if (algo == "distance laplacian") { - LO dim = (LO)Coords->getNumVectors(); - // If anything isn't 1.0 we need to turn on the weighting - bool non_unity = false; - for (LO i = 0; !non_unity && i < (LO)dlap_weights.size(); i++) { - if (dlap_weights[i] != 1.0) { - non_unity = true; - } - } - if (non_unity) { - LO blocksize = use_block_algorithm ? as(pL.get("aggregation: block diagonal: interleaved blocksize")) : 1; - if ((LO)dlap_weights.size() == dim) - use_dlap_weights = SINGLE_WEIGHTS; - else if ((LO)dlap_weights.size() == blocksize * dim) - use_dlap_weights = BLOCK_WEIGHTS; - else { - TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, - "length of 'aggregation: distance laplacian directional weights' must equal the coordinate dimension OR the coordinate dimension times the blocksize"); - } - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Using distance laplacian weights: " << dlap_weights << std::endl; - } - } - - // decide wether to use the fast-track code path for standard maps or the somewhat slower - // code path for non-standard maps - /*bool bNonStandardMaps = false; - if (A->IsView("stridedMaps") == true) { - Teuchos::RCP myMap = A->getRowMap("stridedMaps"); - Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); - if (strMap->getStridedBlockId() != -1 || strMap->getOffset() > 0) - bNonStandardMaps = true; - }*/ - - if (doExperimentalWrap) { - TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm"); - TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)"); - - SC threshold; - // If we're doing the ML-style halving of the drop tol at each level, we do that here. - if (pL.get("aggregation: use ml scaling of drop tol")) - threshold = pL.get("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID()); - else - threshold = as(pL.get("aggregation: drop tol")); - - std::string distanceLaplacianAlgoStr = pL.get("aggregation: distance laplacian algo"); - std::string classicalAlgoStr = pL.get("aggregation: classical algo"); - real_type realThreshold = STS::magnitude(threshold); // CMS: Rename this to "magnitude threshold" sometime - - //////////////////////////////////////////////////// - // Remove this bit once we are confident that cut-based dropping works. -#ifdef HAVE_MUELU_DEBUG - int distanceLaplacianCutVerbose = 0; -#endif -#ifdef DJS_READ_ENV_VARIABLES - if (getenv("MUELU_DROP_TOLERANCE_MODE")) { - distanceLaplacianAlgoStr = std::string(getenv("MUELU_DROP_TOLERANCE_MODE")); - } - - if (getenv("MUELU_DROP_TOLERANCE_THRESHOLD")) { - auto tmp = atoi(getenv("MUELU_DROP_TOLERANCE_THRESHOLD")); - realThreshold = 1e-4 * tmp; - } - -#ifdef HAVE_MUELU_DEBUG - if (getenv("MUELU_DROP_TOLERANCE_VERBOSE")) { - distanceLaplacianCutVerbose = atoi(getenv("MUELU_DROP_TOLERANCE_VERBOSE")); - } -#endif -#endif - //////////////////////////////////////////////////// - - enum decisionAlgoType { defaultAlgo, - unscaled_cut, - scaled_cut, - scaled_cut_symmetric }; - - decisionAlgoType distanceLaplacianAlgo = defaultAlgo; - decisionAlgoType classicalAlgo = defaultAlgo; - if (algo == "distance laplacian") { - if (distanceLaplacianAlgoStr == "default") - distanceLaplacianAlgo = defaultAlgo; - else if (distanceLaplacianAlgoStr == "unscaled cut") - distanceLaplacianAlgo = unscaled_cut; - else if (distanceLaplacianAlgoStr == "scaled cut") - distanceLaplacianAlgo = scaled_cut; - else if (distanceLaplacianAlgoStr == "scaled cut symmetric") - distanceLaplacianAlgo = scaled_cut_symmetric; - else - TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" << distanceLaplacianAlgoStr << "\""); - GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - } else if (algo == "classical") { - if (classicalAlgoStr == "default") - classicalAlgo = defaultAlgo; - else if (classicalAlgoStr == "unscaled cut") - classicalAlgo = unscaled_cut; - else if (classicalAlgoStr == "scaled cut") - classicalAlgo = scaled_cut; - else - TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: classical algo\" must be one of (default|unscaled cut|scaled cut), not \"" << classicalAlgoStr << "\""); - GetOStream(Runtime0) << "algorithm = \"" << algo << "\" classical algorithm = \"" << classicalAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - - } else - GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - Set(currentLevel, "Filtering", (threshold != STS::zero())); - - const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as(pL.get("aggregation: Dirichlet threshold"))); - - // NOTE: We don't support signed classical RS or SA with cut drop at present - TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalRS && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical aggregation"); - TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalSA && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical sa aggregation"); - - GO numDropped = 0, numTotal = 0; - std::string graphType = "unamalgamated"; // for description purposes only - - /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme. - BlockSize is the number of storage blocks that must kept together during the amalgamation process. - - Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold: - - numPDEs = BlockSize * storageblocksize. - - If numPDEs==1 - Matrix is point storage (classical CRS storage). storageblocksize=1 and BlockSize=1 - No other values makes sense. - - If numPDEs>1 - If matrix uses point storage, then storageblocksize=1 and BlockSize=numPDEs. - If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n. - Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested. - */ - TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); - const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); - - /************************** RS or SA-style Classical Dropping (and variants) **************************/ - if (algo == "classical") { - if (predrop_ == null) { - // ap: this is a hack: had to declare predrop_ as mutable - predrop_ = rcp(new PreDropFunctionConstVal(threshold)); - } - - if (predrop_ != null) { - RCP predropConstVal = rcp_dynamic_cast(predrop_); - TEUCHOS_TEST_FOR_EXCEPTION(predropConstVal == Teuchos::null, Exceptions::BadCast, - "MueLu::CoalesceFactory::Build: cast to PreDropFunctionConstVal failed."); - // If a user provided a predrop function, it overwrites the XML threshold parameter - SC newt = predropConstVal->GetThreshold(); - if (newt != threshold) { - GetOStream(Warnings0) << "switching threshold parameter from " << threshold << " (list) to " << newt << " (user function" << std::endl; - threshold = newt; - } - } - // At this points we either have - // (predrop_ != null) - // Therefore, it is sufficient to check only threshold - if (BlockSize == 1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) { - // Case 1: scalar problem, no dropping => just use matrix graph - RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); - // Detect and record rows that correspond to Dirichlet boundary conditions - auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); - - graph->SetBoundaryNodeMap(boundaryNodes); - numTotal = A->getLocalNumEntries(); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < boundaryNodes.size(); ++i) - if (boundaryNodes[i]) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "DofsPerNode", 1); - Set(currentLevel, "Graph", graph); - - } else if ((BlockSize == 1 && threshold != STS::zero()) || - (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) || - (BlockSize == 1 && useSignedClassicalRS) || - (BlockSize == 1 && useSignedClassicalSA)) { - // Case 2: scalar problem with dropping => record the column indices of undropped entries, but still use original - // graph's map information, e.g., whether index is local - // OR a matrix without a CrsGraph - - // allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", A->getLocalNumRows() + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - - using MT = typename STS::magnitudeType; - RCP ghostedDiag; - ArrayRCP ghostedDiagVals; - ArrayRCP negMaxOffDiagonal; - // RS style needs the max negative off-diagonal, SA style needs the diagonal - if (useSignedClassicalRS) { - if (ghostedBlockNumber.is_null()) { - negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A); - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Calculated max point off-diagonal" << std::endl; - } else { - negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A, *ghostedBlockNumber); - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl; - } - } else { - ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - ghostedDiagVals = ghostedDiag->getData(0); - } - auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) { - if (ghostedBlockNumber.is_null()) { - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl; - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); - } else { - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Applying block row sum criterion." << std::endl; - Utilities::ApplyRowSumCriterionHost(*A, *ghostedBlockNumber, rowSumTol, boundaryNodes); - } - } - - LO realnnz = 0; - rows(0) = 0; - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - size_t nnz = A->getNumEntriesInLocalRow(row); - bool rowIsDirichlet = boundaryNodes[row]; - ArrayView indices; - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - if (classicalAlgo == defaultAlgo) { - // FIXME the current predrop function uses the following - // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) - // FIXME but the threshold doesn't take into account the rows' diagonal entries - // FIXME For now, hardwiring the dropping in here - - LO rownnz = 0; - if (useSignedClassicalRS) { - // Signed classical RS style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); - MT neg_aij = -STS::real(vals[colID]); - /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], - g_block_id.is_null() ? -1 : g_block_id[row], - g_block_id.is_null() ? -1 : g_block_id[col], - neg_aij, max_neg_aik);*/ - if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { - columns[realnnz++] = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } else if (useSignedClassicalSA) { - // Signed classical SA style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - - bool is_nonpositive = STS::real(vals[colID]) <= 0; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 - /* - if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], - vals[colID],aij, aiiajj); - */ - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows[row + 1] = realnnz; - } else { - // Standard abs classical - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } - } else { - /* Cut Algorithm */ - // CMS - using DropTol = Details::DropTol; - std::vector drop_vec; - drop_vec.reserve(nnz); - const real_type zero = Teuchos::ScalarTraits::zero(); - const real_type one = Teuchos::ScalarTraits::one(); - LO rownnz = 0; - // NOTE: This probably needs to be fixed for rowsum - - // find magnitudes - for (LO colID = 0; colID < (LO)nnz; colID++) { - LO col = indices[colID]; - if (row == col) { - drop_vec.emplace_back(zero, one, colID, false); - continue; - } - - // Don't aggregate boundaries - if (boundaryNodes[colID]) continue; - typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - typename STS::magnitudeType aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - drop_vec.emplace_back(aij, aiiajj, colID, false); - } - - const size_t n = drop_vec.size(); - - if (classicalAlgo == unscaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val > b.val; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val; - auto b = y.val; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } else if (classicalAlgo == scaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; - }); - bool drop = false; - // printf("[%d] Scaled Cut: ",(int)row); - // printf("%3d(%4s) ",indices[drop_vec[0].col],"keep"); - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if (a > realThreshold * b) { - drop = true; - -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - // printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep"); - } - drop_vec[i].drop = drop; - } - // printf("\n"); - } - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.col < b.col; - }); - - for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { - LO col = indices[drop_vec[idxID].col]; - // don't drop diagonal - if (row == col) { - columns[realnnz++] = col; - rownnz++; - continue; - } - - if (!drop_vec[idxID].drop) { - columns[realnnz++] = col; - rownnz++; - } else { - numDropped++; - } - } - // CMS - rows[row + 1] = realnnz; - } - } // end for row + } numTotal = A->getLocalNumEntries(); @@ -3285,7 +1655,7 @@ void CoalesceDropFactory::BuildKokkos } // if (doExperimentalWrap) ... else ... -} // BuildKokkos +} // Build template void CoalesceDropFactory::MergeRows(const Matrix& A, const LO row, Array& cols, const Array& translation) const { From 7c78025bf66884fd8b788bac2ae352e871241e7d Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Wed, 24 Jul 2024 17:51:23 -0600 Subject: [PATCH 4/7] MueLu: std::complex Replaced With Kokkos::complex Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 475 +++++++++--------- 1 file changed, 241 insertions(+), 234 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index da606ab20ff6..ad5895e2e41b 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -475,10 +475,10 @@ void CoalesceDropFactory::Build(Level typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); using MT = typename STS::magnitudeType; - RCP ghostedDiag; + RCP ghostedDiag; ArrayRCP ghostedDiagVals; ArrayRCP negMaxOffDiagonal; - // RS style needs the max negative off-diagonal, SA style needs the diagonal + // RS style needs the max negative off-diagonal, SA style needs the diagonal if (useSignedClassicalRS) { if (ghostedBlockNumber.is_null()) { negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A); @@ -491,10 +491,12 @@ void CoalesceDropFactory::Build(Level } } else { ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - ghostedDiagVals = ghostedDiag->getData(0); - } + if(classicalAlgo == defaultAlgo) { + ghostedDiagVals = ghostedDiag->getData(0); + } + } auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) { + if (rowSumTol > 0.) { if (ghostedBlockNumber.is_null()) { if (GetVerbLevel() & Statistics1) GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl; @@ -508,234 +510,239 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; - if(classicalAlgo == defaultAlgo) { - SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - size_t nnz = A->getNumEntriesInLocalRow(row); - bool rowIsDirichlet = boundaryNodes[row]; - ArrayView indices; - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - // FIXME the current predrop function uses the following - // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) - // FIXME but the threshold doesn't take into account the rows' diagonal entries - // FIXME For now, hardwiring the dropping in here - - LO rownnz = 0; - if (useSignedClassicalRS) { - // Signed classical RS style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); - MT neg_aij = -STS::real(vals[colID]); - /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], - g_block_id.is_null() ? -1 : g_block_id[row], - g_block_id.is_null() ? -1 : g_block_id[col], - neg_aij, max_neg_aik);*/ - if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { - columns[realnnz++] = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } else if (useSignedClassicalSA) { - // Signed classical SA style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - - bool is_nonpositive = STS::real(vals[colID]) <= 0; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 - /* - if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], - vals[colID],aij, aiiajj); - */ - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows[row + 1] = realnnz; - } else { - // Standard abs classical - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } - } // end for row - } - else { - SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); - using ExecSpace = typename Node::execution_space; - using TeamPol = Kokkos::TeamPolicy; - using TeamMem = typename TeamPol::member_type; - - //move from host to device - ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); - Kokkos::View ghostedDiagValsView = Kokkos::Compat::getKokkosViewDeepCopy(ghostedDiagValsArrayView); - auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); - - auto At = Utilities::Op2TpetraCrs(A); - auto A_device = At->getLocalMatrixDevice(); - - int algorithm = classicalAlgo; - Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); - auto index_views = Kokkos::View("index_views", A_device.nnz()); - - Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { - LO row = teamMember.league_rank(); - auto rowView = A_device.row(row); - size_t nnz = rowView.length; - - size_t n = 0; - auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - - //find magnitudes - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { - index_view(colID) = colID; - LO col = rowView.colidx(colID); - //ignore diagonals for now, they are checked again later - if(row == col) { - drop_view(colID) = true; - count++; - } - //Don't aggregate boundaries - else if(boundaryNodesDevice(colID)) { - drop_view(colID) = true; - } - else { - drop_view(colID) = false; - count++; - } - }, n); - - size_t dropStart = n; - if (algorithm == unscaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - return x_aij > y_aij; - } - }); - - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename STS::magnitudeType x_aij = 0; - typename STS::magnitudeType y_aij = 0; - if(!drop_view(x)) { - x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - } - if(!drop_view(y)) { - y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - } - - if(x_aij > realThreshold * y_aij) { - if(i < min) { - min = i; - } - } - }, Kokkos::Min(dropStart)); - } else if (algorithm == scaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); - return x_aij / x_aiiajj > y_aij / y_aiiajj; - } - }); - - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename STS::magnitudeType x_val = 0; - typename STS::magnitudeType y_val = 0; - if(!drop_view(x)) { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); - x_val = x_aij / x_aiiajj; - } - if(!drop_view(y)) { - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); - y_val = y_aij / y_aiiajj; - } - - if(x_val > realThreshold * y_val) { - if(i < min) { - min = i; - } - } - }, Kokkos::Min(dropStart)); - } - - //drop everything to the right of where values stop passing threshold - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(index_view(i)) = true; - }); - } - - LO rownnz = 0; - GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { - LO col = rowView.colidx(idxID); - //don't drop diagonal - if(row == col || !drop_view(idxID)) { - keep++; - } - else { - rowView.colidx(idxID) = -1; - drop++; - } - }, rownnz, rowDropped); - - globalnnz += rownnz; - totalDropped += rowDropped; - rownnzView(row) = rownnz; - }, realnnz, numDropped); - - //update column indices so that kept indices are aligned to the left for subview that happens later on - auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); - Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); - Kokkos::deep_copy(columns, columnsDevice); - - //update row indices by adding up new # of nnz in each row - auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); - Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { - partial_sum += rownnzView(i); - if(is_final) rowsDevice(i+1) = partial_sum; - }); - Kokkos::deep_copy(rows, rowsDevice); - } + if(classicalAlgo == defaultAlgo) { + SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + size_t nnz = A->getNumEntriesInLocalRow(row); + bool rowIsDirichlet = boundaryNodes[row]; + ArrayView indices; + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + // FIXME the current predrop function uses the following + // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) + // FIXME but the threshold doesn't take into account the rows' diagonal entries + // FIXME For now, hardwiring the dropping in here + + LO rownnz = 0; + if (useSignedClassicalRS) { + // Signed classical RS style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); + MT neg_aij = -STS::real(vals[colID]); + /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], + g_block_id.is_null() ? -1 : g_block_id[row], + g_block_id.is_null() ? -1 : g_block_id[col], + neg_aij, max_neg_aik);*/ + if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { + columns[realnnz++] = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } else if (useSignedClassicalSA) { + // Signed classical SA style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + + bool is_nonpositive = STS::real(vals[colID]) <= 0; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 + /* + if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], + vals[colID],aij, aiiajj); + */ + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows[row + 1] = realnnz; + } else { + // Standard abs classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } + } // end for row + } + else { + /* Cut Algorithm */ + SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); + using ExecSpace = typename Node::execution_space; + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + using ATS = Kokkos::ArithTraits; + using impl_scalar_type = typename ATS::val_type; + using implATS = Kokkos::ArithTraits; + + //move from host to device + auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); + auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); + auto thresholdKokkos = static_cast(threshold); + auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); + + auto At = Utilities::Op2TpetraCrs(A); + auto A_device = At->getLocalMatrixDevice(); + + int algorithm = classicalAlgo; + Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + auto index_views = Kokkos::View("index_views", A_device.nnz()); + + Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { + LO row = teamMember.league_rank(); + auto rowView = A_device.row(row); + size_t nnz = rowView.length; + + size_t n = 0; + auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + + //find magnitudes + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { + index_view(colID) = colID; + LO col = rowView.colidx(colID); + //ignore diagonals for now, they are checked again later + if(row == col) { + drop_view(colID) = true; + count++; + } + //Don't aggregate boundaries + else if(boundaryNodesDevice(colID)) { + drop_view(colID) = true; + } + else { + drop_view(colID) = false; + count++; + } + }, n); + + size_t dropStart = n; + if (algorithm == unscaled_cut) { + //push diagonals and boundaries to the right, sort everything else by aij on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } + else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + return x_aij > y_aij; + } + }); + + //find index where dropping starts + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_aij = 0; + typename implATS::magnitudeType y_aij = 0; + if(!drop_view(x)) { + x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + } + if(!drop_view(y)) { + y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + } + + if(x_aij > realThresholdKokkos * y_aij) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + } else if (algorithm == scaled_cut) { + //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } + else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + auto x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + auto y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); + } + }); + + //find index where dropping starts + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_val = 0; + typename implATS::magnitudeType y_val = 0; + if(!drop_view(x)) { + typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + typename implATS::magnitudeType x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + x_val = x_aij / x_aiiajj; + } + if(!drop_view(y)) { + typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + typename implATS::magnitudeType y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + y_val = y_aij / y_aiiajj; + } + + if(x_val > realThresholdKokkos * y_val) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + } + + //drop everything to the right of where values stop passing threshold + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(index_view(i)) = true; + }); + } + + LO rownnz = 0; + GO rowDropped = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { + LO col = rowView.colidx(idxID); + //don't drop diagonal + if(row == col || !drop_view(idxID)) { + keep++; + } + else { + rowView.colidx(idxID) = -1; + drop++; + } + }, rownnz, rowDropped); + + globalnnz += rownnz; + totalDropped += rowDropped; + rownnzView(row) = rownnz; + }, realnnz, numDropped); + + //update column indices so that kept indices are aligned to the left for subview that happens later on + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); + Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); + Kokkos::deep_copy(columns, columnsDevice); + + //update row indices by adding up new # of nnz in each row + auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); + Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { + partial_sum += rownnzView(i); + if(is_final) rowsDevice(i+1) = partial_sum; + }); + Kokkos::deep_copy(rows, rowsDevice); + } numTotal = A->getLocalNumEntries(); @@ -1655,7 +1662,7 @@ void CoalesceDropFactory::Build(Level } // if (doExperimentalWrap) ... else ... -} // Build +} // Build template void CoalesceDropFactory::MergeRows(const Matrix& A, const LO row, Array& cols, const Array& translation) const { From 4dff65a96640d6473d88c9dc0282bb0441f77c77 Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Thu, 15 Aug 2024 12:40:22 -0600 Subject: [PATCH 5/7] MueLu: Code Review Fixes Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index ad5895e2e41b..0431bf011541 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -117,6 +117,11 @@ struct DropTol { }; } // namespace Details +enum decisionAlgoType { defaultAlgo, + unscaled_cut, + scaled_cut, + scaled_cut_symmetric }; + template RCP CoalesceDropFactory::GetValidParameterList() const { RCP validParamList = rcp(new ParameterList()); @@ -354,11 +359,6 @@ void CoalesceDropFactory::Build(Level #endif //////////////////////////////////////////////////// - enum decisionAlgoType { defaultAlgo, - unscaled_cut, - scaled_cut, - scaled_cut_symmetric }; - decisionAlgoType distanceLaplacianAlgo = defaultAlgo; decisionAlgoType classicalAlgo = defaultAlgo; if (algo == "distance laplacian") { @@ -591,24 +591,24 @@ void CoalesceDropFactory::Build(Level //move from host to device auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); - auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); + auto boundaryNodesDevice = Kokkos::create_mirror_view_and_copy(ExecSpace(), boundaryNodes); auto thresholdKokkos = static_cast(threshold); auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); auto At = Utilities::Op2TpetraCrs(A); auto A_device = At->getLocalMatrixDevice(); - int algorithm = classicalAlgo; Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); auto drop_views = Kokkos::View("drop_views", A_device.nnz()); auto index_views = Kokkos::View("index_views", A_device.nnz()); Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { LO row = teamMember.league_rank(); - auto rowView = A_device.row(row); + auto rowView = A_device.rowConst(row); size_t nnz = rowView.length; - size_t n = 0; + size_t dropSize = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); @@ -629,10 +629,10 @@ void CoalesceDropFactory::Build(Level drop_view(colID) = false; count++; } - }, n); + }, dropSize); - size_t dropStart = n; - if (algorithm == unscaled_cut) { + size_t dropStart = dropSize; + if (classicalAlgo == unscaled_cut) { //push diagonals and boundaries to the right, sort everything else by aij on the left Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { if(drop_view(x) || drop_view(y)) { @@ -646,7 +646,7 @@ void CoalesceDropFactory::Build(Level }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_aij = 0; @@ -664,7 +664,7 @@ void CoalesceDropFactory::Build(Level } } }, Kokkos::Min(dropStart)); - } else if (algorithm == scaled_cut) { + } else if (classicalAlgo == scaled_cut) { //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { if(drop_view(x) || drop_view(y)) { @@ -680,7 +680,7 @@ void CoalesceDropFactory::Build(Level }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_val = 0; @@ -705,22 +705,23 @@ void CoalesceDropFactory::Build(Level } //drop everything to the right of where values stop passing threshold - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + if(dropStart < dropSize) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, dropSize), [=](size_t i) { drop_view(index_view(i)) = true; }); } LO rownnz = 0; GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, dropSize), [=](const size_t idxID, LO& keep, GO& drop) { LO col = rowView.colidx(idxID); //don't drop diagonal if(row == col || !drop_view(idxID)) { + columnsDevice(A_device.graph.row_map(row) + idxID) = col; keep++; } else { - rowView.colidx(idxID) = -1; + columnsDevice(A_device.graph.row_map(row) + idxID) = -1; drop++; } }, rownnz, rowDropped); @@ -731,7 +732,6 @@ void CoalesceDropFactory::Build(Level }, realnnz, numDropped); //update column indices so that kept indices are aligned to the left for subview that happens later on - auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); Kokkos::deep_copy(columns, columnsDevice); From d88fa994bdb626eb4dac0026e6be601f9a62f03c Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Fri, 23 Aug 2024 19:21:52 -0600 Subject: [PATCH 6/7] MueLu: Fixing Issue #13377 and #13378 Issues listed above have been addressed. Threshold has been redefined to 1/threshold. Unit tests have been modified to be more thorough. Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 60 +++--- .../test/unit_tests/CoalesceDropFactory.cpp | 178 +++++++++++++++--- 2 files changed, 187 insertions(+), 51 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 0431bf011541..1f9961289cb0 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -591,13 +591,27 @@ void CoalesceDropFactory::Build(Level //move from host to device auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); - auto boundaryNodesDevice = Kokkos::create_mirror_view_and_copy(ExecSpace(), boundaryNodes); auto thresholdKokkos = static_cast(threshold); auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); - auto At = Utilities::Op2TpetraCrs(A); - auto A_device = At->getLocalMatrixDevice(); + auto A_device = A->getLocalMatrixDevice(); + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + RCP importer = A->getCrsGraph()->getImporter(); + RCP boundaryNodesVector = Xpetra::VectorFactory::Build(graph->GetDomainMap()); + RCP boundaryColumnVector; + for(size_t i = 0; i < graph->GetNodeNumVertices(); i++) { + boundaryNodesVector->getDataNonConst(0)[i] = boundaryNodes[i]; + } + if(!importer.is_null()) { + boundaryColumnVector = Xpetra::VectorFactory::Build(graph->GetImportMap()); + boundaryColumnVector->doImport(*boundaryNodesVector, *importer, Xpetra::INSERT); + } + else { + boundaryColumnVector = boundaryNodesVector; + } + auto boundaryColumn = boundaryColumnVector->getDeviceLocalView(Xpetra::Access::ReadOnly); + auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0); Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); auto drop_views = Kokkos::View("drop_views", A_device.nnz()); @@ -608,30 +622,24 @@ void CoalesceDropFactory::Build(Level auto rowView = A_device.rowConst(row); size_t nnz = rowView.length; - size_t dropSize = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); //find magnitudes - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) { index_view(colID) = colID; LO col = rowView.colidx(colID); //ignore diagonals for now, they are checked again later - if(row == col) { - drop_view(colID) = true; - count++; - } //Don't aggregate boundaries - else if(boundaryNodesDevice(colID)) { + if(row == col || boundary(col)) { drop_view(colID) = true; } else { drop_view(colID) = false; - count++; } - }, dropSize); + }); - size_t dropStart = dropSize; + size_t dropStart = nnz; if (classicalAlgo == unscaled_cut) { //push diagonals and boundaries to the right, sort everything else by aij on the left Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { @@ -646,7 +654,7 @@ void CoalesceDropFactory::Build(Level }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_aij = 0; @@ -658,7 +666,7 @@ void CoalesceDropFactory::Build(Level y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); } - if(x_aij > realThresholdKokkos * y_aij) { + if(realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) { if(i < min) { min = i; } @@ -673,30 +681,30 @@ void CoalesceDropFactory::Build(Level else { auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - auto x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); - auto y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); } }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_val = 0; typename implATS::magnitudeType y_val = 0; if(!drop_view(x)) { typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - typename implATS::magnitudeType x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); x_val = x_aij / x_aiiajj; } if(!drop_view(y)) { typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - typename implATS::magnitudeType y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); y_val = y_aij / y_aiiajj; } - if(x_val > realThresholdKokkos * y_val) { + if(realThresholdKokkos * realThresholdKokkos * x_val > y_val) { if(i < min) { min = i; } @@ -705,15 +713,15 @@ void CoalesceDropFactory::Build(Level } //drop everything to the right of where values stop passing threshold - if(dropStart < dropSize) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, dropSize), [=](size_t i) { + if(dropStart < nnz) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) { drop_view(index_view(i)) = true; }); } LO rownnz = 0; GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, dropSize), [=](const size_t idxID, LO& keep, GO& drop) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) { LO col = rowView.colidx(idxID); //don't drop diagonal if(row == col || !drop_view(idxID)) { @@ -1381,7 +1389,7 @@ void CoalesceDropFactory::Build(Level auto const& y = drop_vec[i]; auto a = x.val; auto b = y.val; - if (a > realThreshold * b) { + if (realThreshold * realThreshold * a > b) { drop = true; #ifdef HAVE_MUELU_DEBUG if (distanceLaplacianCutVerbose) { @@ -1404,7 +1412,7 @@ void CoalesceDropFactory::Build(Level auto const& y = drop_vec[i]; auto a = x.val / x.diag; auto b = y.val / y.diag; - if (a > realThreshold * b) { + if (realThreshold * realThreshold * a > b) { drop = true; #ifdef HAVE_MUELU_DEBUG if (distanceLaplacianCutVerbose) { diff --git a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp index e8902b178708..0073ca7e9bfb 100644 --- a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp +++ b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp @@ -1223,7 +1223,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianScaledCu // L_ij = -36 // L_ii = 72 // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj| - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian"))); coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("scaled cut"))); fineLevel.Request("Graph", &coalesceDropFact); @@ -1289,7 +1289,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianUnscaled // L_ij = -36 // L_ii = 72 // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj| - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian"))); coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("unscaled cut"))); fineLevel.Request("Graph", &coalesceDropFact); @@ -1355,7 +1355,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianCutSym, // L_ij = -36 // L_ii = 72 // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj| - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian"))); coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("scaled cut symmetric"))); fineLevel.Request("Graph", &coalesceDropFact); @@ -1389,6 +1389,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType real_type; typedef Xpetra::MultiVector RealValuedMultiVector; + typedef Tpetra::Map map_type; + typedef Tpetra::CrsMatrix crs_matrix_type; MUELU_TESTING_SET_OSTREAM; MUELU_TESTING_LIMIT_SCOPE(Scalar, GlobalOrdinal, Node); @@ -1399,11 +1401,41 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala Level fineLevel; TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); - RCP A = TestHelpers::TestFactory::Build1DPoisson(36); + const global_size_t globalIndices = 12; + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + RCP A_t(new crs_matrix_type(map, 5)); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); + const SC negOne = static_cast(-1.0); + for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + const GO gblRow = map->getGlobalElement(lclRow); + if(gblRow == 0) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); + } + else if(static_cast(gblRow) == globalIndices - 1) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); + } + else if(gblRow == 2 || gblRow == 9) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); + } + else if(gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } + else if(gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); + } + else { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); + } + } + A_t->fillComplete(); + RCP A_x = rcp(new TpetraCrsMatrix(A_t)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; - galeriList.set("nx", Teuchos::as(36)); + galeriList.set("nx", Teuchos::as(globalIndices)); RCP coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates("1D", A->getRowMap(), galeriList); fineLevel.Set("Coordinates", coordinates); @@ -1429,25 +1461,59 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(36 + (comm->getSize() - 1) * 2)); + TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myDomainMap->getGlobalNumElements(), 36); - - TEST_EQUALITY(graph->GetGlobalNumEdges(), 72); - -} // SignaledClassical + TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); + + TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); + + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int columns[28] = {0, 1, + 0, 1, + 2, + 3, 4, + 3, 4, 5, + 3, 4, 5, 6, 7, + 5, 6, 7, + 6, 7, 8, + 7, 8, + 9, + 10, 11, + 10, 11}; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; + TEST_EQUALITY(rowPtrs(0), rowID); + for(size_t i = 0; i < rowPtrs.size()-1; i++) { + auto gblID = myDomainMap->getGlobalElement(i); + int rownnz = rows[gblID+1]-rows[gblID]; + rowID += rownnz; + TEST_EQUALITY(rowPtrs(i+1), rowID); + + std::vector colID; + for(int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + } + std::sort(std::begin(colID), std::end(colID)); + for(int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + } + } +} // ClassicalScaledCut TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Scalar, LocalOrdinal, GlobalOrdinal, Node) { #include typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType real_type; typedef Xpetra::MultiVector RealValuedMultiVector; + typedef Tpetra::Map map_type; + typedef Tpetra::CrsMatrix crs_matrix_type; MUELU_TESTING_SET_OSTREAM; MUELU_TESTING_LIMIT_SCOPE(Scalar, GlobalOrdinal, Node); @@ -1458,11 +1524,41 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca Level fineLevel; TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); - RCP A = TestHelpers::TestFactory::Build1DPoisson(36); + const global_size_t globalIndices = 12; + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + RCP A_t(new crs_matrix_type(map, 5)); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); + const SC negOne = static_cast(-1.0); + for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + const GO gblRow = map->getGlobalElement(lclRow); + if(gblRow == 0) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); + } + else if(static_cast(gblRow) == globalIndices - 1) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); + } + else if(gblRow == 2 || gblRow == 9) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); + } + else if(gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } + else if(gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); + } + else { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); + } + } + A_t->fillComplete(); + RCP A_x = rcp(new TpetraCrsMatrix(A_t)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; - galeriList.set("nx", Teuchos::as(36)); + galeriList.set("nx", Teuchos::as(globalIndices)); RCP coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates("1D", A->getRowMap(), galeriList); fineLevel.Set("Coordinates", coordinates); @@ -1488,19 +1584,51 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(36 + (comm->getSize() - 1) * 2)); + TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myDomainMap->getGlobalNumElements(), 36); - - TEST_EQUALITY(graph->GetGlobalNumEdges(), 72); - -} // SignaledClassical + TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); + + TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); + + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int columns[28] = {0, 1, + 0, 1, + 2, + 3, 4, + 3, 4, 5, + 3, 4, 5, 6, 7, + 5, 6, 7, + 6, 7, 8, + 7, 8, + 9, + 10, 11, + 10, 11}; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; + TEST_EQUALITY(rowPtrs(0), rowID); + for(size_t i = 0; i < rowPtrs.size()-1; i++) { + auto gblID = myDomainMap->getGlobalElement(i); + int rownnz = rows[gblID+1]-rows[gblID]; + rowID += rownnz; + TEST_EQUALITY(rowPtrs(i+1), rowID); + + std::vector colID; + for(int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + } + std::sort(std::begin(colID), std::end(colID)); + for(int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + } + } +} // ClassicalUnScaledCut TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, SignaledClassical, Scalar, LocalOrdinal, GlobalOrdinal, Node) { #include @@ -1902,7 +2030,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, BlockDiagonal, Scalar, Lo coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme); coalesceDropFact.SetFactory("UnAmalgamationInfo", amalgFact); coalesceDropFact.SetFactory("BlockNumber", ibFact); - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("block diagonal"))); coalesceDropFact.SetParameter("aggregation: block diagonal: interleaved blocksize", Teuchos::ParameterEntry(3)); coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme); @@ -1949,7 +2077,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, BlockDiagonalClassical, S coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme); coalesceDropFact.SetFactory("UnAmalgamationInfo", amalgFact); coalesceDropFact.SetFactory("BlockNumber", ibFact); - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("block diagonal classical"))); coalesceDropFact.SetParameter("aggregation: block diagonal: interleaved blocksize", Teuchos::ParameterEntry(3)); fineLevel.Request("Graph", &coalesceDropFact); From 26b3eac40a0dd818b7cb5950744c5fba53087005 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Mon, 4 Nov 2024 09:11:53 -0700 Subject: [PATCH 7/7] MueLu: Fix clang-format Signed-off-by: Christian Glusa --- .../MueLu_CoalesceDropFactory_def.hpp | 285 +++++++++--------- .../test/unit_tests/CoalesceDropFactory.cpp | 118 ++++---- 2 files changed, 198 insertions(+), 205 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 1f9961289cb0..e2bae01ffa21 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -490,8 +490,8 @@ void CoalesceDropFactory::Build(Level GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl; } } else { - ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - if(classicalAlgo == defaultAlgo) { + ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); + if (classicalAlgo == defaultAlgo) { ghostedDiagVals = ghostedDiag->getData(0); } } @@ -510,7 +510,7 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; - if(classicalAlgo == defaultAlgo) { + if (classicalAlgo == defaultAlgo) { SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { size_t nnz = A->getNumEntriesInLocalRow(row); @@ -578,177 +578,180 @@ void CoalesceDropFactory::Build(Level rows(row + 1) = realnnz; } } // end for row - } - else { + } else { /* Cut Algorithm */ SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); - using ExecSpace = typename Node::execution_space; - using TeamPol = Kokkos::TeamPolicy; - using TeamMem = typename TeamPol::member_type; - using ATS = Kokkos::ArithTraits; + using ExecSpace = typename Node::execution_space; + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + using ATS = Kokkos::ArithTraits; using impl_scalar_type = typename ATS::val_type; - using implATS = Kokkos::ArithTraits; + using implATS = Kokkos::ArithTraits; - //move from host to device + // move from host to device auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); - auto thresholdKokkos = static_cast(threshold); + auto thresholdKokkos = static_cast(threshold); auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); - auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); - auto A_device = A->getLocalMatrixDevice(); - RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); - RCP importer = A->getCrsGraph()->getImporter(); + auto A_device = A->getLocalMatrixDevice(); + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + RCP importer = A->getCrsGraph()->getImporter(); RCP boundaryNodesVector = Xpetra::VectorFactory::Build(graph->GetDomainMap()); RCP boundaryColumnVector; - for(size_t i = 0; i < graph->GetNodeNumVertices(); i++) { + for (size_t i = 0; i < graph->GetNodeNumVertices(); i++) { boundaryNodesVector->getDataNonConst(0)[i] = boundaryNodes[i]; } - if(!importer.is_null()) { + if (!importer.is_null()) { boundaryColumnVector = Xpetra::VectorFactory::Build(graph->GetImportMap()); boundaryColumnVector->doImport(*boundaryNodesVector, *importer, Xpetra::INSERT); - } - else { + } else { boundaryColumnVector = boundaryNodesVector; } auto boundaryColumn = boundaryColumnVector->getDeviceLocalView(Xpetra::Access::ReadOnly); - auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0); + auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0); - Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + Kokkos::View rownnzView("rownnzView", A_device.numRows()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); auto index_views = Kokkos::View("index_views", A_device.nnz()); - Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { - LO row = teamMember.league_rank(); - auto rowView = A_device.rowConst(row); - size_t nnz = rowView.length; - - auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - - //find magnitudes - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) { - index_view(colID) = colID; - LO col = rowView.colidx(colID); - //ignore diagonals for now, they are checked again later - //Don't aggregate boundaries - if(row == col || boundary(col)) { - drop_view(colID) = true; - } - else { - drop_view(colID) = false; - } - }); - - size_t dropStart = nnz; - if (classicalAlgo == unscaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - return x_aij > y_aij; - } - }); + Kokkos::parallel_reduce( + "classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { + LO row = teamMember.league_rank(); + auto rowView = A_device.rowConst(row); + size_t nnz = rowView.length; - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename implATS::magnitudeType x_aij = 0; - typename implATS::magnitudeType y_aij = 0; - if(!drop_view(x)) { - x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - } - if(!drop_view(y)) { - y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - } + auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row + 1))); + auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row + 1))); - if(realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) { - if(i < min) { - min = i; + // find magnitudes + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) { + index_view(colID) = colID; + LO col = rowView.colidx(colID); + // ignore diagonals for now, they are checked again later + // Don't aggregate boundaries + if (row == col || boundary(col)) { + drop_view(colID) = true; + } else { + drop_view(colID) = false; } - } - }, Kokkos::Min(dropStart)); - } else if (classicalAlgo == scaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); - auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); - return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); - } - }); + }); - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename implATS::magnitudeType x_val = 0; - typename implATS::magnitudeType y_val = 0; - if(!drop_view(x)) { - typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); - x_val = x_aij / x_aiiajj; - } - if(!drop_view(y)) { - typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); - y_val = y_aij / y_aiiajj; - } + size_t dropStart = nnz; + if (classicalAlgo == unscaled_cut) { + // push diagonals and boundaries to the right, sort everything else by aij on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if (drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + return x_aij > y_aij; + } + }); - if(realThresholdKokkos * realThresholdKokkos * x_val > y_val) { - if(i < min) { - min = i; - } + // find index where dropping starts + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_aij = 0; + typename implATS::magnitudeType y_aij = 0; + if (!drop_view(x)) { + x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + } + if (!drop_view(y)) { + y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + } + + if (realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) { + if (i < min) { + min = i; + } + } + }, + Kokkos::Min(dropStart)); + } else if (classicalAlgo == scaled_cut) { + // push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if (drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); + } + }); + + // find index where dropping starts + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_val = 0; + typename implATS::magnitudeType y_val = 0; + if (!drop_view(x)) { + typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + x_val = x_aij / x_aiiajj; + } + if (!drop_view(y)) { + typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + y_val = y_aij / y_aiiajj; + } + + if (realThresholdKokkos * realThresholdKokkos * x_val > y_val) { + if (i < min) { + min = i; + } + } + }, + Kokkos::Min(dropStart)); } - }, Kokkos::Min(dropStart)); - } - //drop everything to the right of where values stop passing threshold - if(dropStart < nnz) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) { - drop_view(index_view(i)) = true; - }); - } + // drop everything to the right of where values stop passing threshold + if (dropStart < nnz) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) { + drop_view(index_view(i)) = true; + }); + } - LO rownnz = 0; - GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) { - LO col = rowView.colidx(idxID); - //don't drop diagonal - if(row == col || !drop_view(idxID)) { - columnsDevice(A_device.graph.row_map(row) + idxID) = col; - keep++; - } - else { - columnsDevice(A_device.graph.row_map(row) + idxID) = -1; - drop++; - } - }, rownnz, rowDropped); + LO rownnz = 0; + GO rowDropped = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) { + LO col = rowView.colidx(idxID); + // don't drop diagonal + if (row == col || !drop_view(idxID)) { + columnsDevice(A_device.graph.row_map(row) + idxID) = col; + keep++; + } else { + columnsDevice(A_device.graph.row_map(row) + idxID) = -1; + drop++; + } + }, + rownnz, rowDropped); - globalnnz += rownnz; - totalDropped += rowDropped; - rownnzView(row) = rownnz; - }, realnnz, numDropped); + globalnnz += rownnz; + totalDropped += rowDropped; + rownnzView(row) = rownnz; + }, + realnnz, numDropped); - //update column indices so that kept indices are aligned to the left for subview that happens later on + // update column indices so that kept indices are aligned to the left for subview that happens later on Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); Kokkos::deep_copy(columns, columnsDevice); - //update row indices by adding up new # of nnz in each row + // update row indices by adding up new # of nnz in each row auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); - Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { - partial_sum += rownnzView(i); - if(is_final) rowsDevice(i+1) = partial_sum; - }); + Kokkos::parallel_scan( + Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { + partial_sum += rownnzView(i); + if (is_final) rowsDevice(i + 1) = partial_sum; + }); Kokkos::deep_copy(rows, rowsDevice); } diff --git a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp index 0073ca7e9bfb..7ec8dbe27a3a 100644 --- a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp +++ b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp @@ -1402,36 +1402,31 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); const global_size_t globalIndices = 12; - const GO indexBase = 0; - RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); RCP A_t(new crs_matrix_type(map, 5)); - const SC two = static_cast(2.0); - const SC one = static_cast(1.0); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); const SC negOne = static_cast(-1.0); - for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + for (LO lclRow = 0; lclRow < static_cast(map->getLocalNumElements()); lclRow++) { const GO gblRow = map->getGlobalElement(lclRow); - if(gblRow == 0) { + if (gblRow == 0) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); - } - else if(static_cast(gblRow) == globalIndices - 1) { + } else if (static_cast(gblRow) == globalIndices - 1) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); - } - else if(gblRow == 2 || gblRow == 9) { + } else if (gblRow == 2 || gblRow == 9) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); - } - else if(gblRow == 5) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); - } - else if(gblRow == 6) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); - } - else { + } else if (gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } else if (gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, two, two, two, negOne)); + } else { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); } } A_t->fillComplete(); RCP A_x = rcp(new TpetraCrsMatrix(A_t)); - RCP A = rcp(new CrsMatrixWrap(A_x)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; @@ -1461,19 +1456,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); - int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; int columns[28] = {0, 1, 0, 1, 2, @@ -1486,23 +1481,23 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala 9, 10, 11, 10, 11}; - auto rowPtrs = graph->getRowPtrs(); - auto entries = graph->getEntries(); - size_t rowID = 0; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; TEST_EQUALITY(rowPtrs(0), rowID); - for(size_t i = 0; i < rowPtrs.size()-1; i++) { + for (size_t i = 0; i < rowPtrs.size() - 1; i++) { auto gblID = myDomainMap->getGlobalElement(i); - int rownnz = rows[gblID+1]-rows[gblID]; + int rownnz = rows[gblID + 1] - rows[gblID]; rowID += rownnz; - TEST_EQUALITY(rowPtrs(i+1), rowID); + TEST_EQUALITY(rowPtrs(i + 1), rowID); std::vector colID; - for(int j = 0; j < rownnz; j++) { - colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + for (int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i) + j))); } std::sort(std::begin(colID), std::end(colID)); - for(int j = 0; j < rownnz; j++) { - TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + for (int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID] + j]); } } } // ClassicalScaledCut @@ -1525,36 +1520,31 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); const global_size_t globalIndices = 12; - const GO indexBase = 0; - RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); RCP A_t(new crs_matrix_type(map, 5)); - const SC two = static_cast(2.0); - const SC one = static_cast(1.0); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); const SC negOne = static_cast(-1.0); - for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + for (LO lclRow = 0; lclRow < static_cast(map->getLocalNumElements()); lclRow++) { const GO gblRow = map->getGlobalElement(lclRow); - if(gblRow == 0) { + if (gblRow == 0) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); - } - else if(static_cast(gblRow) == globalIndices - 1) { + } else if (static_cast(gblRow) == globalIndices - 1) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); - } - else if(gblRow == 2 || gblRow == 9) { + } else if (gblRow == 2 || gblRow == 9) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); - } - else if(gblRow == 5) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); - } - else if(gblRow == 6) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); - } - else { + } else if (gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } else if (gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, two, two, two, negOne)); + } else { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); } } A_t->fillComplete(); RCP A_x = rcp(new TpetraCrsMatrix(A_t)); - RCP A = rcp(new CrsMatrixWrap(A_x)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; @@ -1584,19 +1574,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); - int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; int columns[28] = {0, 1, 0, 1, 2, @@ -1609,23 +1599,23 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca 9, 10, 11, 10, 11}; - auto rowPtrs = graph->getRowPtrs(); - auto entries = graph->getEntries(); - size_t rowID = 0; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; TEST_EQUALITY(rowPtrs(0), rowID); - for(size_t i = 0; i < rowPtrs.size()-1; i++) { + for (size_t i = 0; i < rowPtrs.size() - 1; i++) { auto gblID = myDomainMap->getGlobalElement(i); - int rownnz = rows[gblID+1]-rows[gblID]; + int rownnz = rows[gblID + 1] - rows[gblID]; rowID += rownnz; - TEST_EQUALITY(rowPtrs(i+1), rowID); + TEST_EQUALITY(rowPtrs(i + 1), rowID); std::vector colID; - for(int j = 0; j < rownnz; j++) { - colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + for (int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i) + j))); } std::sort(std::begin(colID), std::end(colID)); - for(int j = 0; j < rownnz; j++) { - TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + for (int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID] + j]); } } } // ClassicalUnScaledCut