From e67173ac2cec396bf91655f5b3b9f7cec7e4a16f Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Mon, 15 Jul 2024 17:15:36 -0600 Subject: [PATCH 01/25] MueLu: Cut Drop Converted to Use Kokkos Original code within ORIGINAL ifdef. New code within NEW ifdef. DropTol structure marked with KOKKOS_INLINE_FUNCTION and default values are hard coded. Default Algorithm and Cut Drop Algorithm split into separate for loops in NEW code. Cut Drop converted to use Kokkos nested parallel loops. Timers placed in new code and are commented out. Code passes current unit tests. Saw a speedup of about 1.5x with Cuda and 1.2x with Serial when running unit tests with 10,000,000 rows. Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_decl.hpp | 1 + .../MueLu_CoalesceDropFactory_def.hpp | 1663 ++++++++++++++++- 2 files changed, 1657 insertions(+), 7 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp index 96b5e778f6bc..db5e9a291313 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp @@ -160,6 +160,7 @@ class CoalesceDropFactory : public SingleLevelFactoryBase { //@} void Build(Level& currentLevel) const; // Build + void BuildKokkos(Level& currentLevel) const; private: // pre-drop function diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 2c421c477bde..a8befaea592b 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -61,6 +61,8 @@ #include +#include //NEW +#include //NEW #include "MueLu_CoalesceDropFactory_decl.hpp" #include "MueLu_AmalgamationFactory.hpp" @@ -92,22 +94,30 @@ namespace MueLu { namespace Details { template struct DropTol { + KOKKOS_INLINE_FUNCTION //NEW DropTol() = default; + KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol const&) = default; + KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol&&) = default; DropTol& operator=(DropTol const&) = default; DropTol& operator=(DropTol&&) = default; + KOKKOS_INLINE_FUNCTION //NEW DropTol(real_type val_, real_type diag_, LO col_, bool drop_) : val{val_} , diag{diag_} , col{col_} , drop{drop_} {} - real_type val{Teuchos::ScalarTraits::zero()}; - real_type diag{Teuchos::ScalarTraits::zero()}; - LO col{Teuchos::OrdinalTraits::invalid()}; + real_type val{0}; + real_type diag{0}; + LO col{-1}; + //NEW Can't run these host functions on device + //real_type val{Teuchos::ScalarTraits::zero()}; + //real_type diag{Teuchos::ScalarTraits::zero()}; + //LO col{Teuchos::OrdinalTraits::invalid()}; bool drop{true}; // CMS: Auxillary information for debugging info @@ -414,6 +424,1645 @@ void CoalesceDropFactory::Build(Level TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); + /************************** RS or SA-style Classical Dropping (and variants) **************************/ + if (algo == "classical") { + if (predrop_ == null) { + // ap: this is a hack: had to declare predrop_ as mutable + predrop_ = rcp(new PreDropFunctionConstVal(threshold)); + } + + if (predrop_ != null) { + RCP predropConstVal = rcp_dynamic_cast(predrop_); + TEUCHOS_TEST_FOR_EXCEPTION(predropConstVal == Teuchos::null, Exceptions::BadCast, + "MueLu::CoalesceFactory::Build: cast to PreDropFunctionConstVal failed."); + // If a user provided a predrop function, it overwrites the XML threshold parameter + SC newt = predropConstVal->GetThreshold(); + if (newt != threshold) { + GetOStream(Warnings0) << "switching threshold parameter from " << threshold << " (list) to " << newt << " (user function" << std::endl; + threshold = newt; + } + } + // At this points we either have + // (predrop_ != null) + // Therefore, it is sufficient to check only threshold + if (BlockSize == 1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) { + // Case 1: scalar problem, no dropping => just use matrix graph + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + // Detect and record rows that correspond to Dirichlet boundary conditions + auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); + + graph->SetBoundaryNodeMap(boundaryNodes); + numTotal = A->getLocalNumEntries(); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < boundaryNodes.size(); ++i) + if (boundaryNodes[i]) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "DofsPerNode", 1); + Set(currentLevel, "Graph", graph); + + } else if ((BlockSize == 1 && threshold != STS::zero()) || + (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) || + (BlockSize == 1 && useSignedClassicalRS) || + (BlockSize == 1 && useSignedClassicalSA)) { + // Case 2: scalar problem with dropping => record the column indices of undropped entries, but still use original + // graph's map information, e.g., whether index is local + // OR a matrix without a CrsGraph + + // allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", A->getLocalNumRows() + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + + using MT = typename STS::magnitudeType; + RCP ghostedDiag; + ArrayRCP ghostedDiagVals; + ArrayRCP negMaxOffDiagonal; + // RS style needs the max negative off-diagonal, SA style needs the diagonal + if (useSignedClassicalRS) { + if (ghostedBlockNumber.is_null()) { + negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A); + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Calculated max point off-diagonal" << std::endl; + } else { + negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A, *ghostedBlockNumber); + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl; + } + } else { + ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); + ghostedDiagVals = ghostedDiag->getData(0); + } + auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) { + if (ghostedBlockNumber.is_null()) { + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl; + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); + } else { + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Applying block row sum criterion." << std::endl; + Utilities::ApplyRowSumCriterionHost(*A, *ghostedBlockNumber, rowSumTol, boundaryNodes); + } + } + + LO realnnz = 0; + rows(0) = 0; +#define NEW +#ifdef ORIGINAL + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + size_t nnz = A->getNumEntriesInLocalRow(row); + bool rowIsDirichlet = boundaryNodes[row]; + ArrayView indices; + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + if (classicalAlgo == defaultAlgo) { + // FIXME the current predrop function uses the following + // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) + // FIXME but the threshold doesn't take into account the rows' diagonal entries + // FIXME For now, hardwiring the dropping in here + + LO rownnz = 0; + if (useSignedClassicalRS) { + // Signed classical RS style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); + MT neg_aij = -STS::real(vals[colID]); + /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], + g_block_id.is_null() ? -1 : g_block_id[row], + g_block_id.is_null() ? -1 : g_block_id[col], + neg_aij, max_neg_aik);*/ + if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { + columns[realnnz++] = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } else if (useSignedClassicalSA) { + // Signed classical SA style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + + bool is_nonpositive = STS::real(vals[colID]) <= 0; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 + /* + if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], + vals[colID],aij, aiiajj); + */ + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows[row + 1] = realnnz; + } else { + // Standard abs classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } + } else { + /* Cut Algorithm */ + // CMS + using DropTol = Details::DropTol; + std::vector drop_vec; + drop_vec.reserve(nnz); + const real_type zero = Teuchos::ScalarTraits::zero(); + const real_type one = Teuchos::ScalarTraits::one(); + LO rownnz = 0; + // NOTE: This probably needs to be fixed for rowsum + + // find magnitudes + for (LO colID = 0; colID < (LO)nnz; colID++) { + LO col = indices[colID]; + if (row == col) { + drop_vec.emplace_back(zero, one, colID, false); + continue; + } + + // Don't aggregate boundaries + if (boundaryNodes[colID]) continue; + typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + typename STS::magnitudeType aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + drop_vec.emplace_back(aij, aiiajj, colID, false); + } + + const size_t n = drop_vec.size(); + + if (classicalAlgo == unscaled_cut) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val > b.val; + }); + + bool drop = false; + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val; + auto b = y.val; + if (a > realThreshold * b) { + drop = true; +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + } + drop_vec[i].drop = drop; + } + } else if (classicalAlgo == scaled_cut) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val / a.diag > b.val / b.diag; + }); + bool drop = false; + // printf("[%d] Scaled Cut: ",(int)row); + // printf("%3d(%4s) ",indices[drop_vec[0].col],"keep"); + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val / x.diag; + auto b = y.val / y.diag; + if (a > realThreshold * b) { + drop = true; + +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + // printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep"); + } + drop_vec[i].drop = drop; + } + // printf("\n"); + } + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.col < b.col; + }); + + for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { + LO col = indices[drop_vec[idxID].col]; + // don't drop diagonal + if (row == col) { + columns[realnnz++] = col; + rownnz++; + continue; + } + + if (!drop_vec[idxID].drop) { + columns[realnnz++] = col; + rownnz++; + } else { + numDropped++; + } + } + // CMS + rows[row + 1] = realnnz; + } + } // end for row +#endif + +#ifdef NEW + if(classicalAlgo == defaultAlgo) { + SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + size_t nnz = A->getNumEntriesInLocalRow(row); + bool rowIsDirichlet = boundaryNodes[row]; + ArrayView indices; + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + // FIXME the current predrop function uses the following + // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) + // FIXME but the threshold doesn't take into account the rows' diagonal entries + // FIXME For now, hardwiring the dropping in here + + LO rownnz = 0; + if (useSignedClassicalRS) { + // Signed classical RS style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); + MT neg_aij = -STS::real(vals[colID]); + /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], + g_block_id.is_null() ? -1 : g_block_id[row], + g_block_id.is_null() ? -1 : g_block_id[col], + neg_aij, max_neg_aik);*/ + if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { + columns[realnnz++] = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } else if (useSignedClassicalSA) { + // Signed classical SA style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + + bool is_nonpositive = STS::real(vals[colID]) <= 0; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 + /* + if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], + vals[colID],aij, aiiajj); + */ + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows[row + 1] = realnnz; + } else { + // Standard abs classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } + } // end for row + } + else { //NEW START + //auto stackedTimer = rcp(new Teuchos::StackedTimer("timer")); + //Teuchos::TimeMonitor::setStackedTimer(stackedTimer); + //stackedTimer->start("init"); + SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); + using ExecSpace = typename Node::execution_space; + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + using DropTol = Details::DropTol; + + //move from host to device + ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); + Kokkos::View ghostedDiagValsView = Kokkos::Compat::getKokkosViewDeepCopy(ghostedDiagValsArrayView); + auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); + + auto At = Utilities::Op2TpetraCrs(A); + auto A_device = At->getLocalMatrixDevice(); + + int algorithm = classicalAlgo; + Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + //stackedTimer->stop("init"); + + //stackedTimer->start("loop"); + Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { + LO row = teamMember.league_rank(); + auto rowView = A_device.row(row); + size_t nnz = rowView.length; + + size_t n = 0; + auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + //find magnitudes + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) { + LO col = rowView.colidx(colID); + if(row == col) { + drop_view(colID) = DropTol(0, 1, colID, false); + count++; + } + //Don't aggregate boundaries + else if(!boundaryNodesDevice(colID)) { + typename STS::magnitudeType aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(col) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + typename STS::magnitudeType aij = static_cast(std::fabs(static_cast(rowView.value(colID) * rowView.value(colID)))); // |a_i j|^2 + drop_view(colID) = DropTol(aij, aiiajj, colID, false); + count++; + } + }, n); + if (algorithm == unscaled_cut) { + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { + return a.val > b.val; + }); + + //find index where dropping starts + size_t dropStart; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = drop_view(i - 1); + auto const& y = drop_view(i); + auto a = x.val; + auto b = y.val; + if(a > realThreshold * b) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(i).drop = true; + }); + } + } else if (algorithm == scaled_cut) { + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { + return a.val / a.diag > b.val / b.diag; + }); + + //find index where dropping starts + size_t dropStart; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = drop_view(i - 1); + auto const& y = drop_view(i); + auto a = x.val / x.diag; + auto b = y.val / y.diag; + if(a > realThreshold * b) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(i).drop = true; + }); + } + } + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { + return a.col < b.col; + }); + + LO rownnz = 0; + GO rowDropped = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { + LO col = rowView.colidx(idxID); + //don't drop diagonal + if(row == col || !drop_view(idxID).drop) { + keep++; + } + else { + rowView.colidx(idxID) = -1; + drop++; + } + }, rownnz, rowDropped); + globalnnz += rownnz; + totalDropped += rowDropped; + rownnzView(row) = rownnz; + }, realnnz, numDropped); + //stackedTimer->stop("loop"); + + //stackedTimer->start("remove"); + + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); + Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); + Kokkos::deep_copy(columns, columnsDevice); + + //stackedTimer->stop("remove"); + + //update row indices + //stackedTimer->start("scan"); + auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); + Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { + partial_sum += rownnzView(i); + if(is_final) rowsDevice(i+1) = partial_sum; + }); + Kokkos::deep_copy(rows, rowsDevice); + //stackedTimer->stop("scan"); + + //stackedTimer->stop("timer"); + //stackedTimer->report(std::cout, Teuchos::DefaultComm::getComm()); + } //NEW END +#endif + + numTotal = A->getLocalNumEntries(); + + if (aggregationMayCreateDirichlet) { + // If the only element remaining after filtering is diagonal, mark node as boundary + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + if (rows[row + 1] - rows[row] <= 1) + boundaryNodes[row] = true; + } + } + + RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), A->getRowMap(), A->getColMap(), "thresholded graph of A")); + graph->SetBoundaryNodeMap(boundaryNodes); + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < boundaryNodes.size(); ++i) + if (boundaryNodes(i)) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", 1); + + // If we're doing signed classical, we might want to block-diagonalize *after* the dropping + if (generateColoringGraph) { + RCP colorGraph; + RCP importer = A->getCrsGraph()->getImporter(); + BlockDiagonalizeGraph(graph, ghostedBlockNumber, colorGraph, importer); + Set(currentLevel, "Coloring Graph", colorGraph); + // #define CMS_DUMP +#ifdef CMS_DUMP + { + Xpetra::IO::Write("m_regular_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(graph)->GetCrsGraph()); + Xpetra::IO::Write("m_color_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(colorGraph)->GetCrsGraph()); + // int rank = graph->GetDomainMap()->getComm()->getRank(); + // { + // std::ofstream ofs(std::string("m_color_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); + // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); + // colorGraph->print(*fancy,Debug); + // } + // { + // std::ofstream ofs(std::string("m_regular_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); + // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); + // graph->print(*fancy,Debug); + // } + } +#endif + } // end generateColoringGraph + } else if (BlockSize > 1 && threshold == STS::zero()) { + // Case 3: Multiple DOF/node problem without dropping + const RCP rowMap = A->getRowMap(); + const RCP colMap = A->getColMap(); + + graphType = "amalgamated"; + + // build node row map (uniqueMap) and node column map (nonUniqueMap) + // the arrays rowTranslation and colTranslation contain the local node id + // given a local dof id. The data is calculated by the AmalgamationFactory and + // stored in the variable container "UnAmalgamationInfo" + RCP uniqueMap = amalInfo->getNodeRowMap(); + RCP nonUniqueMap = amalInfo->getNodeColMap(); + Array rowTranslation = *(amalInfo->getRowTranslation()); + Array colTranslation = *(amalInfo->getColTranslation()); + + // get number of local nodes + LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); + + // Allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); + Kokkos::deep_copy(amalgBoundaryNodes, false); + + // Detect and record rows that correspond to Dirichlet boundary conditions + // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size + // TODO the array one bigger than the number of local rows, and the last entry can + // TODO hold the actual number of boundary nodes. Clever, huh? + ArrayRCP pointBoundaryNodes; + pointBoundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes); + + // extract striding information + LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) + LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) + LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map + if (A->IsView("stridedMaps") == true) { + Teuchos::RCP myMap = A->getRowMap("stridedMaps"); + Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); + blkSize = Teuchos::as(strMap->getFixedBlockSize()); + blkId = strMap->getStridedBlockId(); + if (blkId > -1) + blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); + } + + // loop over all local nodes + LO realnnz = 0; + rows(0) = 0; + Array indicesExtra; + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + indicesExtra.resize(0); + + // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet + // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). + // Therefore, looping over all dofs is fine here. We use blkPartSize as we work + // with local ids. + // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) + // node. + bool isBoundary = false; + if (pL.get("aggregation: greedy Dirichlet") == true) { + for (LO j = 0; j < blkPartSize; j++) { + if (pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = true; + break; + } + } + } else { + isBoundary = true; + for (LO j = 0; j < blkPartSize; j++) { + if (!pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = false; + break; + } + } + } + + // Merge rows of A + // The array indicesExtra contains local column node ids for the current local node "row" + if (!isBoundary) + MergeRows(*A, row, indicesExtra, colTranslation); + else + indicesExtra.push_back(row); + indices = indicesExtra; + numTotal += indices.size(); + + // add the local column node ids to the full columns array which + // contains the local column node ids for all local node rows + LO nnz = indices.size(), rownnz = 0; + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + columns(realnnz++) = col; + rownnz++; + } + + if (rownnz == 1) { + // If the only element remaining after filtering is diagonal, mark node as boundary + // FIXME: this should really be replaced by the following + // if (indices.size() == 1 && indices[0] == row) + // boundaryNodes[row] = true; + // We do not do it this way now because there is no framework for distinguishing isolated + // and boundary nodes in the aggregation algorithms + amalgBoundaryNodes[row] = true; + } + rows(row + 1) = realnnz; + } // for (LO row = 0; row < numRows; row++) + + RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes + << " agglomerated Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", blkSize); // full block size + + } else if (BlockSize > 1 && threshold != STS::zero()) { + // Case 4: Multiple DOF/node problem with dropping + const RCP rowMap = A->getRowMap(); + const RCP colMap = A->getColMap(); + graphType = "amalgamated"; + + // build node row map (uniqueMap) and node column map (nonUniqueMap) + // the arrays rowTranslation and colTranslation contain the local node id + // given a local dof id. The data is calculated by the AmalgamationFactory and + // stored in the variable container "UnAmalgamationInfo" + RCP uniqueMap = amalInfo->getNodeRowMap(); + RCP nonUniqueMap = amalInfo->getNodeColMap(); + Array rowTranslation = *(amalInfo->getRowTranslation()); + Array colTranslation = *(amalInfo->getColTranslation()); + + // get number of local nodes + LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); + + // Allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); + Kokkos::deep_copy(amalgBoundaryNodes, false); + + // Detect and record rows that correspond to Dirichlet boundary conditions + // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size + // TODO the array one bigger than the number of local rows, and the last entry can + // TODO hold the actual number of boundary nodes. Clever, huh? + auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); + + // extract striding information + LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) + LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) + LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map + if (A->IsView("stridedMaps") == true) { + Teuchos::RCP myMap = A->getRowMap("stridedMaps"); + Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); + blkSize = Teuchos::as(strMap->getFixedBlockSize()); + blkId = strMap->getStridedBlockId(); + if (blkId > -1) + blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); + } + + // extract diagonal data for dropping strategy + RCP ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); + const ArrayRCP ghostedDiagVals = ghostedDiag->getData(0); + + // loop over all local nodes + LO realnnz = 0; + rows[0] = 0; + Array indicesExtra; + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + indicesExtra.resize(0); + + // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet + // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). + // Therefore, looping over all dofs is fine here. We use blkPartSize as we work + // with local ids. + // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) + // node. + bool isBoundary = false; + if (pL.get("aggregation: greedy Dirichlet") == true) { + for (LO j = 0; j < blkPartSize; j++) { + if (pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = true; + break; + } + } + } else { + isBoundary = true; + for (LO j = 0; j < blkPartSize; j++) { + if (!pointBoundaryNodes[row * blkPartSize + j]) { + isBoundary = false; + break; + } + } + } + + // Merge rows of A + // The array indicesExtra contains local column node ids for the current local node "row" + if (!isBoundary) + MergeRowsWithDropping(*A, row, ghostedDiagVals, threshold, indicesExtra, colTranslation); + else + indicesExtra.push_back(row); + indices = indicesExtra; + numTotal += indices.size(); + + // add the local column node ids to the full columns array which + // contains the local column node ids for all local node rows + LO nnz = indices.size(), rownnz = 0; + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + columns[realnnz++] = col; + rownnz++; + } + + if (rownnz == 1) { + // If the only element remaining after filtering is diagonal, mark node as boundary + // FIXME: this should really be replaced by the following + // if (indices.size() == 1 && indices[0] == row) + // boundaryNodes[row] = true; + // We do not do it this way now because there is no framework for distinguishing isolated + // and boundary nodes in the aggregation algorithms + amalgBoundaryNodes[row] = true; + } + rows[row + 1] = realnnz; + } // for (LO row = 0; row < numRows; row++) + // columns.resize(realnnz); + + RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes + << " agglomerated Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", blkSize); // full block size + } + + } else if (algo == "distance laplacian") { + LO blkSize = A->GetFixedBlockSize(); + GO indexBase = A->getRowMap()->getIndexBase(); + // [*0*] : FIXME + // ap: somehow, if I move this line to [*1*], Belos throws an error + // I'm not sure what's going on. Do we always have to Get data, if we did + // DeclareInput for it? + // RCP Coords = Get< RCP >(currentLevel, "Coordinates"); + + // Detect and record rows that correspond to Dirichlet boundary conditions + // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size + // TODO the array one bigger than the number of local rows, and the last entry can + // TODO hold the actual number of boundary nodes. Clever, huh? + auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); + + if ((blkSize == 1) && (threshold == STS::zero())) { + // Trivial case: scalar problem, no dropping. Can return original graph + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + graph->SetBoundaryNodeMap(pointBoundaryNodes); + graphType = "unamalgamated"; + numTotal = A->getLocalNumEntries(); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < pointBoundaryNodes.size(); ++i) + if (pointBoundaryNodes(i)) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + + Set(currentLevel, "DofsPerNode", blkSize); + Set(currentLevel, "Graph", graph); + + } else { + // ap: We make quite a few assumptions here; general case may be a lot different, + // but much much harder to implement. We assume that: + // 1) all maps are standard maps, not strided maps + // 2) global indices of dofs in A are related to dofs in coordinates in a simple arithmetic + // way: rows i*blkSize, i*blkSize+1, ..., i*blkSize + (blkSize-1) correspond to node i + // + // NOTE: Potentially, some of the code below could be simplified with UnAmalgamationInfo, + // but as I totally don't understand that code, here is my solution + + // [*1*]: see [*0*] + + // Check that the number of local coordinates is consistent with the #rows in A + TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() / blkSize != Coords->getLocalLength(), Exceptions::Incompatible, + "Coordinate vector length (" << Coords->getLocalLength() << ") is incompatible with number of rows in A (" << A->getRowMap()->getLocalNumElements() << ") by modulo block size (" << blkSize << ")."); + + const RCP colMap = A->getColMap(); + RCP uniqueMap, nonUniqueMap; + Array colTranslation; + if (blkSize == 1) { + uniqueMap = A->getRowMap(); + nonUniqueMap = A->getColMap(); + graphType = "unamalgamated"; + + } else { + uniqueMap = Coords->getMap(); + TEUCHOS_TEST_FOR_EXCEPTION(uniqueMap->getIndexBase() != indexBase, Exceptions::Incompatible, + "Different index bases for matrix and coordinates"); + + AmalgamationFactory::AmalgamateMap(*(A->getColMap()), *A, nonUniqueMap, colTranslation); + + graphType = "amalgamated"; + } + LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); + + RCP ghostedCoords; + RCP ghostedLaplDiag; + Teuchos::ArrayRCP ghostedLaplDiagData; + if (threshold != STS::zero()) { + // Get ghost coordinates + RCP importer; + { + SubFactoryMonitor m1(*this, "Import construction", currentLevel); + if (blkSize == 1 && realA->getCrsGraph()->getImporter() != Teuchos::null) { + GetOStream(Warnings1) << "Using existing importer from matrix graph" << std::endl; + importer = realA->getCrsGraph()->getImporter(); + } else { + GetOStream(Warnings0) << "Constructing new importer instance" << std::endl; + importer = ImportFactory::Build(uniqueMap, nonUniqueMap); + } + } // subtimer + ghostedCoords = Xpetra::MultiVectorFactory::Build(nonUniqueMap, Coords->getNumVectors()); + { + SubFactoryMonitor m1(*this, "Coordinate import", currentLevel); + ghostedCoords->doImport(*Coords, *importer, Xpetra::INSERT); + } // subtimer + + // Construct Distance Laplacian diagonal + RCP localLaplDiag = VectorFactory::Build(uniqueMap); + Array indicesExtra; + Teuchos::Array> coordData; + if (threshold != STS::zero()) { + const size_t numVectors = ghostedCoords->getNumVectors(); + coordData.reserve(numVectors); + for (size_t j = 0; j < numVectors; j++) { + Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); + coordData.push_back(tmpData); + } + } + { + SubFactoryMonitor m1(*this, "Laplacian local diagonal", currentLevel); + ArrayRCP localLaplDiagData = localLaplDiag->getDataNonConst(0); + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + + if (blkSize == 1) { + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + } else { + // Merge rows of A + indicesExtra.resize(0); + MergeRows(*A, row, indicesExtra, colTranslation); + indices = indicesExtra; + } + + LO nnz = indices.size(); + bool haveAddedToDiag = false; + for (LO colID = 0; colID < nnz; colID++) { + const LO col = indices[colID]; + + if (row != col) { + if (use_dlap_weights == SINGLE_WEIGHTS) { + /*printf("[%d,%d] Unweighted Distance = %6.4e Weighted Distance = %6.4e\n",row,col, + MueLu::Utilities::Distance2(coordData, row, col), + MueLu::Utilities::Distance2(dlap_weights(),coordData, row, col));*/ + localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); + } else if (use_dlap_weights == BLOCK_WEIGHTS) { + int block_id = row % interleaved_blocksize; + int block_start = block_id * interleaved_blocksize; + localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); + } else { + // printf("[%d,%d] Unweighted Distance = %6.4e\n",row,col,MueLu::Utilities::Distance2(coordData, row, col)); + localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(coordData, row, col); + } + haveAddedToDiag = true; + } + } + // Deal with the situation where boundary conditions have only been enforced on rows, but not on columns. + // We enforce dropping of these entries by assigning a very large number to the diagonal entries corresponding to BCs. + if (!haveAddedToDiag) + localLaplDiagData[row] = STS::rmax(); + } + } // subtimer + { + SubFactoryMonitor m1(*this, "Laplacian distributed diagonal", currentLevel); + ghostedLaplDiag = VectorFactory::Build(nonUniqueMap); + ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT); + ghostedLaplDiagData = ghostedLaplDiag->getDataNonConst(0); + } // subtimer + + } else { + GetOStream(Runtime0) << "Skipping distance laplacian construction due to 0 threshold" << std::endl; + } + + // NOTE: ghostedLaplDiagData might be zero if we don't actually calculate the laplacian + + // allocate space for the local graph + typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); + typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); + +#ifdef HAVE_MUELU_DEBUG + // DEBUGGING + for (LO i = 0; i < (LO)columns.size(); i++) columns[i] = -666; +#endif + + // Extra array for if we're allowing symmetrization with cutting + ArrayRCP rows_stop; + bool use_stop_array = threshold != STS::zero() && distanceLaplacianAlgo == scaled_cut_symmetric; + if (use_stop_array) + // rows_stop = typename LWGraph::row_type::non_const_type("rows_stop", numRows); + rows_stop.resize(numRows); + + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); + Kokkos::deep_copy(amalgBoundaryNodes, false); + + LO realnnz = 0; + rows(0) = 0; + + Array indicesExtra; + { + SubFactoryMonitor m1(*this, "Laplacian dropping", currentLevel); + Teuchos::Array> coordData; + if (threshold != STS::zero()) { + const size_t numVectors = ghostedCoords->getNumVectors(); + coordData.reserve(numVectors); + for (size_t j = 0; j < numVectors; j++) { + Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); + coordData.push_back(tmpData); + } + } + + ArrayView vals; // CMS hackery + for (LO row = 0; row < numRows; row++) { + ArrayView indices; + indicesExtra.resize(0); + bool isBoundary = false; + + if (blkSize == 1) { + // ArrayView vals;//CMS uncomment + A->getLocalRowView(row, indices, vals); + isBoundary = pointBoundaryNodes[row]; + } else { + // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet + for (LO j = 0; j < blkSize; j++) { + if (!pointBoundaryNodes[row * blkSize + j]) { + isBoundary = false; + break; + } + } + + // Merge rows of A + if (!isBoundary) + MergeRows(*A, row, indicesExtra, colTranslation); + else + indicesExtra.push_back(row); + indices = indicesExtra; + } + numTotal += indices.size(); + + LO nnz = indices.size(), rownnz = 0; + + if (use_stop_array) { + rows(row + 1) = rows(row) + nnz; + realnnz = rows(row); + } + + if (threshold != STS::zero()) { + // default + if (distanceLaplacianAlgo == defaultAlgo) { + /* Standard Distance Laplacian */ + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + + if (row == col) { + columns(realnnz++) = col; + rownnz++; + continue; + } + + // We do not want the distance Laplacian aggregating boundary nodes + if (isBoundary) continue; + + SC laplVal; + if (use_dlap_weights == SINGLE_WEIGHTS) { + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); + } else if (use_dlap_weights == BLOCK_WEIGHTS) { + int block_id = row % interleaved_blocksize; + int block_start = block_id * interleaved_blocksize; + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); + } else { + laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); + } + real_type aiiajj = STS::magnitude(realThreshold * realThreshold * ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); + real_type aij = STS::magnitude(laplVal * laplVal); + + if (aij > aiiajj) { + columns(realnnz++) = col; + rownnz++; + } else { + numDropped++; + } + } + } else { + /* Cut Algorithm */ + using DropTol = Details::DropTol; + std::vector drop_vec; + drop_vec.reserve(nnz); + const real_type zero = Teuchos::ScalarTraits::zero(); + const real_type one = Teuchos::ScalarTraits::one(); + + // find magnitudes + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + + if (row == col) { + drop_vec.emplace_back(zero, one, colID, false); + continue; + } + // We do not want the distance Laplacian aggregating boundary nodes + if (isBoundary) continue; + + SC laplVal; + if (use_dlap_weights == SINGLE_WEIGHTS) { + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); + } else if (use_dlap_weights == BLOCK_WEIGHTS) { + int block_id = row % interleaved_blocksize; + int block_start = block_id * interleaved_blocksize; + laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); + } else { + laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); + } + + real_type aiiajj = STS::magnitude(ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); + real_type aij = STS::magnitude(laplVal * laplVal); + + drop_vec.emplace_back(aij, aiiajj, colID, false); + } + + const size_t n = drop_vec.size(); + + if (distanceLaplacianAlgo == unscaled_cut) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val > b.val; + }); + + bool drop = false; + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val; + auto b = y.val; + if (a > realThreshold * b) { + drop = true; +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + } + drop_vec[i].drop = drop; + } + } else if (distanceLaplacianAlgo == scaled_cut || distanceLaplacianAlgo == scaled_cut_symmetric) { + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.val / a.diag > b.val / b.diag; + }); + + bool drop = false; + for (size_t i = 1; i < n; ++i) { + if (!drop) { + auto const& x = drop_vec[i - 1]; + auto const& y = drop_vec[i]; + auto a = x.val / x.diag; + auto b = y.val / y.diag; + if (a > realThreshold * b) { + drop = true; +#ifdef HAVE_MUELU_DEBUG + if (distanceLaplacianCutVerbose) { + std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; + } +#endif + } + } + drop_vec[i].drop = drop; + } + } + + std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { + return a.col < b.col; + }); + + for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { + LO col = indices[drop_vec[idxID].col]; + + // don't drop diagonal + if (row == col) { + columns(realnnz++) = col; + rownnz++; + // printf("(%d,%d) KEEP %13s matrix = %6.4e\n",row,row,"DIAGONAL",drop_vec[idxID].aux_val); + continue; + } + + if (!drop_vec[idxID].drop) { + columns(realnnz++) = col; + // printf("(%d,%d) KEEP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); + rownnz++; + } else { + // printf("(%d,%d) DROP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); + numDropped++; + } + } + } + } else { + // Skip laplace calculation and threshold comparison for zero threshold + for (LO colID = 0; colID < nnz; colID++) { + LO col = indices[colID]; + columns(realnnz++) = col; + rownnz++; + } + } + + if (rownnz == 1) { + // If the only element remaining after filtering is diagonal, mark node as boundary + // FIXME: this should really be replaced by the following + // if (indices.size() == 1 && indices[0] == row) + // boundaryNodes[row] = true; + // We do not do it this way now because there is no framework for distinguishing isolated + // and boundary nodes in the aggregation algorithms + amalgBoundaryNodes[row] = true; + } + + if (use_stop_array) + rows_stop[row] = rownnz + rows[row]; + else + rows[row + 1] = realnnz; + } // for (LO row = 0; row < numRows; row++) + + } // subtimer + + if (use_stop_array) { + // Do symmetrization of the cut matrix + // NOTE: We assume nested row/column maps here + for (LO row = 0; row < numRows; row++) { + for (LO colidx = rows[row]; colidx < rows_stop[row]; colidx++) { + LO col = columns[colidx]; + if (col >= numRows) continue; + + bool found = false; + for (LO t_col = rows(col); !found && t_col < rows_stop[col]; t_col++) { + if (columns[t_col] == row) + found = true; + } + // We didn't find the transpose buddy, so let's symmetrize, unless we'd be symmetrizing + // into a Dirichlet unknown. In that case don't. + if (!found && !pointBoundaryNodes[col] && Teuchos::as(rows_stop[col]) < rows[col + 1]) { + LO new_idx = rows_stop[col]; + // printf("(%d,%d) SYMADD entry\n",col,row); + columns[new_idx] = row; + rows_stop[col]++; + numDropped--; + } + } + } + + // Condense everything down + LO current_start = 0; + for (LO row = 0; row < numRows; row++) { + LO old_start = current_start; + for (LO col = rows(row); col < rows_stop[row]; col++) { + if (current_start != col) { + columns(current_start) = columns(col); + } + current_start++; + } + rows[row] = old_start; + } + rows(numRows) = realnnz = current_start; + } + + RCP graph; + { + SubFactoryMonitor m1(*this, "Build amalgamated graph", currentLevel); + graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + } // subtimer + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " agglomerated Dirichlet nodes" + << " using threshold " << dirichletThreshold << std::endl; + } + + Set(currentLevel, "Graph", graph); + Set(currentLevel, "DofsPerNode", blkSize); + } + } + + if ((GetVerbLevel() & Statistics1) && !(A->GetFixedBlockSize() > 1 && threshold != STS::zero())) { + RCP> comm = A->getRowMap()->getComm(); + GO numGlobalTotal, numGlobalDropped; + MueLu_sumAll(comm, numTotal, numGlobalTotal); + MueLu_sumAll(comm, numDropped, numGlobalDropped); + GetOStream(Statistics1) << "Number of dropped entries in " << graphType << " matrix graph: " << numGlobalDropped << "/" << numGlobalTotal; + if (numGlobalTotal != 0) + GetOStream(Statistics1) << " (" << 100 * Teuchos::as(numGlobalDropped) / Teuchos::as(numGlobalTotal) << "%)"; + GetOStream(Statistics1) << std::endl; + } + + } else { + // what Tobias has implemented + + SC threshold = as(pL.get("aggregation: drop tol")); + // GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + GetOStream(Runtime0) << "algorithm = \"" + << "failsafe" + << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + Set(currentLevel, "Filtering", (threshold != STS::zero())); + + RCP rowMap = A->getRowMap(); + RCP colMap = A->getColMap(); + + LO blockdim = 1; // block dim for fixed size blocks + GO indexBase = rowMap->getIndexBase(); // index base of maps + GO offset = 0; + + // 1) check for blocking/striding information + if (A->IsView("stridedMaps") && + Teuchos::rcp_dynamic_cast(A->getRowMap("stridedMaps")) != Teuchos::null) { + Xpetra::viewLabel_t oldView = A->SwitchToView("stridedMaps"); // note: "stridedMaps are always non-overlapping (correspond to range and domain maps!) + RCP strMap = Teuchos::rcp_dynamic_cast(A->getRowMap()); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null, Exceptions::BadCast, "MueLu::CoalesceFactory::Build: cast to strided row map failed."); + blockdim = strMap->getFixedBlockSize(); + offset = strMap->getOffset(); + oldView = A->SwitchToView(oldView); + GetOStream(Statistics1) << "CoalesceDropFactory::Build():" + << " found blockdim=" << blockdim << " from strided maps. offset=" << offset << std::endl; + } else + GetOStream(Statistics1) << "CoalesceDropFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl; + + // 2) get row map for amalgamated matrix (graph of A) + // with same distribution over all procs as row map of A + RCP nodeMap = amalInfo->getNodeRowMap(); + GetOStream(Statistics1) << "CoalesceDropFactory: nodeMap " << nodeMap->getLocalNumElements() << "/" << nodeMap->getGlobalNumElements() << " elements" << std::endl; + + // 3) create graph of amalgamated matrix + RCP crsGraph = CrsGraphFactory::Build(nodeMap, A->getLocalMaxNumRowEntries() * blockdim); + + LO numRows = A->getRowMap()->getLocalNumElements(); + LO numNodes = nodeMap->getLocalNumElements(); + typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numNodes); + Kokkos::deep_copy(amalgBoundaryNodes, false); + const ArrayRCP numberDirichletRowsPerNode(numNodes, 0); // helper array counting the number of Dirichlet nodes associated with node + bool bIsDiagonalEntry = false; // boolean flag stating that grid==gcid + + // 4) do amalgamation. generate graph of amalgamated matrix + // Note, this code is much more inefficient than the leightwight implementation + // Most of the work has already been done in the AmalgamationFactory + for (LO row = 0; row < numRows; row++) { + // get global DOF id + GO grid = rowMap->getGlobalElement(row); + + // reinitialize boolean helper variable + bIsDiagonalEntry = false; + + // translate grid to nodeid + GO nodeId = AmalgamationFactory::DOFGid2NodeId(grid, blockdim, offset, indexBase); + + size_t nnz = A->getNumEntriesInLocalRow(row); + Teuchos::ArrayView indices; + Teuchos::ArrayView vals; + A->getLocalRowView(row, indices, vals); + + RCP> cnodeIds = Teuchos::rcp(new std::vector); // global column block ids + LO realnnz = 0; + for (LO col = 0; col < Teuchos::as(nnz); col++) { + GO gcid = colMap->getGlobalElement(indices[col]); // global column id + + if (vals[col] != STS::zero()) { + GO cnodeId = AmalgamationFactory::DOFGid2NodeId(gcid, blockdim, offset, indexBase); + cnodeIds->push_back(cnodeId); + realnnz++; // increment number of nnz in matrix row + if (grid == gcid) bIsDiagonalEntry = true; + } + } + + if (realnnz == 1 && bIsDiagonalEntry == true) { + LO lNodeId = nodeMap->getLocalElement(nodeId); + numberDirichletRowsPerNode[lNodeId] += 1; // increment Dirichlet row counter associated with lNodeId + if (numberDirichletRowsPerNode[lNodeId] == blockdim) // mark full Dirichlet nodes + amalgBoundaryNodes[lNodeId] = true; + } + + Teuchos::ArrayRCP arr_cnodeIds = Teuchos::arcp(cnodeIds); + + if (arr_cnodeIds.size() > 0) + crsGraph->insertGlobalIndices(nodeId, arr_cnodeIds()); + } + // fill matrix graph + crsGraph->fillComplete(nodeMap, nodeMap); + + // 5) create MueLu Graph object + RCP graph = rcp(new LWGraph(crsGraph, "amalgamated graph of A")); + + // Detect and record rows that correspond to Dirichlet boundary conditions + graph->SetBoundaryNodeMap(amalgBoundaryNodes); + + if (GetVerbLevel() & Statistics1) { + GO numLocalBoundaryNodes = 0; + GO numGlobalBoundaryNodes = 0; + for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) + if (amalgBoundaryNodes(i)) + numLocalBoundaryNodes++; + RCP> comm = A->getRowMap()->getComm(); + MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); + GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; + } + + // 6) store results in Level + // graph->SetBoundaryNodeMap(gBoundaryNodeMap); + Set(currentLevel, "DofsPerNode", blockdim); + Set(currentLevel, "Graph", graph); + + } // if (doExperimentalWrap) ... else ... + +} // Build + +template +void CoalesceDropFactory::BuildKokkos(Level& currentLevel) const { + FactoryMonitor m(*this, "BuildKokkos", currentLevel); + + typedef Teuchos::ScalarTraits STS; + typedef typename STS::magnitudeType real_type; + typedef Xpetra::MultiVector RealValuedMultiVector; + typedef Xpetra::MultiVectorFactory RealValuedMultiVectorFactory; + + if (predrop_ != Teuchos::null) + GetOStream(Parameters0) << predrop_->description(); + + RCP realA = Get>(currentLevel, "A"); + RCP amalInfo = Get>(currentLevel, "UnAmalgamationInfo"); + const ParameterList& pL = GetParameterList(); + bool doExperimentalWrap = pL.get("lightweight wrap"); + + GetOStream(Parameters0) << "lightweight wrap = " << doExperimentalWrap << std::endl; + std::string algo = pL.get("aggregation: drop scheme"); + const bool aggregationMayCreateDirichlet = pL.get("aggregation: dropping may create Dirichlet"); + + RCP Coords; + RCP A; + + bool use_block_algorithm = false; + LO interleaved_blocksize = as(pL.get("aggregation: block diagonal: interleaved blocksize")); + bool useSignedClassicalRS = false; + bool useSignedClassicalSA = false; + bool generateColoringGraph = false; + + // NOTE: If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it + // in the block diagonalization). So we'll clobber the rowSumTol with -1.0 in this case + typename STS::magnitudeType rowSumTol = as(pL.get("aggregation: row sum drop tol")); + + RCP ghostedBlockNumber; + ArrayRCP g_block_id; + + if (algo == "distance laplacian") { + // Grab the coordinates for distance laplacian + Coords = Get>(currentLevel, "Coordinates"); + A = realA; + } else if (algo == "signed classical sa") { + useSignedClassicalSA = true; + algo = "classical"; + A = realA; + } else if (algo == "signed classical" || algo == "block diagonal colored signed classical" || algo == "block diagonal signed classical") { + useSignedClassicalRS = true; + // if(realA->GetFixedBlockSize() > 1) { + RCP BlockNumber = Get>(currentLevel, "BlockNumber"); + // Ghost the column block numbers if we need to + RCP importer = realA->getCrsGraph()->getImporter(); + if (!importer.is_null()) { + SubFactoryMonitor m1(*this, "Block Number import", currentLevel); + ghostedBlockNumber = Xpetra::VectorFactory::Build(importer->getTargetMap()); + ghostedBlockNumber->doImport(*BlockNumber, *importer, Xpetra::INSERT); + } else { + ghostedBlockNumber = BlockNumber; + } + g_block_id = ghostedBlockNumber->getData(0); + // } + if (algo == "block diagonal colored signed classical") + generateColoringGraph = true; + algo = "classical"; + A = realA; + + } else if (algo == "block diagonal") { + // Handle the "block diagonal" filtering and then leave + BlockDiagonalize(currentLevel, realA, false); + return; + } else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian") { + // Handle the "block diagonal" filtering, and then continue onward + use_block_algorithm = true; + RCP filteredMatrix = BlockDiagonalize(currentLevel, realA, true); + if (algo == "block diagonal distance laplacian") { + // We now need to expand the coordinates by the interleaved blocksize + RCP OldCoords = Get>(currentLevel, "Coordinates"); + if (OldCoords->getLocalLength() != realA->getLocalNumRows()) { + LO dim = (LO)OldCoords->getNumVectors(); + Coords = RealValuedMultiVectorFactory::Build(realA->getRowMap(), dim); + for (LO k = 0; k < dim; k++) { + ArrayRCP old_vec = OldCoords->getData(k); + ArrayRCP new_vec = Coords->getDataNonConst(k); + for (LO i = 0; i < (LO)OldCoords->getLocalLength(); i++) { + LO new_base = i * dim; + for (LO j = 0; j < interleaved_blocksize; j++) + new_vec[new_base + j] = old_vec[i]; + } + } + } else { + Coords = OldCoords; + } + algo = "distance laplacian"; + } else if (algo == "block diagonal classical") { + algo = "classical"; + } + // All cases + A = filteredMatrix; + rowSumTol = -1.0; + } else { + A = realA; + } + + // Distance Laplacian weights + Array dlap_weights = pL.get>("aggregation: distance laplacian directional weights"); + enum { NO_WEIGHTS = 0, + SINGLE_WEIGHTS, + BLOCK_WEIGHTS }; + int use_dlap_weights = NO_WEIGHTS; + if (algo == "distance laplacian") { + LO dim = (LO)Coords->getNumVectors(); + // If anything isn't 1.0 we need to turn on the weighting + bool non_unity = false; + for (LO i = 0; !non_unity && i < (LO)dlap_weights.size(); i++) { + if (dlap_weights[i] != 1.0) { + non_unity = true; + } + } + if (non_unity) { + LO blocksize = use_block_algorithm ? as(pL.get("aggregation: block diagonal: interleaved blocksize")) : 1; + if ((LO)dlap_weights.size() == dim) + use_dlap_weights = SINGLE_WEIGHTS; + else if ((LO)dlap_weights.size() == blocksize * dim) + use_dlap_weights = BLOCK_WEIGHTS; + else { + TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, + "length of 'aggregation: distance laplacian directional weights' must equal the coordinate dimension OR the coordinate dimension times the blocksize"); + } + if (GetVerbLevel() & Statistics1) + GetOStream(Statistics1) << "Using distance laplacian weights: " << dlap_weights << std::endl; + } + } + + // decide wether to use the fast-track code path for standard maps or the somewhat slower + // code path for non-standard maps + /*bool bNonStandardMaps = false; + if (A->IsView("stridedMaps") == true) { + Teuchos::RCP myMap = A->getRowMap("stridedMaps"); + Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); + TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); + if (strMap->getStridedBlockId() != -1 || strMap->getOffset() > 0) + bNonStandardMaps = true; + }*/ + + if (doExperimentalWrap) { + TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm"); + TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)"); + + SC threshold; + // If we're doing the ML-style halving of the drop tol at each level, we do that here. + if (pL.get("aggregation: use ml scaling of drop tol")) + threshold = pL.get("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID()); + else + threshold = as(pL.get("aggregation: drop tol")); + + std::string distanceLaplacianAlgoStr = pL.get("aggregation: distance laplacian algo"); + std::string classicalAlgoStr = pL.get("aggregation: classical algo"); + real_type realThreshold = STS::magnitude(threshold); // CMS: Rename this to "magnitude threshold" sometime + + //////////////////////////////////////////////////// + // Remove this bit once we are confident that cut-based dropping works. +#ifdef HAVE_MUELU_DEBUG + int distanceLaplacianCutVerbose = 0; +#endif +#ifdef DJS_READ_ENV_VARIABLES + if (getenv("MUELU_DROP_TOLERANCE_MODE")) { + distanceLaplacianAlgoStr = std::string(getenv("MUELU_DROP_TOLERANCE_MODE")); + } + + if (getenv("MUELU_DROP_TOLERANCE_THRESHOLD")) { + auto tmp = atoi(getenv("MUELU_DROP_TOLERANCE_THRESHOLD")); + realThreshold = 1e-4 * tmp; + } + +#ifdef HAVE_MUELU_DEBUG + if (getenv("MUELU_DROP_TOLERANCE_VERBOSE")) { + distanceLaplacianCutVerbose = atoi(getenv("MUELU_DROP_TOLERANCE_VERBOSE")); + } +#endif +#endif + //////////////////////////////////////////////////// + + enum decisionAlgoType { defaultAlgo, + unscaled_cut, + scaled_cut, + scaled_cut_symmetric }; + + decisionAlgoType distanceLaplacianAlgo = defaultAlgo; + decisionAlgoType classicalAlgo = defaultAlgo; + if (algo == "distance laplacian") { + if (distanceLaplacianAlgoStr == "default") + distanceLaplacianAlgo = defaultAlgo; + else if (distanceLaplacianAlgoStr == "unscaled cut") + distanceLaplacianAlgo = unscaled_cut; + else if (distanceLaplacianAlgoStr == "scaled cut") + distanceLaplacianAlgo = scaled_cut; + else if (distanceLaplacianAlgoStr == "scaled cut symmetric") + distanceLaplacianAlgo = scaled_cut_symmetric; + else + TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" << distanceLaplacianAlgoStr << "\""); + GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + } else if (algo == "classical") { + if (classicalAlgoStr == "default") + classicalAlgo = defaultAlgo; + else if (classicalAlgoStr == "unscaled cut") + classicalAlgo = unscaled_cut; + else if (classicalAlgoStr == "scaled cut") + classicalAlgo = scaled_cut; + else + TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: classical algo\" must be one of (default|unscaled cut|scaled cut), not \"" << classicalAlgoStr << "\""); + GetOStream(Runtime0) << "algorithm = \"" << algo << "\" classical algorithm = \"" << classicalAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + + } else + GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + Set(currentLevel, "Filtering", (threshold != STS::zero())); + + const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as(pL.get("aggregation: Dirichlet threshold"))); + + // NOTE: We don't support signed classical RS or SA with cut drop at present + TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalRS && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical aggregation"); + TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalSA && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical sa aggregation"); + + GO numDropped = 0, numTotal = 0; + std::string graphType = "unamalgamated"; // for description purposes only + + /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme. + BlockSize is the number of storage blocks that must kept together during the amalgamation process. + + Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold: + + numPDEs = BlockSize * storageblocksize. + + If numPDEs==1 + Matrix is point storage (classical CRS storage). storageblocksize=1 and BlockSize=1 + No other values makes sense. + + If numPDEs>1 + If matrix uses point storage, then storageblocksize=1 and BlockSize=numPDEs. + If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n. + Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested. + */ + TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); + const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); + /************************** RS or SA-style Classical Dropping (and variants) **************************/ if (algo == "classical") { if (predrop_ == null) { @@ -506,7 +2155,7 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { size_t nnz = A->getNumEntriesInLocalRow(row); bool rowIsDirichlet = boundaryNodes[row]; ArrayView indices; @@ -573,11 +2222,11 @@ void CoalesceDropFactory::Build(Level rows(row + 1) = realnnz; } } else { - /* Cut Algorithm */ + /* Cut Algorithm */ // CMS using DropTol = Details::DropTol; std::vector drop_vec; - drop_vec.reserve(nnz); + drop_vec.reserve(nnz); const real_type zero = Teuchos::ScalarTraits::zero(); const real_type one = Teuchos::ScalarTraits::one(); LO rownnz = 0; @@ -1594,7 +3243,7 @@ void CoalesceDropFactory::Build(Level } // if (doExperimentalWrap) ... else ... -} // Build +} // BuildKokkos template void CoalesceDropFactory::MergeRows(const Matrix& A, const LO row, Array& cols, const Array& translation) const { From 105e33e71f5ab985b6559fcc047b4b596336efb1 Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Mon, 22 Jul 2024 12:19:34 -0600 Subject: [PATCH 02/25] MueLu: Cut Drop Memory Optimization DropTol structure in algorithm replaced with new, smaller DropTolKokkos structure. Computations are now done on the fly. Code passes current unit tests. No significant change in speed. Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 130 ++++++++++++------ 1 file changed, 86 insertions(+), 44 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index a8befaea592b..eeb3f91dbfd6 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -94,35 +94,48 @@ namespace MueLu { namespace Details { template struct DropTol { - KOKKOS_INLINE_FUNCTION //NEW DropTol() = default; - KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol const&) = default; - KOKKOS_INLINE_FUNCTION //NEW DropTol(DropTol&&) = default; DropTol& operator=(DropTol const&) = default; DropTol& operator=(DropTol&&) = default; - KOKKOS_INLINE_FUNCTION //NEW DropTol(real_type val_, real_type diag_, LO col_, bool drop_) : val{val_} , diag{diag_} , col{col_} , drop{drop_} {} - real_type val{0}; - real_type diag{0}; - LO col{-1}; - //NEW Can't run these host functions on device - //real_type val{Teuchos::ScalarTraits::zero()}; - //real_type diag{Teuchos::ScalarTraits::zero()}; - //LO col{Teuchos::OrdinalTraits::invalid()}; + real_type val{Teuchos::ScalarTraits::zero()}; + real_type diag{Teuchos::ScalarTraits::zero()}; + LO col{Teuchos::OrdinalTraits::invalid()}; bool drop{true}; // CMS: Auxillary information for debugging info // real_type aux_val {Teuchos::ScalarTraits::nan()}; }; + +template +struct DropTolKokkos { + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos() = default; + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos(DropTolKokkos const&) = default; + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos(DropTolKokkos&&) = default; + + DropTolKokkos& operator=(DropTolKokkos const&) = default; + DropTolKokkos& operator=(DropTolKokkos&&) = default; + + KOKKOS_INLINE_FUNCTION //NEW + DropTolKokkos(LO col_, bool drop_) + : col{col_} + , drop{drop_} {} + + LO col{-1}; + LO drop{true}; +}; } // namespace Details template @@ -767,7 +780,7 @@ void CoalesceDropFactory::Build(Level using ExecSpace = typename Node::execution_space; using TeamPol = Kokkos::TeamPolicy; using TeamMem = typename TeamPol::member_type; - using DropTol = Details::DropTol; + using DropTolKokkos = Details::DropTolKokkos; //move from host to device ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); @@ -779,7 +792,7 @@ void CoalesceDropFactory::Build(Level int algorithm = classicalAlgo; Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); //stackedTimer->stop("init"); //stackedTimer->start("loop"); @@ -790,74 +803,103 @@ void CoalesceDropFactory::Build(Level size_t n = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + //find magnitudes Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) { LO col = rowView.colidx(colID); if(row == col) { - drop_view(colID) = DropTol(0, 1, colID, false); + drop_view(colID) = DropTolKokkos(colID, true); count++; } //Don't aggregate boundaries else if(!boundaryNodesDevice(colID)) { - typename STS::magnitudeType aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(col) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| - typename STS::magnitudeType aij = static_cast(std::fabs(static_cast(rowView.value(colID) * rowView.value(colID)))); // |a_i j|^2 - drop_view(colID) = DropTol(aij, aiiajj, colID, false); + drop_view(colID) = DropTolKokkos(colID, false); count++; } }, n); + + size_t dropStart = n; if (algorithm == unscaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { - return a.val > b.val; + Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { + if(x.drop || y.drop) { + return x.drop < y.drop; + } + else { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + return x_aij > y_aij; + } }); //find index where dropping starts - size_t dropStart; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { auto const& x = drop_view(i - 1); auto const& y = drop_view(i); - auto a = x.val; - auto b = y.val; - if(a > realThreshold * b) { + typename STS::magnitudeType x_aij = 0; + typename STS::magnitudeType y_aij = 0; + if(!x.drop) { + x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + } + if(!y.drop) { + y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + } + + if(x_aij > realThreshold * y_aij) { if(i < min) { min = i; } } }, Kokkos::Min(dropStart)); - - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(i).drop = true; - }); - } } else if (algorithm == scaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; + Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { + if(x.drop || y.drop) { + return x.drop < y.drop; + } + else { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + return x_aij / x_aiiajj > y_aij / y_aiiajj; + } }); + //find index where dropping starts - size_t dropStart; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { auto const& x = drop_view(i - 1); auto const& y = drop_view(i); - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if(a > realThreshold * b) { + typename STS::magnitudeType x_val = 0; + typename STS::magnitudeType y_val = 0; + if(!x.drop) { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + x_val = x_aij / x_aiiajj; + } + if(!y.drop) { + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + y_val = y_aij / y_aiiajj; + } + + if(x_val > realThreshold * y_val) { if(i < min) { min = i; } } }, Kokkos::Min(dropStart)); - - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(i).drop = true; - }); - } } - Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) { - return a.col < b.col; + + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(i).drop = true; + }); + } + + Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTolKokkos const& a, DropTolKokkos const& b) { + return a.col < b.col; }); - + LO rownnz = 0; GO rowDropped = 0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { From cdee728089dcf993f67cf194dbc51575a4a766e5 Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Mon, 22 Jul 2024 19:00:22 -0600 Subject: [PATCH 03/25] MueLu: Sorting Now Resembles numpy.argsort Per Christian's request. DropTolKokkos structure removed and replaced with view indices and view of drop flags. ORIGINAL code removed. BuildKokkos removed. Removed commented out timers. Added comments. Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_decl.hpp | 1 - .../MueLu_CoalesceDropFactory_def.hpp | 1736 +---------------- 2 files changed, 53 insertions(+), 1684 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp index db5e9a291313..96b5e778f6bc 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp @@ -160,7 +160,6 @@ class CoalesceDropFactory : public SingleLevelFactoryBase { //@} void Build(Level& currentLevel) const; // Build - void BuildKokkos(Level& currentLevel) const; private: // pre-drop function diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index eeb3f91dbfd6..da606ab20ff6 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -61,8 +61,8 @@ #include -#include //NEW -#include //NEW +#include +#include #include "MueLu_CoalesceDropFactory_decl.hpp" #include "MueLu_AmalgamationFactory.hpp" @@ -115,27 +115,6 @@ struct DropTol { // CMS: Auxillary information for debugging info // real_type aux_val {Teuchos::ScalarTraits::nan()}; }; - -template -struct DropTolKokkos { - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos() = default; - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos(DropTolKokkos const&) = default; - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos(DropTolKokkos&&) = default; - - DropTolKokkos& operator=(DropTolKokkos const&) = default; - DropTolKokkos& operator=(DropTolKokkos&&) = default; - - KOKKOS_INLINE_FUNCTION //NEW - DropTolKokkos(LO col_, bool drop_) - : col{col_} - , drop{drop_} {} - - LO col{-1}; - LO drop{true}; -}; } // namespace Details template @@ -529,180 +508,6 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; -#define NEW -#ifdef ORIGINAL - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - size_t nnz = A->getNumEntriesInLocalRow(row); - bool rowIsDirichlet = boundaryNodes[row]; - ArrayView indices; - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - if (classicalAlgo == defaultAlgo) { - // FIXME the current predrop function uses the following - // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) - // FIXME but the threshold doesn't take into account the rows' diagonal entries - // FIXME For now, hardwiring the dropping in here - - LO rownnz = 0; - if (useSignedClassicalRS) { - // Signed classical RS style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); - MT neg_aij = -STS::real(vals[colID]); - /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], - g_block_id.is_null() ? -1 : g_block_id[row], - g_block_id.is_null() ? -1 : g_block_id[col], - neg_aij, max_neg_aik);*/ - if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { - columns[realnnz++] = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } else if (useSignedClassicalSA) { - // Signed classical SA style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - - bool is_nonpositive = STS::real(vals[colID]) <= 0; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 - /* - if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], - vals[colID],aij, aiiajj); - */ - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows[row + 1] = realnnz; - } else { - // Standard abs classical - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } - } else { - /* Cut Algorithm */ - // CMS - using DropTol = Details::DropTol; - std::vector drop_vec; - drop_vec.reserve(nnz); - const real_type zero = Teuchos::ScalarTraits::zero(); - const real_type one = Teuchos::ScalarTraits::one(); - LO rownnz = 0; - // NOTE: This probably needs to be fixed for rowsum - - // find magnitudes - for (LO colID = 0; colID < (LO)nnz; colID++) { - LO col = indices[colID]; - if (row == col) { - drop_vec.emplace_back(zero, one, colID, false); - continue; - } - - // Don't aggregate boundaries - if (boundaryNodes[colID]) continue; - typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - typename STS::magnitudeType aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - drop_vec.emplace_back(aij, aiiajj, colID, false); - } - - const size_t n = drop_vec.size(); - - if (classicalAlgo == unscaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val > b.val; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val; - auto b = y.val; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } else if (classicalAlgo == scaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; - }); - bool drop = false; - // printf("[%d] Scaled Cut: ",(int)row); - // printf("%3d(%4s) ",indices[drop_vec[0].col],"keep"); - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if (a > realThreshold * b) { - drop = true; - -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - // printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep"); - } - drop_vec[i].drop = drop; - } - // printf("\n"); - } - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.col < b.col; - }); - - for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { - LO col = indices[drop_vec[idxID].col]; - // don't drop diagonal - if (row == col) { - columns[realnnz++] = col; - rownnz++; - continue; - } - - if (!drop_vec[idxID].drop) { - columns[realnnz++] = col; - rownnz++; - } else { - numDropped++; - } - } - // CMS - rows[row + 1] = realnnz; - } - } // end for row -#endif - -#ifdef NEW if(classicalAlgo == defaultAlgo) { SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { @@ -772,15 +577,11 @@ void CoalesceDropFactory::Build(Level } } // end for row } - else { //NEW START - //auto stackedTimer = rcp(new Teuchos::StackedTimer("timer")); - //Teuchos::TimeMonitor::setStackedTimer(stackedTimer); - //stackedTimer->start("init"); + else { SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); using ExecSpace = typename Node::execution_space; using TeamPol = Kokkos::TeamPolicy; using TeamMem = typename TeamPol::member_type; - using DropTolKokkos = Details::DropTolKokkos; //move from host to device ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); @@ -792,10 +593,9 @@ void CoalesceDropFactory::Build(Level int algorithm = classicalAlgo; Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); - //stackedTimer->stop("init"); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + auto index_views = Kokkos::View("index_views", A_device.nnz()); - //stackedTimer->start("loop"); Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { LO row = teamMember.league_rank(); auto rowView = A_device.row(row); @@ -803,45 +603,52 @@ void CoalesceDropFactory::Build(Level size_t n = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - + auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + //find magnitudes - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { + index_view(colID) = colID; LO col = rowView.colidx(colID); + //ignore diagonals for now, they are checked again later if(row == col) { - drop_view(colID) = DropTolKokkos(colID, true); + drop_view(colID) = true; count++; } //Don't aggregate boundaries - else if(!boundaryNodesDevice(colID)) { - drop_view(colID) = DropTolKokkos(colID, false); + else if(boundaryNodesDevice(colID)) { + drop_view(colID) = true; + } + else { + drop_view(colID) = false; count++; } }, n); size_t dropStart = n; if (algorithm == unscaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { - if(x.drop || y.drop) { - return x.drop < y.drop; + //push diagonals and boundaries to the right, sort everything else by aij on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); } else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); return x_aij > y_aij; } }); //find index where dropping starts Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = drop_view(i - 1); - auto const& y = drop_view(i); + auto const& x = index_view(i - 1); + auto const& y = index_view(i); typename STS::magnitudeType x_aij = 0; typename STS::magnitudeType y_aij = 0; - if(!x.drop) { - x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 + if(!drop_view(x)) { + x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); } - if(!y.drop) { - y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 + if(!drop_view(y)) { + y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); } if(x_aij > realThreshold * y_aij) { @@ -851,34 +658,34 @@ void CoalesceDropFactory::Build(Level } }, Kokkos::Min(dropStart)); } else if (algorithm == scaled_cut) { - Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) { - if(x.drop || y.drop) { - return x.drop < y.drop; + //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); } else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); return x_aij / x_aiiajj > y_aij / y_aiiajj; } }); - //find index where dropping starts Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = drop_view(i - 1); - auto const& y = drop_view(i); + auto const& x = index_view(i - 1); + auto const& y = index_view(i); typename STS::magnitudeType x_val = 0; typename STS::magnitudeType y_val = 0; - if(!x.drop) { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x.col) * rowView.value(x.col)))); // |a_i j|^2 - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + if(!drop_view(x)) { + typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); + typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); x_val = x_aij / x_aiiajj; } - if(!y.drop) { - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y.col) * rowView.value(y.col)))); // |a_i j|^2 - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row)))); // eps^2*|a _ii|*|a_jj| + if(!drop_view(y)) { + typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); + typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); y_val = y_aij / y_aiiajj; } @@ -890,22 +697,19 @@ void CoalesceDropFactory::Build(Level }, Kokkos::Min(dropStart)); } + //drop everything to the right of where values stop passing threshold if(dropStart < n) { Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(i).drop = true; + drop_view(index_view(i)) = true; }); } - Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTolKokkos const& a, DropTolKokkos const& b) { - return a.col < b.col; - }); - LO rownnz = 0; GO rowDropped = 0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { LO col = rowView.colidx(idxID); //don't drop diagonal - if(row == col || !drop_view(idxID).drop) { + if(row == col || !drop_view(idxID)) { keep++; } else { @@ -913,1459 +717,25 @@ void CoalesceDropFactory::Build(Level drop++; } }, rownnz, rowDropped); + globalnnz += rownnz; totalDropped += rowDropped; rownnzView(row) = rownnz; }, realnnz, numDropped); - //stackedTimer->stop("loop"); - - //stackedTimer->start("remove"); - + + //update column indices so that kept indices are aligned to the left for subview that happens later on auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); Kokkos::deep_copy(columns, columnsDevice); - //stackedTimer->stop("remove"); - - //update row indices - //stackedTimer->start("scan"); + //update row indices by adding up new # of nnz in each row auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { partial_sum += rownnzView(i); if(is_final) rowsDevice(i+1) = partial_sum; }); Kokkos::deep_copy(rows, rowsDevice); - //stackedTimer->stop("scan"); - - //stackedTimer->stop("timer"); - //stackedTimer->report(std::cout, Teuchos::DefaultComm::getComm()); - } //NEW END -#endif - - numTotal = A->getLocalNumEntries(); - - if (aggregationMayCreateDirichlet) { - // If the only element remaining after filtering is diagonal, mark node as boundary - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - if (rows[row + 1] - rows[row] <= 1) - boundaryNodes[row] = true; - } - } - - RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), A->getRowMap(), A->getColMap(), "thresholded graph of A")); - graph->SetBoundaryNodeMap(boundaryNodes); - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < boundaryNodes.size(); ++i) - if (boundaryNodes(i)) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", 1); - - // If we're doing signed classical, we might want to block-diagonalize *after* the dropping - if (generateColoringGraph) { - RCP colorGraph; - RCP importer = A->getCrsGraph()->getImporter(); - BlockDiagonalizeGraph(graph, ghostedBlockNumber, colorGraph, importer); - Set(currentLevel, "Coloring Graph", colorGraph); - // #define CMS_DUMP -#ifdef CMS_DUMP - { - Xpetra::IO::Write("m_regular_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(graph)->GetCrsGraph()); - Xpetra::IO::Write("m_color_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast(colorGraph)->GetCrsGraph()); - // int rank = graph->GetDomainMap()->getComm()->getRank(); - // { - // std::ofstream ofs(std::string("m_color_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); - // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); - // colorGraph->print(*fancy,Debug); - // } - // { - // std::ofstream ofs(std::string("m_regular_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out); - // RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); - // graph->print(*fancy,Debug); - // } - } -#endif - } // end generateColoringGraph - } else if (BlockSize > 1 && threshold == STS::zero()) { - // Case 3: Multiple DOF/node problem without dropping - const RCP rowMap = A->getRowMap(); - const RCP colMap = A->getColMap(); - - graphType = "amalgamated"; - - // build node row map (uniqueMap) and node column map (nonUniqueMap) - // the arrays rowTranslation and colTranslation contain the local node id - // given a local dof id. The data is calculated by the AmalgamationFactory and - // stored in the variable container "UnAmalgamationInfo" - RCP uniqueMap = amalInfo->getNodeRowMap(); - RCP nonUniqueMap = amalInfo->getNodeColMap(); - Array rowTranslation = *(amalInfo->getRowTranslation()); - Array colTranslation = *(amalInfo->getColTranslation()); - - // get number of local nodes - LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); - - // Allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); - Kokkos::deep_copy(amalgBoundaryNodes, false); - - // Detect and record rows that correspond to Dirichlet boundary conditions - // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size - // TODO the array one bigger than the number of local rows, and the last entry can - // TODO hold the actual number of boundary nodes. Clever, huh? - ArrayRCP pointBoundaryNodes; - pointBoundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); - if (rowSumTol > 0.) - Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes); - - // extract striding information - LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) - LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) - LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map - if (A->IsView("stridedMaps") == true) { - Teuchos::RCP myMap = A->getRowMap("stridedMaps"); - Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); - blkSize = Teuchos::as(strMap->getFixedBlockSize()); - blkId = strMap->getStridedBlockId(); - if (blkId > -1) - blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); - } - - // loop over all local nodes - LO realnnz = 0; - rows(0) = 0; - Array indicesExtra; - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - indicesExtra.resize(0); - - // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet - // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). - // Therefore, looping over all dofs is fine here. We use blkPartSize as we work - // with local ids. - // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) - // node. - bool isBoundary = false; - if (pL.get("aggregation: greedy Dirichlet") == true) { - for (LO j = 0; j < blkPartSize; j++) { - if (pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = true; - break; - } - } - } else { - isBoundary = true; - for (LO j = 0; j < blkPartSize; j++) { - if (!pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = false; - break; - } - } - } - - // Merge rows of A - // The array indicesExtra contains local column node ids for the current local node "row" - if (!isBoundary) - MergeRows(*A, row, indicesExtra, colTranslation); - else - indicesExtra.push_back(row); - indices = indicesExtra; - numTotal += indices.size(); - - // add the local column node ids to the full columns array which - // contains the local column node ids for all local node rows - LO nnz = indices.size(), rownnz = 0; - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - columns(realnnz++) = col; - rownnz++; - } - - if (rownnz == 1) { - // If the only element remaining after filtering is diagonal, mark node as boundary - // FIXME: this should really be replaced by the following - // if (indices.size() == 1 && indices[0] == row) - // boundaryNodes[row] = true; - // We do not do it this way now because there is no framework for distinguishing isolated - // and boundary nodes in the aggregation algorithms - amalgBoundaryNodes[row] = true; - } - rows(row + 1) = realnnz; - } // for (LO row = 0; row < numRows; row++) - - RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes - << " agglomerated Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", blkSize); // full block size - - } else if (BlockSize > 1 && threshold != STS::zero()) { - // Case 4: Multiple DOF/node problem with dropping - const RCP rowMap = A->getRowMap(); - const RCP colMap = A->getColMap(); - graphType = "amalgamated"; - - // build node row map (uniqueMap) and node column map (nonUniqueMap) - // the arrays rowTranslation and colTranslation contain the local node id - // given a local dof id. The data is calculated by the AmalgamationFactory and - // stored in the variable container "UnAmalgamationInfo" - RCP uniqueMap = amalInfo->getNodeRowMap(); - RCP nonUniqueMap = amalInfo->getNodeColMap(); - Array rowTranslation = *(amalInfo->getRowTranslation()); - Array colTranslation = *(amalInfo->getColTranslation()); - - // get number of local nodes - LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); - - // Allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); - Kokkos::deep_copy(amalgBoundaryNodes, false); - - // Detect and record rows that correspond to Dirichlet boundary conditions - // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size - // TODO the array one bigger than the number of local rows, and the last entry can - // TODO hold the actual number of boundary nodes. Clever, huh? - auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); - - // extract striding information - LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) - LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map) - LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map - if (A->IsView("stridedMaps") == true) { - Teuchos::RCP myMap = A->getRowMap("stridedMaps"); - Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); - blkSize = Teuchos::as(strMap->getFixedBlockSize()); - blkId = strMap->getStridedBlockId(); - if (blkId > -1) - blkPartSize = Teuchos::as(strMap->getStridingData()[blkId]); - } - - // extract diagonal data for dropping strategy - RCP ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - const ArrayRCP ghostedDiagVals = ghostedDiag->getData(0); - - // loop over all local nodes - LO realnnz = 0; - rows[0] = 0; - Array indicesExtra; - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - indicesExtra.resize(0); - - // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet - // Note, that pointBoundaryNodes lives on the dofmap (and not the node map). - // Therefore, looping over all dofs is fine here. We use blkPartSize as we work - // with local ids. - // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet) - // node. - bool isBoundary = false; - if (pL.get("aggregation: greedy Dirichlet") == true) { - for (LO j = 0; j < blkPartSize; j++) { - if (pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = true; - break; - } - } - } else { - isBoundary = true; - for (LO j = 0; j < blkPartSize; j++) { - if (!pointBoundaryNodes[row * blkPartSize + j]) { - isBoundary = false; - break; - } - } - } - - // Merge rows of A - // The array indicesExtra contains local column node ids for the current local node "row" - if (!isBoundary) - MergeRowsWithDropping(*A, row, ghostedDiagVals, threshold, indicesExtra, colTranslation); - else - indicesExtra.push_back(row); - indices = indicesExtra; - numTotal += indices.size(); - - // add the local column node ids to the full columns array which - // contains the local column node ids for all local node rows - LO nnz = indices.size(), rownnz = 0; - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - columns[realnnz++] = col; - rownnz++; - } - - if (rownnz == 1) { - // If the only element remaining after filtering is diagonal, mark node as boundary - // FIXME: this should really be replaced by the following - // if (indices.size() == 1 && indices[0] == row) - // boundaryNodes[row] = true; - // We do not do it this way now because there is no framework for distinguishing isolated - // and boundary nodes in the aggregation algorithms - amalgBoundaryNodes[row] = true; - } - rows[row + 1] = realnnz; - } // for (LO row = 0; row < numRows; row++) - // columns.resize(realnnz); - - RCP graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes - << " agglomerated Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", blkSize); // full block size - } - - } else if (algo == "distance laplacian") { - LO blkSize = A->GetFixedBlockSize(); - GO indexBase = A->getRowMap()->getIndexBase(); - // [*0*] : FIXME - // ap: somehow, if I move this line to [*1*], Belos throws an error - // I'm not sure what's going on. Do we always have to Get data, if we did - // DeclareInput for it? - // RCP Coords = Get< RCP >(currentLevel, "Coordinates"); - - // Detect and record rows that correspond to Dirichlet boundary conditions - // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size - // TODO the array one bigger than the number of local rows, and the last entry can - // TODO hold the actual number of boundary nodes. Clever, huh? - auto pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes); - - if ((blkSize == 1) && (threshold == STS::zero())) { - // Trivial case: scalar problem, no dropping. Can return original graph - RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); - graph->SetBoundaryNodeMap(pointBoundaryNodes); - graphType = "unamalgamated"; - numTotal = A->getLocalNumEntries(); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < pointBoundaryNodes.size(); ++i) - if (pointBoundaryNodes(i)) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "DofsPerNode", blkSize); - Set(currentLevel, "Graph", graph); - - } else { - // ap: We make quite a few assumptions here; general case may be a lot different, - // but much much harder to implement. We assume that: - // 1) all maps are standard maps, not strided maps - // 2) global indices of dofs in A are related to dofs in coordinates in a simple arithmetic - // way: rows i*blkSize, i*blkSize+1, ..., i*blkSize + (blkSize-1) correspond to node i - // - // NOTE: Potentially, some of the code below could be simplified with UnAmalgamationInfo, - // but as I totally don't understand that code, here is my solution - - // [*1*]: see [*0*] - - // Check that the number of local coordinates is consistent with the #rows in A - TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() / blkSize != Coords->getLocalLength(), Exceptions::Incompatible, - "Coordinate vector length (" << Coords->getLocalLength() << ") is incompatible with number of rows in A (" << A->getRowMap()->getLocalNumElements() << ") by modulo block size (" << blkSize << ")."); - - const RCP colMap = A->getColMap(); - RCP uniqueMap, nonUniqueMap; - Array colTranslation; - if (blkSize == 1) { - uniqueMap = A->getRowMap(); - nonUniqueMap = A->getColMap(); - graphType = "unamalgamated"; - - } else { - uniqueMap = Coords->getMap(); - TEUCHOS_TEST_FOR_EXCEPTION(uniqueMap->getIndexBase() != indexBase, Exceptions::Incompatible, - "Different index bases for matrix and coordinates"); - - AmalgamationFactory::AmalgamateMap(*(A->getColMap()), *A, nonUniqueMap, colTranslation); - - graphType = "amalgamated"; - } - LO numRows = Teuchos::as(uniqueMap->getLocalNumElements()); - - RCP ghostedCoords; - RCP ghostedLaplDiag; - Teuchos::ArrayRCP ghostedLaplDiagData; - if (threshold != STS::zero()) { - // Get ghost coordinates - RCP importer; - { - SubFactoryMonitor m1(*this, "Import construction", currentLevel); - if (blkSize == 1 && realA->getCrsGraph()->getImporter() != Teuchos::null) { - GetOStream(Warnings1) << "Using existing importer from matrix graph" << std::endl; - importer = realA->getCrsGraph()->getImporter(); - } else { - GetOStream(Warnings0) << "Constructing new importer instance" << std::endl; - importer = ImportFactory::Build(uniqueMap, nonUniqueMap); - } - } // subtimer - ghostedCoords = Xpetra::MultiVectorFactory::Build(nonUniqueMap, Coords->getNumVectors()); - { - SubFactoryMonitor m1(*this, "Coordinate import", currentLevel); - ghostedCoords->doImport(*Coords, *importer, Xpetra::INSERT); - } // subtimer - - // Construct Distance Laplacian diagonal - RCP localLaplDiag = VectorFactory::Build(uniqueMap); - Array indicesExtra; - Teuchos::Array> coordData; - if (threshold != STS::zero()) { - const size_t numVectors = ghostedCoords->getNumVectors(); - coordData.reserve(numVectors); - for (size_t j = 0; j < numVectors; j++) { - Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); - coordData.push_back(tmpData); - } - } - { - SubFactoryMonitor m1(*this, "Laplacian local diagonal", currentLevel); - ArrayRCP localLaplDiagData = localLaplDiag->getDataNonConst(0); - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - - if (blkSize == 1) { - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - } else { - // Merge rows of A - indicesExtra.resize(0); - MergeRows(*A, row, indicesExtra, colTranslation); - indices = indicesExtra; - } - - LO nnz = indices.size(); - bool haveAddedToDiag = false; - for (LO colID = 0; colID < nnz; colID++) { - const LO col = indices[colID]; - - if (row != col) { - if (use_dlap_weights == SINGLE_WEIGHTS) { - /*printf("[%d,%d] Unweighted Distance = %6.4e Weighted Distance = %6.4e\n",row,col, - MueLu::Utilities::Distance2(coordData, row, col), - MueLu::Utilities::Distance2(dlap_weights(),coordData, row, col));*/ - localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); - } else if (use_dlap_weights == BLOCK_WEIGHTS) { - int block_id = row % interleaved_blocksize; - int block_start = block_id * interleaved_blocksize; - localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); - } else { - // printf("[%d,%d] Unweighted Distance = %6.4e\n",row,col,MueLu::Utilities::Distance2(coordData, row, col)); - localLaplDiagData[row] += STS::one() / MueLu::Utilities::Distance2(coordData, row, col); - } - haveAddedToDiag = true; - } - } - // Deal with the situation where boundary conditions have only been enforced on rows, but not on columns. - // We enforce dropping of these entries by assigning a very large number to the diagonal entries corresponding to BCs. - if (!haveAddedToDiag) - localLaplDiagData[row] = STS::rmax(); - } - } // subtimer - { - SubFactoryMonitor m1(*this, "Laplacian distributed diagonal", currentLevel); - ghostedLaplDiag = VectorFactory::Build(nonUniqueMap); - ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT); - ghostedLaplDiagData = ghostedLaplDiag->getDataNonConst(0); - } // subtimer - - } else { - GetOStream(Runtime0) << "Skipping distance laplacian construction due to 0 threshold" << std::endl; - } - - // NOTE: ghostedLaplDiagData might be zero if we don't actually calculate the laplacian - - // allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", numRows + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - -#ifdef HAVE_MUELU_DEBUG - // DEBUGGING - for (LO i = 0; i < (LO)columns.size(); i++) columns[i] = -666; -#endif - - // Extra array for if we're allowing symmetrization with cutting - ArrayRCP rows_stop; - bool use_stop_array = threshold != STS::zero() && distanceLaplacianAlgo == scaled_cut_symmetric; - if (use_stop_array) - // rows_stop = typename LWGraph::row_type::non_const_type("rows_stop", numRows); - rows_stop.resize(numRows); - - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows); - Kokkos::deep_copy(amalgBoundaryNodes, false); - - LO realnnz = 0; - rows(0) = 0; - - Array indicesExtra; - { - SubFactoryMonitor m1(*this, "Laplacian dropping", currentLevel); - Teuchos::Array> coordData; - if (threshold != STS::zero()) { - const size_t numVectors = ghostedCoords->getNumVectors(); - coordData.reserve(numVectors); - for (size_t j = 0; j < numVectors; j++) { - Teuchos::ArrayRCP tmpData = ghostedCoords->getData(j); - coordData.push_back(tmpData); - } - } - - ArrayView vals; // CMS hackery - for (LO row = 0; row < numRows; row++) { - ArrayView indices; - indicesExtra.resize(0); - bool isBoundary = false; - - if (blkSize == 1) { - // ArrayView vals;//CMS uncomment - A->getLocalRowView(row, indices, vals); - isBoundary = pointBoundaryNodes[row]; - } else { - // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet - for (LO j = 0; j < blkSize; j++) { - if (!pointBoundaryNodes[row * blkSize + j]) { - isBoundary = false; - break; - } - } - - // Merge rows of A - if (!isBoundary) - MergeRows(*A, row, indicesExtra, colTranslation); - else - indicesExtra.push_back(row); - indices = indicesExtra; - } - numTotal += indices.size(); - - LO nnz = indices.size(), rownnz = 0; - - if (use_stop_array) { - rows(row + 1) = rows(row) + nnz; - realnnz = rows(row); - } - - if (threshold != STS::zero()) { - // default - if (distanceLaplacianAlgo == defaultAlgo) { - /* Standard Distance Laplacian */ - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - - if (row == col) { - columns(realnnz++) = col; - rownnz++; - continue; - } - - // We do not want the distance Laplacian aggregating boundary nodes - if (isBoundary) continue; - - SC laplVal; - if (use_dlap_weights == SINGLE_WEIGHTS) { - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); - } else if (use_dlap_weights == BLOCK_WEIGHTS) { - int block_id = row % interleaved_blocksize; - int block_start = block_id * interleaved_blocksize; - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); - } else { - laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); - } - real_type aiiajj = STS::magnitude(realThreshold * realThreshold * ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); - real_type aij = STS::magnitude(laplVal * laplVal); - - if (aij > aiiajj) { - columns(realnnz++) = col; - rownnz++; - } else { - numDropped++; - } - } - } else { - /* Cut Algorithm */ - using DropTol = Details::DropTol; - std::vector drop_vec; - drop_vec.reserve(nnz); - const real_type zero = Teuchos::ScalarTraits::zero(); - const real_type one = Teuchos::ScalarTraits::one(); - - // find magnitudes - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - - if (row == col) { - drop_vec.emplace_back(zero, one, colID, false); - continue; - } - // We do not want the distance Laplacian aggregating boundary nodes - if (isBoundary) continue; - - SC laplVal; - if (use_dlap_weights == SINGLE_WEIGHTS) { - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(), coordData, row, col); - } else if (use_dlap_weights == BLOCK_WEIGHTS) { - int block_id = row % interleaved_blocksize; - int block_start = block_id * interleaved_blocksize; - laplVal = STS::one() / MueLu::Utilities::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col); - } else { - laplVal = STS::one() / MueLu::Utilities::Distance2(coordData, row, col); - } - - real_type aiiajj = STS::magnitude(ghostedLaplDiagData[row] * ghostedLaplDiagData[col]); - real_type aij = STS::magnitude(laplVal * laplVal); - - drop_vec.emplace_back(aij, aiiajj, colID, false); - } - - const size_t n = drop_vec.size(); - - if (distanceLaplacianAlgo == unscaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val > b.val; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val; - auto b = y.val; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } else if (distanceLaplacianAlgo == scaled_cut || distanceLaplacianAlgo == scaled_cut_symmetric) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } - - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.col < b.col; - }); - - for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { - LO col = indices[drop_vec[idxID].col]; - - // don't drop diagonal - if (row == col) { - columns(realnnz++) = col; - rownnz++; - // printf("(%d,%d) KEEP %13s matrix = %6.4e\n",row,row,"DIAGONAL",drop_vec[idxID].aux_val); - continue; - } - - if (!drop_vec[idxID].drop) { - columns(realnnz++) = col; - // printf("(%d,%d) KEEP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); - rownnz++; - } else { - // printf("(%d,%d) DROP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val); - numDropped++; - } - } - } - } else { - // Skip laplace calculation and threshold comparison for zero threshold - for (LO colID = 0; colID < nnz; colID++) { - LO col = indices[colID]; - columns(realnnz++) = col; - rownnz++; - } - } - - if (rownnz == 1) { - // If the only element remaining after filtering is diagonal, mark node as boundary - // FIXME: this should really be replaced by the following - // if (indices.size() == 1 && indices[0] == row) - // boundaryNodes[row] = true; - // We do not do it this way now because there is no framework for distinguishing isolated - // and boundary nodes in the aggregation algorithms - amalgBoundaryNodes[row] = true; - } - - if (use_stop_array) - rows_stop[row] = rownnz + rows[row]; - else - rows[row + 1] = realnnz; - } // for (LO row = 0; row < numRows; row++) - - } // subtimer - - if (use_stop_array) { - // Do symmetrization of the cut matrix - // NOTE: We assume nested row/column maps here - for (LO row = 0; row < numRows; row++) { - for (LO colidx = rows[row]; colidx < rows_stop[row]; colidx++) { - LO col = columns[colidx]; - if (col >= numRows) continue; - - bool found = false; - for (LO t_col = rows(col); !found && t_col < rows_stop[col]; t_col++) { - if (columns[t_col] == row) - found = true; - } - // We didn't find the transpose buddy, so let's symmetrize, unless we'd be symmetrizing - // into a Dirichlet unknown. In that case don't. - if (!found && !pointBoundaryNodes[col] && Teuchos::as(rows_stop[col]) < rows[col + 1]) { - LO new_idx = rows_stop[col]; - // printf("(%d,%d) SYMADD entry\n",col,row); - columns[new_idx] = row; - rows_stop[col]++; - numDropped--; - } - } - } - - // Condense everything down - LO current_start = 0; - for (LO row = 0; row < numRows; row++) { - LO old_start = current_start; - for (LO col = rows(row); col < rows_stop[row]; col++) { - if (current_start != col) { - columns(current_start) = columns(col); - } - current_start++; - } - rows[row] = old_start; - } - rows(numRows) = realnnz = current_start; - } - - RCP graph; - { - SubFactoryMonitor m1(*this, "Build amalgamated graph", currentLevel); - graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A")); - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - } // subtimer - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " agglomerated Dirichlet nodes" - << " using threshold " << dirichletThreshold << std::endl; - } - - Set(currentLevel, "Graph", graph); - Set(currentLevel, "DofsPerNode", blkSize); - } - } - - if ((GetVerbLevel() & Statistics1) && !(A->GetFixedBlockSize() > 1 && threshold != STS::zero())) { - RCP> comm = A->getRowMap()->getComm(); - GO numGlobalTotal, numGlobalDropped; - MueLu_sumAll(comm, numTotal, numGlobalTotal); - MueLu_sumAll(comm, numDropped, numGlobalDropped); - GetOStream(Statistics1) << "Number of dropped entries in " << graphType << " matrix graph: " << numGlobalDropped << "/" << numGlobalTotal; - if (numGlobalTotal != 0) - GetOStream(Statistics1) << " (" << 100 * Teuchos::as(numGlobalDropped) / Teuchos::as(numGlobalTotal) << "%)"; - GetOStream(Statistics1) << std::endl; - } - - } else { - // what Tobias has implemented - - SC threshold = as(pL.get("aggregation: drop tol")); - // GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - GetOStream(Runtime0) << "algorithm = \"" - << "failsafe" - << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - Set(currentLevel, "Filtering", (threshold != STS::zero())); - - RCP rowMap = A->getRowMap(); - RCP colMap = A->getColMap(); - - LO blockdim = 1; // block dim for fixed size blocks - GO indexBase = rowMap->getIndexBase(); // index base of maps - GO offset = 0; - - // 1) check for blocking/striding information - if (A->IsView("stridedMaps") && - Teuchos::rcp_dynamic_cast(A->getRowMap("stridedMaps")) != Teuchos::null) { - Xpetra::viewLabel_t oldView = A->SwitchToView("stridedMaps"); // note: "stridedMaps are always non-overlapping (correspond to range and domain maps!) - RCP strMap = Teuchos::rcp_dynamic_cast(A->getRowMap()); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null, Exceptions::BadCast, "MueLu::CoalesceFactory::Build: cast to strided row map failed."); - blockdim = strMap->getFixedBlockSize(); - offset = strMap->getOffset(); - oldView = A->SwitchToView(oldView); - GetOStream(Statistics1) << "CoalesceDropFactory::Build():" - << " found blockdim=" << blockdim << " from strided maps. offset=" << offset << std::endl; - } else - GetOStream(Statistics1) << "CoalesceDropFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl; - - // 2) get row map for amalgamated matrix (graph of A) - // with same distribution over all procs as row map of A - RCP nodeMap = amalInfo->getNodeRowMap(); - GetOStream(Statistics1) << "CoalesceDropFactory: nodeMap " << nodeMap->getLocalNumElements() << "/" << nodeMap->getGlobalNumElements() << " elements" << std::endl; - - // 3) create graph of amalgamated matrix - RCP crsGraph = CrsGraphFactory::Build(nodeMap, A->getLocalMaxNumRowEntries() * blockdim); - - LO numRows = A->getRowMap()->getLocalNumElements(); - LO numNodes = nodeMap->getLocalNumElements(); - typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numNodes); - Kokkos::deep_copy(amalgBoundaryNodes, false); - const ArrayRCP numberDirichletRowsPerNode(numNodes, 0); // helper array counting the number of Dirichlet nodes associated with node - bool bIsDiagonalEntry = false; // boolean flag stating that grid==gcid - - // 4) do amalgamation. generate graph of amalgamated matrix - // Note, this code is much more inefficient than the leightwight implementation - // Most of the work has already been done in the AmalgamationFactory - for (LO row = 0; row < numRows; row++) { - // get global DOF id - GO grid = rowMap->getGlobalElement(row); - - // reinitialize boolean helper variable - bIsDiagonalEntry = false; - - // translate grid to nodeid - GO nodeId = AmalgamationFactory::DOFGid2NodeId(grid, blockdim, offset, indexBase); - - size_t nnz = A->getNumEntriesInLocalRow(row); - Teuchos::ArrayView indices; - Teuchos::ArrayView vals; - A->getLocalRowView(row, indices, vals); - - RCP> cnodeIds = Teuchos::rcp(new std::vector); // global column block ids - LO realnnz = 0; - for (LO col = 0; col < Teuchos::as(nnz); col++) { - GO gcid = colMap->getGlobalElement(indices[col]); // global column id - - if (vals[col] != STS::zero()) { - GO cnodeId = AmalgamationFactory::DOFGid2NodeId(gcid, blockdim, offset, indexBase); - cnodeIds->push_back(cnodeId); - realnnz++; // increment number of nnz in matrix row - if (grid == gcid) bIsDiagonalEntry = true; - } - } - - if (realnnz == 1 && bIsDiagonalEntry == true) { - LO lNodeId = nodeMap->getLocalElement(nodeId); - numberDirichletRowsPerNode[lNodeId] += 1; // increment Dirichlet row counter associated with lNodeId - if (numberDirichletRowsPerNode[lNodeId] == blockdim) // mark full Dirichlet nodes - amalgBoundaryNodes[lNodeId] = true; - } - - Teuchos::ArrayRCP arr_cnodeIds = Teuchos::arcp(cnodeIds); - - if (arr_cnodeIds.size() > 0) - crsGraph->insertGlobalIndices(nodeId, arr_cnodeIds()); - } - // fill matrix graph - crsGraph->fillComplete(nodeMap, nodeMap); - - // 5) create MueLu Graph object - RCP graph = rcp(new LWGraph(crsGraph, "amalgamated graph of A")); - - // Detect and record rows that correspond to Dirichlet boundary conditions - graph->SetBoundaryNodeMap(amalgBoundaryNodes); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i) - if (amalgBoundaryNodes(i)) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - - // 6) store results in Level - // graph->SetBoundaryNodeMap(gBoundaryNodeMap); - Set(currentLevel, "DofsPerNode", blockdim); - Set(currentLevel, "Graph", graph); - - } // if (doExperimentalWrap) ... else ... - -} // Build - -template -void CoalesceDropFactory::BuildKokkos(Level& currentLevel) const { - FactoryMonitor m(*this, "BuildKokkos", currentLevel); - - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType real_type; - typedef Xpetra::MultiVector RealValuedMultiVector; - typedef Xpetra::MultiVectorFactory RealValuedMultiVectorFactory; - - if (predrop_ != Teuchos::null) - GetOStream(Parameters0) << predrop_->description(); - - RCP realA = Get>(currentLevel, "A"); - RCP amalInfo = Get>(currentLevel, "UnAmalgamationInfo"); - const ParameterList& pL = GetParameterList(); - bool doExperimentalWrap = pL.get("lightweight wrap"); - - GetOStream(Parameters0) << "lightweight wrap = " << doExperimentalWrap << std::endl; - std::string algo = pL.get("aggregation: drop scheme"); - const bool aggregationMayCreateDirichlet = pL.get("aggregation: dropping may create Dirichlet"); - - RCP Coords; - RCP A; - - bool use_block_algorithm = false; - LO interleaved_blocksize = as(pL.get("aggregation: block diagonal: interleaved blocksize")); - bool useSignedClassicalRS = false; - bool useSignedClassicalSA = false; - bool generateColoringGraph = false; - - // NOTE: If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it - // in the block diagonalization). So we'll clobber the rowSumTol with -1.0 in this case - typename STS::magnitudeType rowSumTol = as(pL.get("aggregation: row sum drop tol")); - - RCP ghostedBlockNumber; - ArrayRCP g_block_id; - - if (algo == "distance laplacian") { - // Grab the coordinates for distance laplacian - Coords = Get>(currentLevel, "Coordinates"); - A = realA; - } else if (algo == "signed classical sa") { - useSignedClassicalSA = true; - algo = "classical"; - A = realA; - } else if (algo == "signed classical" || algo == "block diagonal colored signed classical" || algo == "block diagonal signed classical") { - useSignedClassicalRS = true; - // if(realA->GetFixedBlockSize() > 1) { - RCP BlockNumber = Get>(currentLevel, "BlockNumber"); - // Ghost the column block numbers if we need to - RCP importer = realA->getCrsGraph()->getImporter(); - if (!importer.is_null()) { - SubFactoryMonitor m1(*this, "Block Number import", currentLevel); - ghostedBlockNumber = Xpetra::VectorFactory::Build(importer->getTargetMap()); - ghostedBlockNumber->doImport(*BlockNumber, *importer, Xpetra::INSERT); - } else { - ghostedBlockNumber = BlockNumber; - } - g_block_id = ghostedBlockNumber->getData(0); - // } - if (algo == "block diagonal colored signed classical") - generateColoringGraph = true; - algo = "classical"; - A = realA; - - } else if (algo == "block diagonal") { - // Handle the "block diagonal" filtering and then leave - BlockDiagonalize(currentLevel, realA, false); - return; - } else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian") { - // Handle the "block diagonal" filtering, and then continue onward - use_block_algorithm = true; - RCP filteredMatrix = BlockDiagonalize(currentLevel, realA, true); - if (algo == "block diagonal distance laplacian") { - // We now need to expand the coordinates by the interleaved blocksize - RCP OldCoords = Get>(currentLevel, "Coordinates"); - if (OldCoords->getLocalLength() != realA->getLocalNumRows()) { - LO dim = (LO)OldCoords->getNumVectors(); - Coords = RealValuedMultiVectorFactory::Build(realA->getRowMap(), dim); - for (LO k = 0; k < dim; k++) { - ArrayRCP old_vec = OldCoords->getData(k); - ArrayRCP new_vec = Coords->getDataNonConst(k); - for (LO i = 0; i < (LO)OldCoords->getLocalLength(); i++) { - LO new_base = i * dim; - for (LO j = 0; j < interleaved_blocksize; j++) - new_vec[new_base + j] = old_vec[i]; - } - } - } else { - Coords = OldCoords; - } - algo = "distance laplacian"; - } else if (algo == "block diagonal classical") { - algo = "classical"; - } - // All cases - A = filteredMatrix; - rowSumTol = -1.0; - } else { - A = realA; - } - - // Distance Laplacian weights - Array dlap_weights = pL.get>("aggregation: distance laplacian directional weights"); - enum { NO_WEIGHTS = 0, - SINGLE_WEIGHTS, - BLOCK_WEIGHTS }; - int use_dlap_weights = NO_WEIGHTS; - if (algo == "distance laplacian") { - LO dim = (LO)Coords->getNumVectors(); - // If anything isn't 1.0 we need to turn on the weighting - bool non_unity = false; - for (LO i = 0; !non_unity && i < (LO)dlap_weights.size(); i++) { - if (dlap_weights[i] != 1.0) { - non_unity = true; - } - } - if (non_unity) { - LO blocksize = use_block_algorithm ? as(pL.get("aggregation: block diagonal: interleaved blocksize")) : 1; - if ((LO)dlap_weights.size() == dim) - use_dlap_weights = SINGLE_WEIGHTS; - else if ((LO)dlap_weights.size() == blocksize * dim) - use_dlap_weights = BLOCK_WEIGHTS; - else { - TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, - "length of 'aggregation: distance laplacian directional weights' must equal the coordinate dimension OR the coordinate dimension times the blocksize"); - } - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Using distance laplacian weights: " << dlap_weights << std::endl; - } - } - - // decide wether to use the fast-track code path for standard maps or the somewhat slower - // code path for non-standard maps - /*bool bNonStandardMaps = false; - if (A->IsView("stridedMaps") == true) { - Teuchos::RCP myMap = A->getRowMap("stridedMaps"); - Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); - TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap"); - if (strMap->getStridedBlockId() != -1 || strMap->getOffset() > 0) - bNonStandardMaps = true; - }*/ - - if (doExperimentalWrap) { - TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm"); - TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)"); - - SC threshold; - // If we're doing the ML-style halving of the drop tol at each level, we do that here. - if (pL.get("aggregation: use ml scaling of drop tol")) - threshold = pL.get("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID()); - else - threshold = as(pL.get("aggregation: drop tol")); - - std::string distanceLaplacianAlgoStr = pL.get("aggregation: distance laplacian algo"); - std::string classicalAlgoStr = pL.get("aggregation: classical algo"); - real_type realThreshold = STS::magnitude(threshold); // CMS: Rename this to "magnitude threshold" sometime - - //////////////////////////////////////////////////// - // Remove this bit once we are confident that cut-based dropping works. -#ifdef HAVE_MUELU_DEBUG - int distanceLaplacianCutVerbose = 0; -#endif -#ifdef DJS_READ_ENV_VARIABLES - if (getenv("MUELU_DROP_TOLERANCE_MODE")) { - distanceLaplacianAlgoStr = std::string(getenv("MUELU_DROP_TOLERANCE_MODE")); - } - - if (getenv("MUELU_DROP_TOLERANCE_THRESHOLD")) { - auto tmp = atoi(getenv("MUELU_DROP_TOLERANCE_THRESHOLD")); - realThreshold = 1e-4 * tmp; - } - -#ifdef HAVE_MUELU_DEBUG - if (getenv("MUELU_DROP_TOLERANCE_VERBOSE")) { - distanceLaplacianCutVerbose = atoi(getenv("MUELU_DROP_TOLERANCE_VERBOSE")); - } -#endif -#endif - //////////////////////////////////////////////////// - - enum decisionAlgoType { defaultAlgo, - unscaled_cut, - scaled_cut, - scaled_cut_symmetric }; - - decisionAlgoType distanceLaplacianAlgo = defaultAlgo; - decisionAlgoType classicalAlgo = defaultAlgo; - if (algo == "distance laplacian") { - if (distanceLaplacianAlgoStr == "default") - distanceLaplacianAlgo = defaultAlgo; - else if (distanceLaplacianAlgoStr == "unscaled cut") - distanceLaplacianAlgo = unscaled_cut; - else if (distanceLaplacianAlgoStr == "scaled cut") - distanceLaplacianAlgo = scaled_cut; - else if (distanceLaplacianAlgoStr == "scaled cut symmetric") - distanceLaplacianAlgo = scaled_cut_symmetric; - else - TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" << distanceLaplacianAlgoStr << "\""); - GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - } else if (algo == "classical") { - if (classicalAlgoStr == "default") - classicalAlgo = defaultAlgo; - else if (classicalAlgoStr == "unscaled cut") - classicalAlgo = unscaled_cut; - else if (classicalAlgoStr == "scaled cut") - classicalAlgo = scaled_cut; - else - TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: classical algo\" must be one of (default|unscaled cut|scaled cut), not \"" << classicalAlgoStr << "\""); - GetOStream(Runtime0) << "algorithm = \"" << algo << "\" classical algorithm = \"" << classicalAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - - } else - GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; - Set(currentLevel, "Filtering", (threshold != STS::zero())); - - const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as(pL.get("aggregation: Dirichlet threshold"))); - - // NOTE: We don't support signed classical RS or SA with cut drop at present - TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalRS && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical aggregation"); - TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalSA && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical sa aggregation"); - - GO numDropped = 0, numTotal = 0; - std::string graphType = "unamalgamated"; // for description purposes only - - /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme. - BlockSize is the number of storage blocks that must kept together during the amalgamation process. - - Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold: - - numPDEs = BlockSize * storageblocksize. - - If numPDEs==1 - Matrix is point storage (classical CRS storage). storageblocksize=1 and BlockSize=1 - No other values makes sense. - - If numPDEs>1 - If matrix uses point storage, then storageblocksize=1 and BlockSize=numPDEs. - If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n. - Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested. - */ - TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); - const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); - - /************************** RS or SA-style Classical Dropping (and variants) **************************/ - if (algo == "classical") { - if (predrop_ == null) { - // ap: this is a hack: had to declare predrop_ as mutable - predrop_ = rcp(new PreDropFunctionConstVal(threshold)); - } - - if (predrop_ != null) { - RCP predropConstVal = rcp_dynamic_cast(predrop_); - TEUCHOS_TEST_FOR_EXCEPTION(predropConstVal == Teuchos::null, Exceptions::BadCast, - "MueLu::CoalesceFactory::Build: cast to PreDropFunctionConstVal failed."); - // If a user provided a predrop function, it overwrites the XML threshold parameter - SC newt = predropConstVal->GetThreshold(); - if (newt != threshold) { - GetOStream(Warnings0) << "switching threshold parameter from " << threshold << " (list) to " << newt << " (user function" << std::endl; - threshold = newt; - } - } - // At this points we either have - // (predrop_ != null) - // Therefore, it is sufficient to check only threshold - if (BlockSize == 1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) { - // Case 1: scalar problem, no dropping => just use matrix graph - RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); - // Detect and record rows that correspond to Dirichlet boundary conditions - auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); - - graph->SetBoundaryNodeMap(boundaryNodes); - numTotal = A->getLocalNumEntries(); - - if (GetVerbLevel() & Statistics1) { - GO numLocalBoundaryNodes = 0; - GO numGlobalBoundaryNodes = 0; - for (size_t i = 0; i < boundaryNodes.size(); ++i) - if (boundaryNodes[i]) - numLocalBoundaryNodes++; - RCP> comm = A->getRowMap()->getComm(); - MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes); - GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl; - } - - Set(currentLevel, "DofsPerNode", 1); - Set(currentLevel, "Graph", graph); - - } else if ((BlockSize == 1 && threshold != STS::zero()) || - (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) || - (BlockSize == 1 && useSignedClassicalRS) || - (BlockSize == 1 && useSignedClassicalSA)) { - // Case 2: scalar problem with dropping => record the column indices of undropped entries, but still use original - // graph's map information, e.g., whether index is local - // OR a matrix without a CrsGraph - - // allocate space for the local graph - typename LWGraph::row_type::non_const_type rows("rows", A->getLocalNumRows() + 1); - typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); - - using MT = typename STS::magnitudeType; - RCP ghostedDiag; - ArrayRCP ghostedDiagVals; - ArrayRCP negMaxOffDiagonal; - // RS style needs the max negative off-diagonal, SA style needs the diagonal - if (useSignedClassicalRS) { - if (ghostedBlockNumber.is_null()) { - negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A); - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Calculated max point off-diagonal" << std::endl; - } else { - negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A, *ghostedBlockNumber); - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl; - } - } else { - ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - ghostedDiagVals = ghostedDiag->getData(0); - } - auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) { - if (ghostedBlockNumber.is_null()) { - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl; - Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes); - } else { - if (GetVerbLevel() & Statistics1) - GetOStream(Statistics1) << "Applying block row sum criterion." << std::endl; - Utilities::ApplyRowSumCriterionHost(*A, *ghostedBlockNumber, rowSumTol, boundaryNodes); - } - } - - LO realnnz = 0; - rows(0) = 0; - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - size_t nnz = A->getNumEntriesInLocalRow(row); - bool rowIsDirichlet = boundaryNodes[row]; - ArrayView indices; - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - if (classicalAlgo == defaultAlgo) { - // FIXME the current predrop function uses the following - // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) - // FIXME but the threshold doesn't take into account the rows' diagonal entries - // FIXME For now, hardwiring the dropping in here - - LO rownnz = 0; - if (useSignedClassicalRS) { - // Signed classical RS style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); - MT neg_aij = -STS::real(vals[colID]); - /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], - g_block_id.is_null() ? -1 : g_block_id[row], - g_block_id.is_null() ? -1 : g_block_id[col], - neg_aij, max_neg_aik);*/ - if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { - columns[realnnz++] = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } else if (useSignedClassicalSA) { - // Signed classical SA style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - - bool is_nonpositive = STS::real(vals[colID]) <= 0; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 - /* - if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], - vals[colID],aij, aiiajj); - */ - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows[row + 1] = realnnz; - } else { - // Standard abs classical - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } - } else { - /* Cut Algorithm */ - // CMS - using DropTol = Details::DropTol; - std::vector drop_vec; - drop_vec.reserve(nnz); - const real_type zero = Teuchos::ScalarTraits::zero(); - const real_type one = Teuchos::ScalarTraits::one(); - LO rownnz = 0; - // NOTE: This probably needs to be fixed for rowsum - - // find magnitudes - for (LO colID = 0; colID < (LO)nnz; colID++) { - LO col = indices[colID]; - if (row == col) { - drop_vec.emplace_back(zero, one, colID, false); - continue; - } - - // Don't aggregate boundaries - if (boundaryNodes[colID]) continue; - typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - typename STS::magnitudeType aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - drop_vec.emplace_back(aij, aiiajj, colID, false); - } - - const size_t n = drop_vec.size(); - - if (classicalAlgo == unscaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val > b.val; - }); - - bool drop = false; - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val; - auto b = y.val; - if (a > realThreshold * b) { - drop = true; -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - } - drop_vec[i].drop = drop; - } - } else if (classicalAlgo == scaled_cut) { - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.val / a.diag > b.val / b.diag; - }); - bool drop = false; - // printf("[%d] Scaled Cut: ",(int)row); - // printf("%3d(%4s) ",indices[drop_vec[0].col],"keep"); - for (size_t i = 1; i < n; ++i) { - if (!drop) { - auto const& x = drop_vec[i - 1]; - auto const& y = drop_vec[i]; - auto a = x.val / x.diag; - auto b = y.val / y.diag; - if (a > realThreshold * b) { - drop = true; - -#ifdef HAVE_MUELU_DEBUG - if (distanceLaplacianCutVerbose) { - std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl; - } -#endif - } - // printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep"); - } - drop_vec[i].drop = drop; - } - // printf("\n"); - } - std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) { - return a.col < b.col; - }); - - for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) { - LO col = indices[drop_vec[idxID].col]; - // don't drop diagonal - if (row == col) { - columns[realnnz++] = col; - rownnz++; - continue; - } - - if (!drop_vec[idxID].drop) { - columns[realnnz++] = col; - rownnz++; - } else { - numDropped++; - } - } - // CMS - rows[row + 1] = realnnz; - } - } // end for row + } numTotal = A->getLocalNumEntries(); @@ -3285,7 +1655,7 @@ void CoalesceDropFactory::BuildKokkos } // if (doExperimentalWrap) ... else ... -} // BuildKokkos +} // Build template void CoalesceDropFactory::MergeRows(const Matrix& A, const LO row, Array& cols, const Array& translation) const { From 7c78025bf66884fd8b788bac2ae352e871241e7d Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Wed, 24 Jul 2024 17:51:23 -0600 Subject: [PATCH 04/25] MueLu: std::complex Replaced With Kokkos::complex Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 475 +++++++++--------- 1 file changed, 241 insertions(+), 234 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index da606ab20ff6..ad5895e2e41b 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -475,10 +475,10 @@ void CoalesceDropFactory::Build(Level typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries()); using MT = typename STS::magnitudeType; - RCP ghostedDiag; + RCP ghostedDiag; ArrayRCP ghostedDiagVals; ArrayRCP negMaxOffDiagonal; - // RS style needs the max negative off-diagonal, SA style needs the diagonal + // RS style needs the max negative off-diagonal, SA style needs the diagonal if (useSignedClassicalRS) { if (ghostedBlockNumber.is_null()) { negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A); @@ -491,10 +491,12 @@ void CoalesceDropFactory::Build(Level } } else { ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - ghostedDiagVals = ghostedDiag->getData(0); - } + if(classicalAlgo == defaultAlgo) { + ghostedDiagVals = ghostedDiag->getData(0); + } + } auto boundaryNodes = MueLu::Utilities::DetectDirichletRows_kokkos_host(*A, dirichletThreshold); - if (rowSumTol > 0.) { + if (rowSumTol > 0.) { if (ghostedBlockNumber.is_null()) { if (GetVerbLevel() & Statistics1) GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl; @@ -508,234 +510,239 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; - if(classicalAlgo == defaultAlgo) { - SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); - for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { - size_t nnz = A->getNumEntriesInLocalRow(row); - bool rowIsDirichlet = boundaryNodes[row]; - ArrayView indices; - ArrayView vals; - A->getLocalRowView(row, indices, vals); - - // FIXME the current predrop function uses the following - // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) - // FIXME but the threshold doesn't take into account the rows' diagonal entries - // FIXME For now, hardwiring the dropping in here - - LO rownnz = 0; - if (useSignedClassicalRS) { - // Signed classical RS style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); - MT neg_aij = -STS::real(vals[colID]); - /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], - g_block_id.is_null() ? -1 : g_block_id[row], - g_block_id.is_null() ? -1 : g_block_id[col], - neg_aij, max_neg_aik);*/ - if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { - columns[realnnz++] = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } else if (useSignedClassicalSA) { - // Signed classical SA style - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - - bool is_nonpositive = STS::real(vals[colID]) <= 0; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 - /* - if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], - vals[colID],aij, aiiajj); - */ - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows[row + 1] = realnnz; - } else { - // Standard abs classical - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 - - if ((!rowIsDirichlet && aij > aiiajj) || row == col) { - columns(realnnz++) = col; - rownnz++; - } else - numDropped++; - } - rows(row + 1) = realnnz; - } - } // end for row - } - else { - SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); - using ExecSpace = typename Node::execution_space; - using TeamPol = Kokkos::TeamPolicy; - using TeamMem = typename TeamPol::member_type; - - //move from host to device - ArrayView ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size()); - Kokkos::View ghostedDiagValsView = Kokkos::Compat::getKokkosViewDeepCopy(ghostedDiagValsArrayView); - auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); - - auto At = Utilities::Op2TpetraCrs(A); - auto A_device = At->getLocalMatrixDevice(); - - int algorithm = classicalAlgo; - Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); - auto index_views = Kokkos::View("index_views", A_device.nnz()); - - Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { - LO row = teamMember.league_rank(); - auto rowView = A_device.row(row); - size_t nnz = rowView.length; - - size_t n = 0; - auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - - //find magnitudes - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { - index_view(colID) = colID; - LO col = rowView.colidx(colID); - //ignore diagonals for now, they are checked again later - if(row == col) { - drop_view(colID) = true; - count++; - } - //Don't aggregate boundaries - else if(boundaryNodesDevice(colID)) { - drop_view(colID) = true; - } - else { - drop_view(colID) = false; - count++; - } - }, n); - - size_t dropStart = n; - if (algorithm == unscaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - return x_aij > y_aij; - } - }); - - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename STS::magnitudeType x_aij = 0; - typename STS::magnitudeType y_aij = 0; - if(!drop_view(x)) { - x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - } - if(!drop_view(y)) { - y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - } - - if(x_aij > realThreshold * y_aij) { - if(i < min) { - min = i; - } - } - }, Kokkos::Min(dropStart)); - } else if (algorithm == scaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); - return x_aij / x_aiiajj > y_aij / y_aiiajj; - } - }); - - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename STS::magnitudeType x_val = 0; - typename STS::magnitudeType y_val = 0; - if(!drop_view(x)) { - typename STS::magnitudeType x_aij = static_cast(std::fabs(static_cast(rowView.value(x) * rowView.value(x)))); - typename STS::magnitudeType x_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)))); - x_val = x_aij / x_aiiajj; - } - if(!drop_view(y)) { - typename STS::magnitudeType y_aij = static_cast(std::fabs(static_cast(rowView.value(y) * rowView.value(y)))); - typename STS::magnitudeType y_aiiajj = static_cast(std::fabs(static_cast(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)))); - y_val = y_aij / y_aiiajj; - } - - if(x_val > realThreshold * y_val) { - if(i < min) { - min = i; - } - } - }, Kokkos::Min(dropStart)); - } - - //drop everything to the right of where values stop passing threshold - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { - drop_view(index_view(i)) = true; - }); - } - - LO rownnz = 0; - GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { - LO col = rowView.colidx(idxID); - //don't drop diagonal - if(row == col || !drop_view(idxID)) { - keep++; - } - else { - rowView.colidx(idxID) = -1; - drop++; - } - }, rownnz, rowDropped); - - globalnnz += rownnz; - totalDropped += rowDropped; - rownnzView(row) = rownnz; - }, realnnz, numDropped); - - //update column indices so that kept indices are aligned to the left for subview that happens later on - auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); - Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); - Kokkos::deep_copy(columns, columnsDevice); - - //update row indices by adding up new # of nnz in each row - auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); - Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { - partial_sum += rownnzView(i); - if(is_final) rowsDevice(i+1) = partial_sum; - }); - Kokkos::deep_copy(rows, rowsDevice); - } + if(classicalAlgo == defaultAlgo) { + SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); + for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { + size_t nnz = A->getNumEntriesInLocalRow(row); + bool rowIsDirichlet = boundaryNodes[row]; + ArrayView indices; + ArrayView vals; + A->getLocalRowView(row, indices, vals); + + // FIXME the current predrop function uses the following + // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid ) + // FIXME but the threshold doesn't take into account the rows' diagonal entries + // FIXME For now, hardwiring the dropping in here + + LO rownnz = 0; + if (useSignedClassicalRS) { + // Signed classical RS style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); + MT neg_aij = -STS::real(vals[colID]); + /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID], + g_block_id.is_null() ? -1 : g_block_id[row], + g_block_id.is_null() ? -1 : g_block_id[col], + neg_aij, max_neg_aik);*/ + if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) { + columns[realnnz++] = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } else if (useSignedClassicalSA) { + // Signed classical SA style + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + + bool is_nonpositive = STS::real(vals[colID]) <= 0; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0 + /* + if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID], + vals[colID],aij, aiiajj); + */ + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows[row + 1] = realnnz; + } else { + // Standard abs classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2 + + if ((!rowIsDirichlet && aij > aiiajj) || row == col) { + columns(realnnz++) = col; + rownnz++; + } else + numDropped++; + } + rows(row + 1) = realnnz; + } + } // end for row + } + else { + /* Cut Algorithm */ + SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); + using ExecSpace = typename Node::execution_space; + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + using ATS = Kokkos::ArithTraits; + using impl_scalar_type = typename ATS::val_type; + using implATS = Kokkos::ArithTraits; + + //move from host to device + auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); + auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); + auto thresholdKokkos = static_cast(threshold); + auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); + + auto At = Utilities::Op2TpetraCrs(A); + auto A_device = At->getLocalMatrixDevice(); + + int algorithm = classicalAlgo; + Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + auto index_views = Kokkos::View("index_views", A_device.nnz()); + + Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { + LO row = teamMember.league_rank(); + auto rowView = A_device.row(row); + size_t nnz = rowView.length; + + size_t n = 0; + auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); + + //find magnitudes + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { + index_view(colID) = colID; + LO col = rowView.colidx(colID); + //ignore diagonals for now, they are checked again later + if(row == col) { + drop_view(colID) = true; + count++; + } + //Don't aggregate boundaries + else if(boundaryNodesDevice(colID)) { + drop_view(colID) = true; + } + else { + drop_view(colID) = false; + count++; + } + }, n); + + size_t dropStart = n; + if (algorithm == unscaled_cut) { + //push diagonals and boundaries to the right, sort everything else by aij on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } + else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + return x_aij > y_aij; + } + }); + + //find index where dropping starts + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_aij = 0; + typename implATS::magnitudeType y_aij = 0; + if(!drop_view(x)) { + x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + } + if(!drop_view(y)) { + y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + } + + if(x_aij > realThresholdKokkos * y_aij) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + } else if (algorithm == scaled_cut) { + //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if(drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } + else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + auto x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + auto y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); + } + }); + + //find index where dropping starts + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_val = 0; + typename implATS::magnitudeType y_val = 0; + if(!drop_view(x)) { + typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + typename implATS::magnitudeType x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + x_val = x_aij / x_aiiajj; + } + if(!drop_view(y)) { + typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + typename implATS::magnitudeType y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + y_val = y_aij / y_aiiajj; + } + + if(x_val > realThresholdKokkos * y_val) { + if(i < min) { + min = i; + } + } + }, Kokkos::Min(dropStart)); + } + + //drop everything to the right of where values stop passing threshold + if(dropStart < n) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + drop_view(index_view(i)) = true; + }); + } + + LO rownnz = 0; + GO rowDropped = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { + LO col = rowView.colidx(idxID); + //don't drop diagonal + if(row == col || !drop_view(idxID)) { + keep++; + } + else { + rowView.colidx(idxID) = -1; + drop++; + } + }, rownnz, rowDropped); + + globalnnz += rownnz; + totalDropped += rowDropped; + rownnzView(row) = rownnz; + }, realnnz, numDropped); + + //update column indices so that kept indices are aligned to the left for subview that happens later on + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); + Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); + Kokkos::deep_copy(columns, columnsDevice); + + //update row indices by adding up new # of nnz in each row + auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); + Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { + partial_sum += rownnzView(i); + if(is_final) rowsDevice(i+1) = partial_sum; + }); + Kokkos::deep_copy(rows, rowsDevice); + } numTotal = A->getLocalNumEntries(); @@ -1655,7 +1662,7 @@ void CoalesceDropFactory::Build(Level } // if (doExperimentalWrap) ... else ... -} // Build +} // Build template void CoalesceDropFactory::MergeRows(const Matrix& A, const LO row, Array& cols, const Array& translation) const { From 4dff65a96640d6473d88c9dc0282bb0441f77c77 Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Thu, 15 Aug 2024 12:40:22 -0600 Subject: [PATCH 05/25] MueLu: Code Review Fixes Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index ad5895e2e41b..0431bf011541 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -117,6 +117,11 @@ struct DropTol { }; } // namespace Details +enum decisionAlgoType { defaultAlgo, + unscaled_cut, + scaled_cut, + scaled_cut_symmetric }; + template RCP CoalesceDropFactory::GetValidParameterList() const { RCP validParamList = rcp(new ParameterList()); @@ -354,11 +359,6 @@ void CoalesceDropFactory::Build(Level #endif //////////////////////////////////////////////////// - enum decisionAlgoType { defaultAlgo, - unscaled_cut, - scaled_cut, - scaled_cut_symmetric }; - decisionAlgoType distanceLaplacianAlgo = defaultAlgo; decisionAlgoType classicalAlgo = defaultAlgo; if (algo == "distance laplacian") { @@ -591,24 +591,24 @@ void CoalesceDropFactory::Build(Level //move from host to device auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); - auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes); + auto boundaryNodesDevice = Kokkos::create_mirror_view_and_copy(ExecSpace(), boundaryNodes); auto thresholdKokkos = static_cast(threshold); auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); auto At = Utilities::Op2TpetraCrs(A); auto A_device = At->getLocalMatrixDevice(); - int algorithm = classicalAlgo; Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); auto drop_views = Kokkos::View("drop_views", A_device.nnz()); auto index_views = Kokkos::View("index_views", A_device.nnz()); Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { LO row = teamMember.league_rank(); - auto rowView = A_device.row(row); + auto rowView = A_device.rowConst(row); size_t nnz = rowView.length; - size_t n = 0; + size_t dropSize = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); @@ -629,10 +629,10 @@ void CoalesceDropFactory::Build(Level drop_view(colID) = false; count++; } - }, n); + }, dropSize); - size_t dropStart = n; - if (algorithm == unscaled_cut) { + size_t dropStart = dropSize; + if (classicalAlgo == unscaled_cut) { //push diagonals and boundaries to the right, sort everything else by aij on the left Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { if(drop_view(x) || drop_view(y)) { @@ -646,7 +646,7 @@ void CoalesceDropFactory::Build(Level }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_aij = 0; @@ -664,7 +664,7 @@ void CoalesceDropFactory::Build(Level } } }, Kokkos::Min(dropStart)); - } else if (algorithm == scaled_cut) { + } else if (classicalAlgo == scaled_cut) { //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { if(drop_view(x) || drop_view(y)) { @@ -680,7 +680,7 @@ void CoalesceDropFactory::Build(Level }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_val = 0; @@ -705,22 +705,23 @@ void CoalesceDropFactory::Build(Level } //drop everything to the right of where values stop passing threshold - if(dropStart < n) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) { + if(dropStart < dropSize) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, dropSize), [=](size_t i) { drop_view(index_view(i)) = true; }); } LO rownnz = 0; GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, dropSize), [=](const size_t idxID, LO& keep, GO& drop) { LO col = rowView.colidx(idxID); //don't drop diagonal if(row == col || !drop_view(idxID)) { + columnsDevice(A_device.graph.row_map(row) + idxID) = col; keep++; } else { - rowView.colidx(idxID) = -1; + columnsDevice(A_device.graph.row_map(row) + idxID) = -1; drop++; } }, rownnz, rowDropped); @@ -731,7 +732,6 @@ void CoalesceDropFactory::Build(Level }, realnnz, numDropped); //update column indices so that kept indices are aligned to the left for subview that happens later on - auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); Kokkos::deep_copy(columns, columnsDevice); From 18097083fa60054ee56ada1f3afdf71f80c6fcc0 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 28 Oct 2024 15:56:58 -0600 Subject: [PATCH 06/25] Add AT2 runner, usage of GenConfig, get-changed-packages.sh Squashing all the terrible commits I made while using the GitHub web interface. The interface does not seem to have support for signing with DCO. Includes changes which modify the event triggers to comply with new AT2 specifications, assignment of an AT2 runner to run on, usage of GenConfig to load the environment, and prototype implementation of calling the get-changed-trilinos-packages.sh script. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 65 +++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 7b51bbec8c75..e0478400bf5a 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -12,10 +12,11 @@ name: "CodeQL: Linear Solvers" on: - #push: - # branches: [ "muelu-sync-workflow" ] pull_request: branches: [ "develop" ] + types: + - opened + - synchronize schedule: - cron: '41 23 * * 2' @@ -25,17 +26,12 @@ permissions: jobs: analyze: name: Analyze (${{ matrix.language }}) - # Runner size impacts CodeQL analysis time. To learn more, please see: - # - https://gh.io/recommended-hardware-resources-for-running-codeql - # - https://gh.io/supported-runners-and-hardware-resources - # - https://gh.io/using-larger-runners (GitHub.com only) - # Consider using larger runners or machines with greater resources for possible analysis time improvements. - runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} - timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + runs-on: [self-hosted, gcc-10.3.0_openmpi-4.1.6] + if: ${{ github.event.action == 'synchronize' || github.event.action == 'opened' }} + permissions: # required for all workflows security-events: write - # only required for workflows in private repositories actions: read contents: read @@ -46,16 +42,7 @@ jobs: include: - language: c-cpp build-mode: manual - #- language: python - # build-mode: none - # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' - # Use `c-cpp` to analyze code written in C, C++ or both - # Use 'java-kotlin' to analyze code written in Java, Kotlin or both - # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both - # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, - # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. - # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how - # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages + steps: - name: Checkout repository uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -70,20 +57,38 @@ jobs: query-filters: - exclude: tags: cpp/integer-multiplication-cast-to-long - + - name: env + env: + GITHUB_CONTEXT: ${{ toJson(github) }} + run: | + env + - name: module list + shell: bash -l {0} + run: | + module list + printenv PATH - if: matrix.build-mode == 'manual' - name: Configure Trilinos + name: Get dependencies run: | - mkdir -p trilinos_build - cd trilinos_build - cmake -G 'Unix Makefiles' -DTrilinos_ENABLE_TESTS=OFF -DTrilinos_ENABLE_Epetra=OFF -DTrilinos_ENABLE_AztecOO=OFF -DTrilinos_ENABLE_Ifpack=OFF -DTrilinos_ENABLE_ML=OFF -D Trilinos_ENABLE_Triutils=OFF -DTrilinos_ENABLE_Tpetra=ON -DTrilinos_ENABLE_MueLu=ON -DTrilinos_ENABLE_Krino=OFF .. - + bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container" + - if: matrix.build-mode == 'manual' + name: Generate CMake fragment for changed packages + run: | + git fetch origin ${GITHUB_BASE_REF} + git branch + bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh ${GITHUB_BASE_REF} ${GITHUB_HEAD_REF} package_enables.cmake package_subprojects.cmake" - if: matrix.build-mode == 'manual' - name: Build Trilinos + name: Configure and Build Trilinos + shell: bash -lc {0} run: | - cd trilinos_build - make -j 2 - + mkdir -p trilinos_build + mv package_enables.cmake trilinos_build + cd trilinos_build + + source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables + cmake -C genconfig_fragment.cmake -C package_enables.cmake .. + ninja -j 16 + - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 with: From 5742da5d3c8b3599e1348e016a85484ec0826dcf Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 28 Oct 2024 16:03:26 -0600 Subject: [PATCH 07/25] Fix arguments of get-changed-trilinos-packages.sh for CodeQL Fix calling of get-changed-trilinos-packages.sh to correctly reference the origin remote. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index e0478400bf5a..0851448adb05 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -76,7 +76,7 @@ jobs: run: | git fetch origin ${GITHUB_BASE_REF} git branch - bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh ${GITHUB_BASE_REF} ${GITHUB_HEAD_REF} package_enables.cmake package_subprojects.cmake" + bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake" - if: matrix.build-mode == 'manual' name: Configure and Build Trilinos shell: bash -lc {0} From 661569af6dffdaaaeb11f548ea26ac71f207e356 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 22:59:16 +0000 Subject: [PATCH 08/25] Bump actions/dependency-review-action from 4.3.4 to 4.4.0 Bumps [actions/dependency-review-action](https://github.com/actions/dependency-review-action) from 4.3.4 to 4.4.0. - [Release notes](https://github.com/actions/dependency-review-action/releases) - [Commits](https://github.com/actions/dependency-review-action/compare/5a2ce3f5b92ee19cbb1541a4984c76d921601d7c...4081bf99e2866ebe428fc0477b69eb4fcda7220a) --- updated-dependencies: - dependency-name: actions/dependency-review-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/dependency-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 7b0990bcf5ca..bf29beac76d5 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -24,4 +24,4 @@ jobs: - name: 'Checkout Repository' uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: 'Dependency Review' - uses: actions/dependency-review-action@5a2ce3f5b92ee19cbb1541a4984c76d921601d7c # v4.3.4 + uses: actions/dependency-review-action@4081bf99e2866ebe428fc0477b69eb4fcda7220a # v4.4.0 From 40e117299710784a953a03469b2b32752a4ea29b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 22:59:27 +0000 Subject: [PATCH 09/25] Bump github/codeql-action from 3.26.13 to 3.27.0 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.26.13 to 3.27.0. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/f779452ac5af1c261dce0346a8f964149f49322b...662472033e021d55d94146f66f6058822b0b39fd) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 56bbf091adaf..4139508fa42b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -62,7 +62,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 + uses: github/codeql-action/init@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: languages: ${{ matrix.language }} build-mode: ${{ matrix.build-mode }} @@ -85,6 +85,6 @@ jobs: make -j 2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 + uses: github/codeql-action/analyze@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index c648a7e9b626..46a2c4571aff 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -66,6 +66,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13 + uses: github/codeql-action/upload-sarif@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: sarif_file: results.sarif From 1d278e8c57e5f0a936f8eb8f6184222a19e5f681 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 28 Oct 2024 17:12:28 -0600 Subject: [PATCH 10/25] Move GenConfig step into the Generate CMake fragment step The GenConfig step is just used to generate a cmake fragment for the configuration. This would fit nicely with the step that generates the other cmake fragment for package enables. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 0851448adb05..aa9b8043c094 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -72,20 +72,20 @@ jobs: run: | bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container" - if: matrix.build-mode == 'manual' - name: Generate CMake fragment for changed packages + name: Generate CMake fragments run: | git fetch origin ${GITHUB_BASE_REF} - git branch + + mkdir -p trilinos_build && cd trilinos_build + + source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake" - if: matrix.build-mode == 'manual' name: Configure and Build Trilinos shell: bash -lc {0} run: | - mkdir -p trilinos_build - mv package_enables.cmake trilinos_build cd trilinos_build - source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables cmake -C genconfig_fragment.cmake -C package_enables.cmake .. ninja -j 16 From 237a61123802fd9d586eac0a32193178bbeda52e Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 28 Oct 2024 17:14:14 -0600 Subject: [PATCH 11/25] Tidy up workflow file and add newlines Tidy up workflow file with consistent naming and add newlines between each named step for better readability. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index aa9b8043c094..6fc58693f0b7 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -57,20 +57,24 @@ jobs: query-filters: - exclude: tags: cpp/integer-multiplication-cast-to-long - - name: env + + - name: Print environment env: GITHUB_CONTEXT: ${{ toJson(github) }} run: | env - - name: module list + + - name: Module list shell: bash -l {0} run: | module list printenv PATH + - if: matrix.build-mode == 'manual' name: Get dependencies run: | bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container" + - if: matrix.build-mode == 'manual' name: Generate CMake fragments run: | @@ -80,15 +84,16 @@ jobs: source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake" + - if: matrix.build-mode == 'manual' - name: Configure and Build Trilinos + name: Configure and build Trilinos shell: bash -lc {0} run: | cd trilinos_build cmake -C genconfig_fragment.cmake -C package_enables.cmake .. ninja -j 16 - + - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 with: From 7727e25ee133d6fbdce85589cca9c6359cb4109e Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Tue, 29 Oct 2024 08:38:52 -0600 Subject: [PATCH 12/25] Fix bash login shell for generate CMake fragment Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 6fc58693f0b7..be4c96a2a393 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -77,6 +77,7 @@ jobs: - if: matrix.build-mode == 'manual' name: Generate CMake fragments + shell: bash -lc {0} run: | git fetch origin ${GITHUB_BASE_REF} From 26eb6ff3cd6d003781d9fa51be2108cd77b94a3d Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Tue, 29 Oct 2024 09:00:32 -0600 Subject: [PATCH 13/25] Add TriBITS cache variables to reduce code built Add TriBITS cache variables to reduce code built for packages that are not needed. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index be4c96a2a393..130ed194da46 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -92,7 +92,7 @@ jobs: run: | cd trilinos_build - cmake -C genconfig_fragment.cmake -C package_enables.cmake .. + cmake -C genconfig_fragment.cmake -C package_enables.cmake -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF .. ninja -j 16 - name: Perform CodeQL Analysis From c367a234d9de789c533a4e7b853ed64a463e0a8d Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Tue, 29 Oct 2024 16:41:33 -0600 Subject: [PATCH 14/25] Use multi-line yml for cmake command Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 130ed194da46..3ada32ba19fe 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -92,7 +92,11 @@ jobs: run: | cd trilinos_build - cmake -C genconfig_fragment.cmake -C package_enables.cmake -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF .. + cmake -C genconfig_fragment.cmake -C package_enables.cmake \ + -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF \ + -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF \ + -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF .. + ninja -j 16 - name: Perform CodeQL Analysis From 54d711e40daac2252c661c52482c29a9179b6974 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Tue, 29 Oct 2024 17:02:58 -0600 Subject: [PATCH 15/25] Manually disable each deprecated package in cmake command Manually disable each deprecated package in the cmake command for CodeQL configuration. Since these are defined on the command line, they should take priority over any of the enables from the package_enables.cmake or the genconfig_fragment.cmake fragments. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 3ada32ba19fe..b739518db9ef 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -95,7 +95,22 @@ jobs: cmake -C genconfig_fragment.cmake -C package_enables.cmake \ -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF \ -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF \ - -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF .. + -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF \ + -DTrilinos_ENABLE_Amesos=OFF \ + -DTrilinos_ENABLE_AztecOO=OFF \ + -DTrilinos_ENABLE_Epetra=OFF \ + -DTrilinos_ENABLE_EpetraExt=OFF \ + -DTrilinos_ENABLE_Ifpack=OFF \ + -DTrilinos_ENABLE_Intrepid=OFF \ + -DTrilinos_ENABLE_Isorropia=OFF \ + -DTrilinos_ENABLE_ML=OFF \ + -DTrilinos_ENABLE_NewPackage=OFF \ + -DTrilinos_ENABLE_Pliris=OFF \ + -DTrilinos_ENABLE_PyTrilinos=OFF \ + -DTrilinos_ENABLE_ShyLU_DDCore=OFF \ + -DTrilinos_ENABLE_ThyraEpetraAdapters=OFF \ + -DTrilinos_ENABLE_ThyraEpetraExtAdapters=OFF \ + -DTrilinos_ENABLE_Triutils=OFF .. ninja -j 16 From d88fa994bdb626eb4dac0026e6be601f9a62f03c Mon Sep 17 00:00:00 2001 From: Ian Halim Date: Fri, 23 Aug 2024 19:21:52 -0600 Subject: [PATCH 16/25] MueLu: Fixing Issue #13377 and #13378 Issues listed above have been addressed. Threshold has been redefined to 1/threshold. Unit tests have been modified to be more thorough. Signed-off-by: Ian Halim --- .../MueLu_CoalesceDropFactory_def.hpp | 60 +++--- .../test/unit_tests/CoalesceDropFactory.cpp | 178 +++++++++++++++--- 2 files changed, 187 insertions(+), 51 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 0431bf011541..1f9961289cb0 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -591,13 +591,27 @@ void CoalesceDropFactory::Build(Level //move from host to device auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); - auto boundaryNodesDevice = Kokkos::create_mirror_view_and_copy(ExecSpace(), boundaryNodes); auto thresholdKokkos = static_cast(threshold); auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); - auto At = Utilities::Op2TpetraCrs(A); - auto A_device = At->getLocalMatrixDevice(); + auto A_device = A->getLocalMatrixDevice(); + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + RCP importer = A->getCrsGraph()->getImporter(); + RCP boundaryNodesVector = Xpetra::VectorFactory::Build(graph->GetDomainMap()); + RCP boundaryColumnVector; + for(size_t i = 0; i < graph->GetNodeNumVertices(); i++) { + boundaryNodesVector->getDataNonConst(0)[i] = boundaryNodes[i]; + } + if(!importer.is_null()) { + boundaryColumnVector = Xpetra::VectorFactory::Build(graph->GetImportMap()); + boundaryColumnVector->doImport(*boundaryNodesVector, *importer, Xpetra::INSERT); + } + else { + boundaryColumnVector = boundaryNodesVector; + } + auto boundaryColumn = boundaryColumnVector->getDeviceLocalView(Xpetra::Access::ReadOnly); + auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0); Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); auto drop_views = Kokkos::View("drop_views", A_device.nnz()); @@ -608,30 +622,24 @@ void CoalesceDropFactory::Build(Level auto rowView = A_device.rowConst(row); size_t nnz = rowView.length; - size_t dropSize = 0; auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); //find magnitudes - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) { index_view(colID) = colID; LO col = rowView.colidx(colID); //ignore diagonals for now, they are checked again later - if(row == col) { - drop_view(colID) = true; - count++; - } //Don't aggregate boundaries - else if(boundaryNodesDevice(colID)) { + if(row == col || boundary(col)) { drop_view(colID) = true; } else { drop_view(colID) = false; - count++; } - }, dropSize); + }); - size_t dropStart = dropSize; + size_t dropStart = nnz; if (classicalAlgo == unscaled_cut) { //push diagonals and boundaries to the right, sort everything else by aij on the left Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { @@ -646,7 +654,7 @@ void CoalesceDropFactory::Build(Level }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_aij = 0; @@ -658,7 +666,7 @@ void CoalesceDropFactory::Build(Level y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); } - if(x_aij > realThresholdKokkos * y_aij) { + if(realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) { if(i < min) { min = i; } @@ -673,30 +681,30 @@ void CoalesceDropFactory::Build(Level else { auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - auto x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); - auto y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); } }); //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { auto const& x = index_view(i - 1); auto const& y = index_view(i); typename implATS::magnitudeType x_val = 0; typename implATS::magnitudeType y_val = 0; if(!drop_view(x)) { typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - typename implATS::magnitudeType x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); x_val = x_aij / x_aiiajj; } if(!drop_view(y)) { typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - typename implATS::magnitudeType y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); y_val = y_aij / y_aiiajj; } - if(x_val > realThresholdKokkos * y_val) { + if(realThresholdKokkos * realThresholdKokkos * x_val > y_val) { if(i < min) { min = i; } @@ -705,15 +713,15 @@ void CoalesceDropFactory::Build(Level } //drop everything to the right of where values stop passing threshold - if(dropStart < dropSize) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, dropSize), [=](size_t i) { + if(dropStart < nnz) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) { drop_view(index_view(i)) = true; }); } LO rownnz = 0; GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, dropSize), [=](const size_t idxID, LO& keep, GO& drop) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) { LO col = rowView.colidx(idxID); //don't drop diagonal if(row == col || !drop_view(idxID)) { @@ -1381,7 +1389,7 @@ void CoalesceDropFactory::Build(Level auto const& y = drop_vec[i]; auto a = x.val; auto b = y.val; - if (a > realThreshold * b) { + if (realThreshold * realThreshold * a > b) { drop = true; #ifdef HAVE_MUELU_DEBUG if (distanceLaplacianCutVerbose) { @@ -1404,7 +1412,7 @@ void CoalesceDropFactory::Build(Level auto const& y = drop_vec[i]; auto a = x.val / x.diag; auto b = y.val / y.diag; - if (a > realThreshold * b) { + if (realThreshold * realThreshold * a > b) { drop = true; #ifdef HAVE_MUELU_DEBUG if (distanceLaplacianCutVerbose) { diff --git a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp index e8902b178708..0073ca7e9bfb 100644 --- a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp +++ b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp @@ -1223,7 +1223,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianScaledCu // L_ij = -36 // L_ii = 72 // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj| - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian"))); coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("scaled cut"))); fineLevel.Request("Graph", &coalesceDropFact); @@ -1289,7 +1289,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianUnscaled // L_ij = -36 // L_ii = 72 // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj| - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian"))); coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("unscaled cut"))); fineLevel.Request("Graph", &coalesceDropFact); @@ -1355,7 +1355,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianCutSym, // L_ij = -36 // L_ii = 72 // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj| - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian"))); coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("scaled cut symmetric"))); fineLevel.Request("Graph", &coalesceDropFact); @@ -1389,6 +1389,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType real_type; typedef Xpetra::MultiVector RealValuedMultiVector; + typedef Tpetra::Map map_type; + typedef Tpetra::CrsMatrix crs_matrix_type; MUELU_TESTING_SET_OSTREAM; MUELU_TESTING_LIMIT_SCOPE(Scalar, GlobalOrdinal, Node); @@ -1399,11 +1401,41 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala Level fineLevel; TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); - RCP A = TestHelpers::TestFactory::Build1DPoisson(36); + const global_size_t globalIndices = 12; + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + RCP A_t(new crs_matrix_type(map, 5)); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); + const SC negOne = static_cast(-1.0); + for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + const GO gblRow = map->getGlobalElement(lclRow); + if(gblRow == 0) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); + } + else if(static_cast(gblRow) == globalIndices - 1) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); + } + else if(gblRow == 2 || gblRow == 9) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); + } + else if(gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } + else if(gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); + } + else { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); + } + } + A_t->fillComplete(); + RCP A_x = rcp(new TpetraCrsMatrix(A_t)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; - galeriList.set("nx", Teuchos::as(36)); + galeriList.set("nx", Teuchos::as(globalIndices)); RCP coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates("1D", A->getRowMap(), galeriList); fineLevel.Set("Coordinates", coordinates); @@ -1429,25 +1461,59 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(36 + (comm->getSize() - 1) * 2)); + TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myDomainMap->getGlobalNumElements(), 36); - - TEST_EQUALITY(graph->GetGlobalNumEdges(), 72); - -} // SignaledClassical + TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); + + TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); + + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int columns[28] = {0, 1, + 0, 1, + 2, + 3, 4, + 3, 4, 5, + 3, 4, 5, 6, 7, + 5, 6, 7, + 6, 7, 8, + 7, 8, + 9, + 10, 11, + 10, 11}; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; + TEST_EQUALITY(rowPtrs(0), rowID); + for(size_t i = 0; i < rowPtrs.size()-1; i++) { + auto gblID = myDomainMap->getGlobalElement(i); + int rownnz = rows[gblID+1]-rows[gblID]; + rowID += rownnz; + TEST_EQUALITY(rowPtrs(i+1), rowID); + + std::vector colID; + for(int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + } + std::sort(std::begin(colID), std::end(colID)); + for(int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + } + } +} // ClassicalScaledCut TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Scalar, LocalOrdinal, GlobalOrdinal, Node) { #include typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType real_type; typedef Xpetra::MultiVector RealValuedMultiVector; + typedef Tpetra::Map map_type; + typedef Tpetra::CrsMatrix crs_matrix_type; MUELU_TESTING_SET_OSTREAM; MUELU_TESTING_LIMIT_SCOPE(Scalar, GlobalOrdinal, Node); @@ -1458,11 +1524,41 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca Level fineLevel; TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); - RCP A = TestHelpers::TestFactory::Build1DPoisson(36); + const global_size_t globalIndices = 12; + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + RCP A_t(new crs_matrix_type(map, 5)); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); + const SC negOne = static_cast(-1.0); + for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + const GO gblRow = map->getGlobalElement(lclRow); + if(gblRow == 0) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); + } + else if(static_cast(gblRow) == globalIndices - 1) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); + } + else if(gblRow == 2 || gblRow == 9) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); + } + else if(gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } + else if(gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); + } + else { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); + } + } + A_t->fillComplete(); + RCP A_x = rcp(new TpetraCrsMatrix(A_t)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; - galeriList.set("nx", Teuchos::as(36)); + galeriList.set("nx", Teuchos::as(globalIndices)); RCP coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates("1D", A->getRowMap(), galeriList); fineLevel.Set("Coordinates", coordinates); @@ -1488,19 +1584,51 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(36 + (comm->getSize() - 1) * 2)); + TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), 35); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); - TEST_EQUALITY(myDomainMap->getGlobalNumElements(), 36); - - TEST_EQUALITY(graph->GetGlobalNumEdges(), 72); - -} // SignaledClassical + TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); + + TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); + + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int columns[28] = {0, 1, + 0, 1, + 2, + 3, 4, + 3, 4, 5, + 3, 4, 5, 6, 7, + 5, 6, 7, + 6, 7, 8, + 7, 8, + 9, + 10, 11, + 10, 11}; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; + TEST_EQUALITY(rowPtrs(0), rowID); + for(size_t i = 0; i < rowPtrs.size()-1; i++) { + auto gblID = myDomainMap->getGlobalElement(i); + int rownnz = rows[gblID+1]-rows[gblID]; + rowID += rownnz; + TEST_EQUALITY(rowPtrs(i+1), rowID); + + std::vector colID; + for(int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + } + std::sort(std::begin(colID), std::end(colID)); + for(int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + } + } +} // ClassicalUnScaledCut TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, SignaledClassical, Scalar, LocalOrdinal, GlobalOrdinal, Node) { #include @@ -1902,7 +2030,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, BlockDiagonal, Scalar, Lo coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme); coalesceDropFact.SetFactory("UnAmalgamationInfo", amalgFact); coalesceDropFact.SetFactory("BlockNumber", ibFact); - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("block diagonal"))); coalesceDropFact.SetParameter("aggregation: block diagonal: interleaved blocksize", Teuchos::ParameterEntry(3)); coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme); @@ -1949,7 +2077,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, BlockDiagonalClassical, S coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme); coalesceDropFact.SetFactory("UnAmalgamationInfo", amalgFact); coalesceDropFact.SetFactory("BlockNumber", ibFact); - coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0)); + coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125))); coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("block diagonal classical"))); coalesceDropFact.SetParameter("aggregation: block diagonal: interleaved blocksize", Teuchos::ParameterEntry(3)); fineLevel.Request("Graph", &coalesceDropFact); From 26b3eac40a0dd818b7cb5950744c5fba53087005 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Mon, 4 Nov 2024 09:11:53 -0700 Subject: [PATCH 17/25] MueLu: Fix clang-format Signed-off-by: Christian Glusa --- .../MueLu_CoalesceDropFactory_def.hpp | 285 +++++++++--------- .../test/unit_tests/CoalesceDropFactory.cpp | 118 ++++---- 2 files changed, 198 insertions(+), 205 deletions(-) diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 1f9961289cb0..e2bae01ffa21 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -490,8 +490,8 @@ void CoalesceDropFactory::Build(Level GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl; } } else { - ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - if(classicalAlgo == defaultAlgo) { + ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); + if (classicalAlgo == defaultAlgo) { ghostedDiagVals = ghostedDiag->getData(0); } } @@ -510,7 +510,7 @@ void CoalesceDropFactory::Build(Level LO realnnz = 0; rows(0) = 0; - if(classicalAlgo == defaultAlgo) { + if (classicalAlgo == defaultAlgo) { SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel); for (LO row = 0; row < Teuchos::as(A->getRowMap()->getLocalNumElements()); ++row) { size_t nnz = A->getNumEntriesInLocalRow(row); @@ -578,177 +578,180 @@ void CoalesceDropFactory::Build(Level rows(row + 1) = realnnz; } } // end for row - } - else { + } else { /* Cut Algorithm */ SubFactoryMonitor m1(*this, "Cut Drop", currentLevel); - using ExecSpace = typename Node::execution_space; - using TeamPol = Kokkos::TeamPolicy; - using TeamMem = typename TeamPol::member_type; - using ATS = Kokkos::ArithTraits; + using ExecSpace = typename Node::execution_space; + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + using ATS = Kokkos::ArithTraits; using impl_scalar_type = typename ATS::val_type; - using implATS = Kokkos::ArithTraits; + using implATS = Kokkos::ArithTraits; - //move from host to device + // move from host to device auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0); - auto thresholdKokkos = static_cast(threshold); + auto thresholdKokkos = static_cast(threshold); auto realThresholdKokkos = implATS::magnitude(thresholdKokkos); - auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); + auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns); - auto A_device = A->getLocalMatrixDevice(); - RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); - RCP importer = A->getCrsGraph()->getImporter(); + auto A_device = A->getLocalMatrixDevice(); + RCP graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A")); + RCP importer = A->getCrsGraph()->getImporter(); RCP boundaryNodesVector = Xpetra::VectorFactory::Build(graph->GetDomainMap()); RCP boundaryColumnVector; - for(size_t i = 0; i < graph->GetNodeNumVertices(); i++) { + for (size_t i = 0; i < graph->GetNodeNumVertices(); i++) { boundaryNodesVector->getDataNonConst(0)[i] = boundaryNodes[i]; } - if(!importer.is_null()) { + if (!importer.is_null()) { boundaryColumnVector = Xpetra::VectorFactory::Build(graph->GetImportMap()); boundaryColumnVector->doImport(*boundaryNodesVector, *importer, Xpetra::INSERT); - } - else { + } else { boundaryColumnVector = boundaryNodesVector; } auto boundaryColumn = boundaryColumnVector->getDeviceLocalView(Xpetra::Access::ReadOnly); - auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0); + auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0); - Kokkos::ViewrownnzView("rownnzView", A_device.numRows()); - auto drop_views = Kokkos::View("drop_views", A_device.nnz()); + Kokkos::View rownnzView("rownnzView", A_device.numRows()); + auto drop_views = Kokkos::View("drop_views", A_device.nnz()); auto index_views = Kokkos::View("index_views", A_device.nnz()); - Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { - LO row = teamMember.league_rank(); - auto rowView = A_device.rowConst(row); - size_t nnz = rowView.length; - - auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1))); - - //find magnitudes - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) { - index_view(colID) = colID; - LO col = rowView.colidx(colID); - //ignore diagonals for now, they are checked again later - //Don't aggregate boundaries - if(row == col || boundary(col)) { - drop_view(colID) = true; - } - else { - drop_view(colID) = false; - } - }); - - size_t dropStart = nnz; - if (classicalAlgo == unscaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - return x_aij > y_aij; - } - }); + Kokkos::parallel_reduce( + "classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) { + LO row = teamMember.league_rank(); + auto rowView = A_device.rowConst(row); + size_t nnz = rowView.length; - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename implATS::magnitudeType x_aij = 0; - typename implATS::magnitudeType y_aij = 0; - if(!drop_view(x)) { - x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - } - if(!drop_view(y)) { - y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - } + auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row + 1))); + auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row + 1))); - if(realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) { - if(i < min) { - min = i; + // find magnitudes + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) { + index_view(colID) = colID; + LO col = rowView.colidx(colID); + // ignore diagonals for now, they are checked again later + // Don't aggregate boundaries + if (row == col || boundary(col)) { + drop_view(colID) = true; + } else { + drop_view(colID) = false; } - } - }, Kokkos::Min(dropStart)); - } else if (classicalAlgo == scaled_cut) { - //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left - Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { - if(drop_view(x) || drop_view(y)) { - return drop_view(x) < drop_view(y); - } - else { - auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); - auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); - return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); - } - }); + }); - //find index where dropping starts - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { - auto const& x = index_view(i - 1); - auto const& y = index_view(i); - typename implATS::magnitudeType x_val = 0; - typename implATS::magnitudeType y_val = 0; - if(!drop_view(x)) { - typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); - typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); - x_val = x_aij / x_aiiajj; - } - if(!drop_view(y)) { - typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); - typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); - y_val = y_aij / y_aiiajj; - } + size_t dropStart = nnz; + if (classicalAlgo == unscaled_cut) { + // push diagonals and boundaries to the right, sort everything else by aij on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if (drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + return x_aij > y_aij; + } + }); - if(realThresholdKokkos * realThresholdKokkos * x_val > y_val) { - if(i < min) { - min = i; - } + // find index where dropping starts + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_aij = 0; + typename implATS::magnitudeType y_aij = 0; + if (!drop_view(x)) { + x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + } + if (!drop_view(y)) { + y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + } + + if (realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) { + if (i < min) { + min = i; + } + } + }, + Kokkos::Min(dropStart)); + } else if (classicalAlgo == scaled_cut) { + // push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left + Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool { + if (drop_view(x) || drop_view(y)) { + return drop_view(x) < drop_view(y); + } else { + auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + return (x_aij / x_aiiajj) > (y_aij / y_aiiajj); + } + }); + + // find index where dropping starts + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) { + auto const& x = index_view(i - 1); + auto const& y = index_view(i); + typename implATS::magnitudeType x_val = 0; + typename implATS::magnitudeType y_val = 0; + if (!drop_view(x)) { + typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x)); + typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row)); + x_val = x_aij / x_aiiajj; + } + if (!drop_view(y)) { + typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y)); + typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row)); + y_val = y_aij / y_aiiajj; + } + + if (realThresholdKokkos * realThresholdKokkos * x_val > y_val) { + if (i < min) { + min = i; + } + } + }, + Kokkos::Min(dropStart)); } - }, Kokkos::Min(dropStart)); - } - //drop everything to the right of where values stop passing threshold - if(dropStart < nnz) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) { - drop_view(index_view(i)) = true; - }); - } + // drop everything to the right of where values stop passing threshold + if (dropStart < nnz) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) { + drop_view(index_view(i)) = true; + }); + } - LO rownnz = 0; - GO rowDropped = 0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) { - LO col = rowView.colidx(idxID); - //don't drop diagonal - if(row == col || !drop_view(idxID)) { - columnsDevice(A_device.graph.row_map(row) + idxID) = col; - keep++; - } - else { - columnsDevice(A_device.graph.row_map(row) + idxID) = -1; - drop++; - } - }, rownnz, rowDropped); + LO rownnz = 0; + GO rowDropped = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) { + LO col = rowView.colidx(idxID); + // don't drop diagonal + if (row == col || !drop_view(idxID)) { + columnsDevice(A_device.graph.row_map(row) + idxID) = col; + keep++; + } else { + columnsDevice(A_device.graph.row_map(row) + idxID) = -1; + drop++; + } + }, + rownnz, rowDropped); - globalnnz += rownnz; - totalDropped += rowDropped; - rownnzView(row) = rownnz; - }, realnnz, numDropped); + globalnnz += rownnz; + totalDropped += rowDropped; + rownnzView(row) = rownnz; + }, + realnnz, numDropped); - //update column indices so that kept indices are aligned to the left for subview that happens later on + // update column indices so that kept indices are aligned to the left for subview that happens later on Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1); Kokkos::deep_copy(columns, columnsDevice); - //update row indices by adding up new # of nnz in each row + // update row indices by adding up new # of nnz in each row auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows); - Kokkos::parallel_scan(Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { - partial_sum += rownnzView(i); - if(is_final) rowsDevice(i+1) = partial_sum; - }); + Kokkos::parallel_scan( + Kokkos::RangePolicy(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) { + partial_sum += rownnzView(i); + if (is_final) rowsDevice(i + 1) = partial_sum; + }); Kokkos::deep_copy(rows, rowsDevice); } diff --git a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp index 0073ca7e9bfb..7ec8dbe27a3a 100644 --- a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp +++ b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp @@ -1402,36 +1402,31 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); const global_size_t globalIndices = 12; - const GO indexBase = 0; - RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); RCP A_t(new crs_matrix_type(map, 5)); - const SC two = static_cast(2.0); - const SC one = static_cast(1.0); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); const SC negOne = static_cast(-1.0); - for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + for (LO lclRow = 0; lclRow < static_cast(map->getLocalNumElements()); lclRow++) { const GO gblRow = map->getGlobalElement(lclRow); - if(gblRow == 0) { + if (gblRow == 0) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); - } - else if(static_cast(gblRow) == globalIndices - 1) { + } else if (static_cast(gblRow) == globalIndices - 1) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); - } - else if(gblRow == 2 || gblRow == 9) { + } else if (gblRow == 2 || gblRow == 9) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); - } - else if(gblRow == 5) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); - } - else if(gblRow == 6) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); - } - else { + } else if (gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } else if (gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, two, two, two, negOne)); + } else { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); } } A_t->fillComplete(); RCP A_x = rcp(new TpetraCrsMatrix(A_t)); - RCP A = rcp(new CrsMatrixWrap(A_x)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; @@ -1461,19 +1456,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); - int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; int columns[28] = {0, 1, 0, 1, 2, @@ -1486,23 +1481,23 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala 9, 10, 11, 10, 11}; - auto rowPtrs = graph->getRowPtrs(); - auto entries = graph->getEntries(); - size_t rowID = 0; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; TEST_EQUALITY(rowPtrs(0), rowID); - for(size_t i = 0; i < rowPtrs.size()-1; i++) { + for (size_t i = 0; i < rowPtrs.size() - 1; i++) { auto gblID = myDomainMap->getGlobalElement(i); - int rownnz = rows[gblID+1]-rows[gblID]; + int rownnz = rows[gblID + 1] - rows[gblID]; rowID += rownnz; - TEST_EQUALITY(rowPtrs(i+1), rowID); + TEST_EQUALITY(rowPtrs(i + 1), rowID); std::vector colID; - for(int j = 0; j < rownnz; j++) { - colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + for (int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i) + j))); } std::sort(std::begin(colID), std::end(colID)); - for(int j = 0; j < rownnz; j++) { - TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + for (int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID] + j]); } } } // ClassicalScaledCut @@ -1525,36 +1520,31 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca TestHelpers::TestFactory::createSingleLevelHierarchy(fineLevel); const global_size_t globalIndices = 12; - const GO indexBase = 0; - RCP map = rcp(new map_type(globalIndices, indexBase, comm)); + const GO indexBase = 0; + RCP map = rcp(new map_type(globalIndices, indexBase, comm)); RCP A_t(new crs_matrix_type(map, 5)); - const SC two = static_cast(2.0); - const SC one = static_cast(1.0); + const SC two = static_cast(2.0); + const SC one = static_cast(1.0); const SC negOne = static_cast(-1.0); - for(LO lclRow = 0; lclRow < static_cast (map->getLocalNumElements()); lclRow++) { + for (LO lclRow = 0; lclRow < static_cast(map->getLocalNumElements()); lclRow++) { const GO gblRow = map->getGlobalElement(lclRow); - if(gblRow == 0) { + if (gblRow == 0) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow, gblRow + 1), Teuchos::tuple(two, negOne)); - } - else if(static_cast(gblRow) == globalIndices - 1) { + } else if (static_cast(gblRow) == globalIndices - 1) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow), Teuchos::tuple(negOne, two)); - } - else if(gblRow == 2 || gblRow == 9) { + } else if (gblRow == 2 || gblRow == 9) { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow), Teuchos::tuple(one)); - } - else if(gblRow == 5) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); - } - else if(gblRow == 6) { - A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple(negOne, two, two, two, negOne)); - } - else { + } else if (gblRow == 5) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, negOne, two, negOne, negOne)); + } else if (gblRow == 6) { + A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple(negOne, two, two, two, negOne)); + } else { A_t->insertGlobalValues(gblRow, Teuchos::tuple(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple(negOne, two, negOne)); } } A_t->fillComplete(); RCP A_x = rcp(new TpetraCrsMatrix(A_t)); - RCP A = rcp(new CrsMatrixWrap(A_x)); + RCP A = rcp(new CrsMatrixWrap(A_x)); fineLevel.Set("A", A); Teuchos::ParameterList galeriList; @@ -1584,19 +1574,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca const RCP myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping! const RCP myDomainMap = graph->GetDomainMap(); - TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0); TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as(globalIndices + (comm->getSize() - 1) * 2)); - TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1); + TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices - 1); TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0); TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0); TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices); TEST_EQUALITY(graph->GetGlobalNumEdges(), 28); - int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; + int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28}; int columns[28] = {0, 1, 0, 1, 2, @@ -1609,23 +1599,23 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca 9, 10, 11, 10, 11}; - auto rowPtrs = graph->getRowPtrs(); - auto entries = graph->getEntries(); - size_t rowID = 0; + auto rowPtrs = graph->getRowPtrs(); + auto entries = graph->getEntries(); + size_t rowID = 0; TEST_EQUALITY(rowPtrs(0), rowID); - for(size_t i = 0; i < rowPtrs.size()-1; i++) { + for (size_t i = 0; i < rowPtrs.size() - 1; i++) { auto gblID = myDomainMap->getGlobalElement(i); - int rownnz = rows[gblID+1]-rows[gblID]; + int rownnz = rows[gblID + 1] - rows[gblID]; rowID += rownnz; - TEST_EQUALITY(rowPtrs(i+1), rowID); + TEST_EQUALITY(rowPtrs(i + 1), rowID); std::vector colID; - for(int j = 0; j < rownnz; j++) { - colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j))); + for (int j = 0; j < rownnz; j++) { + colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i) + j))); } std::sort(std::begin(colID), std::end(colID)); - for(int j = 0; j < rownnz; j++) { - TEST_EQUALITY(colID[j], columns[rows[gblID]+j]); + for (int j = 0; j < rownnz; j++) { + TEST_EQUALITY(colID[j], columns[rows[gblID] + j]); } } } // ClassicalUnScaledCut From 9b5fd842f76fdd9bb45d7a62aa8f0e10568f4e52 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 4 Nov 2024 11:05:14 -0700 Subject: [PATCH 18/25] Add default shell setting to CodeQL job Add default shell setting to CodeQL job specifying to use a `bash -l` login shell to clean the syntax throughout the file where these were being individually specified. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index b739518db9ef..82c7df08f3e9 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -35,19 +35,20 @@ jobs: # only required for workflows in private repositories actions: read contents: read - strategy: fail-fast: false matrix: include: - language: c-cpp build-mode: manual + defaults: + run: + shell: bash -l steps: - name: Checkout repository uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 with: @@ -65,33 +66,31 @@ jobs: env - name: Module list - shell: bash -l {0} run: | module list printenv PATH - if: matrix.build-mode == 'manual' name: Get dependencies + working-directory: ${GITHUB_WORKSPACE}/packages/framework run: | - bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container" + ./get_dependencies.sh --container + ${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container - if: matrix.build-mode == 'manual' name: Generate CMake fragments - shell: bash -lc {0} run: | git fetch origin ${GITHUB_BASE_REF} mkdir -p trilinos_build && cd trilinos_build source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables - bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake" + ${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake - if: matrix.build-mode == 'manual' name: Configure and build Trilinos - shell: bash -lc {0} + working-directory: ./trilinos_build run: | - cd trilinos_build - cmake -C genconfig_fragment.cmake -C package_enables.cmake \ -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF \ -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF \ From 6c999fcf300f274adb2e479671f7f3410e84c0b7 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 4 Nov 2024 11:12:39 -0700 Subject: [PATCH 19/25] Fix bash argument syntax Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 82c7df08f3e9..4ddc84a920b7 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -43,7 +43,7 @@ jobs: build-mode: manual defaults: run: - shell: bash -l + shell: bash -l {0} steps: - name: Checkout repository From c322f5a454f59bf0c3048df3cdd72ca4e7f1ba26 Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 4 Nov 2024 11:19:40 -0700 Subject: [PATCH 20/25] Fix working-directory workflow setting Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 4ddc84a920b7..6daaeb02e030 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -72,10 +72,9 @@ jobs: - if: matrix.build-mode == 'manual' name: Get dependencies - working-directory: ${GITHUB_WORKSPACE}/packages/framework + working-directory: ./packages/framework run: | ./get_dependencies.sh --container - ${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container - if: matrix.build-mode == 'manual' name: Generate CMake fragments From f7fdee0000c9d9d81c2ff4affb22aa740c20a2be Mon Sep 17 00:00:00 2001 From: Anderson Chauphan Date: Mon, 4 Nov 2024 12:49:18 -0700 Subject: [PATCH 21/25] Add workflow concurrency cancel and rename workflow Add workflow concurrency cancelation check for previous workflows associated with the same PR. Signed-off-by: Anderson Chauphan --- .github/workflows/codeql.yml | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 6daaeb02e030..fff932c18dc7 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,25 +1,20 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. -# -name: "CodeQL: Linear Solvers" +name: "CodeQL Security Scan" on: pull_request: - branches: [ "develop" ] + branches: + - develop types: - opened - synchronize schedule: - cron: '41 23 * * 2' +# Cancels any in progress workflows associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + permissions: contents: read From 127a471342446f90dcbd614596f6a8da8adc5738 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 5 Nov 2024 07:51:38 +0000 Subject: [PATCH 22/25] Bump actions/checkout from 4.2.1 to 4.2.2 Bumps [actions/checkout](https://github.com/actions/checkout) from 4.2.1 to 4.2.2. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871...11bd71901bbe5b1630ceea73d27597364c9af683) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/AT2.yml | 8 ++++---- .github/workflows/clang_format.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/dependency-review.yml | 2 +- .github/workflows/detect-git-lfs.yml | 2 +- .github/workflows/detect-mpi-comm-world.yml | 2 +- .github/workflows/per-commit.yml | 2 +- .github/workflows/scorecards.yml | 2 +- .github/workflows/spack.yml | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/AT2.yml b/.github/workflows/AT2.yml index b232051eddf2..c085620db33a 100644 --- a/.github/workflows/AT2.yml +++ b/.github/workflows/AT2.yml @@ -60,7 +60,7 @@ jobs: mkdir -p /home/Trilinos/src/Trilinos mkdir -p /home/Trilinos/build - name: Clone trilinos - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Repo status @@ -151,7 +151,7 @@ jobs: mkdir -p /home/Trilinos/src/Trilinos mkdir -p /home/Trilinos/build - name: Clone trilinos - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Repo status @@ -242,7 +242,7 @@ jobs: mkdir -p /home/Trilinos/src/Trilinos mkdir -p /home/Trilinos/build - name: Clone trilinos - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Repo status @@ -334,7 +334,7 @@ jobs: mkdir -p /home/Trilinos/src/Trilinos mkdir -p /home/Trilinos/build - name: Clone trilinos - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: Repo status diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml index a3fd0968ad75..d0b7392226a0 100644 --- a/.github/workflows/clang_format.yml +++ b/.github/workflows/clang_format.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: DoozyX/clang-format-lint-action@c71d0bf4e21876ebec3e5647491186f8797fde31 # v0.18.2 with: source: './packages/muelu ./packages/tempus ./packages/teko ./packages/xpetra' diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 4139508fa42b..3ee521f94e90 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -58,7 +58,7 @@ jobs: # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages steps: - name: Checkout repository - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index bf29beac76d5..955b3b3fb2d0 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -22,6 +22,6 @@ jobs: egress-policy: audit - name: 'Checkout Repository' - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: 'Dependency Review' uses: actions/dependency-review-action@4081bf99e2866ebe428fc0477b69eb4fcda7220a # v4.4.0 diff --git a/.github/workflows/detect-git-lfs.yml b/.github/workflows/detect-git-lfs.yml index ebe778088863..68595577ec7c 100644 --- a/.github/workflows/detect-git-lfs.yml +++ b/.github/workflows/detect-git-lfs.yml @@ -12,7 +12,7 @@ jobs: steps: - name: Check out code - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 diff --git a/.github/workflows/detect-mpi-comm-world.yml b/.github/workflows/detect-mpi-comm-world.yml index 1fd6790c8c86..e85d71db2f6a 100644 --- a/.github/workflows/detect-mpi-comm-world.yml +++ b/.github/workflows/detect-mpi-comm-world.yml @@ -12,7 +12,7 @@ jobs: steps: - name: Check out code - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 diff --git a/.github/workflows/per-commit.yml b/.github/workflows/per-commit.yml index 3f619a7dbbc0..80dfc8b94008 100644 --- a/.github/workflows/per-commit.yml +++ b/.github/workflows/per-commit.yml @@ -12,7 +12,7 @@ jobs: steps: - name: Check out code - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 46a2c4571aff..1ac917d3af8a 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -31,7 +31,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: persist-credentials: false diff --git a/.github/workflows/spack.yml b/.github/workflows/spack.yml index 59976c1d9b3e..3c3c01b75849 100644 --- a/.github/workflows/spack.yml +++ b/.github/workflows/spack.yml @@ -24,7 +24,7 @@ jobs: runs-on: [self-hosted, gcc-10.3.0_openmpi-4.1.6] steps: - name: Clone Trilinos - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 1 - name: Spack build From 0b3f08df7492cb9c580b902b862bfcf07097a1f1 Mon Sep 17 00:00:00 2001 From: maxfirmbach Date: Mon, 4 Nov 2024 10:01:09 -0700 Subject: [PATCH 23/25] Make AggregateQualityFactory a transfer factory Signed-off-by: maxfirmbach --- .../MueLu_NotayAggregationFactory_def.hpp | 7 --- .../MueLu_UncoupledAggregationFactory_def.hpp | 10 ----- .../MueLu_ParameterListInterpreter_def.hpp | 43 ++++++++++--------- ...u_AggregateQualityEstimateFactory_decl.hpp | 13 +++--- ...Lu_AggregateQualityEstimateFactory_def.hpp | 26 +++++------ .../aggregatequalities.xml | 7 +++ .../aggregatequalities.xml | 6 +-- .../Output/aggregatequalities_epetra.gold | 20 ++++----- .../Output/aggregatequalities_tpetra.gold | 20 ++++----- .../AggregateQualityEstimateFactory.cpp | 37 ++++++++++------ 10 files changed, 98 insertions(+), 91 deletions(-) rename packages/muelu/src/{Misc => Utils}/MueLu_AggregateQualityEstimateFactory_decl.hpp (88%) rename packages/muelu/src/{Misc => Utils}/MueLu_AggregateQualityEstimateFactory_def.hpp (96%) create mode 100644 packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml diff --git a/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp b/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp index b432ffb1d868..40f4635e0b3d 100644 --- a/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp +++ b/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp @@ -55,7 +55,6 @@ RCP NotayAggregationFactorysetEntry(name, MasterList::getEntry(name)) SET_VALID_ENTRY("aggregation: pairwise: size"); SET_VALID_ENTRY("aggregation: pairwise: tie threshold"); - SET_VALID_ENTRY("aggregation: compute aggregate qualities"); SET_VALID_ENTRY("aggregation: Dirichlet threshold"); SET_VALID_ENTRY("aggregation: ordering"); #undef SET_VALID_ENTRY @@ -64,21 +63,15 @@ RCP NotayAggregationFactoryset>("A", null, "Generating factory of the matrix"); validParamList->set>("Graph", null, "Generating factory of the graph"); validParamList->set>("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'"); - validParamList->set>("AggregateQualities", null, "Generating factory for variable \'AggregateQualities\'"); return validParamList; } template void NotayAggregationFactory::DeclareInput(Level& currentLevel) const { - const ParameterList& pL = GetParameterList(); - Input(currentLevel, "A"); Input(currentLevel, "Graph"); Input(currentLevel, "DofsPerNode"); - if (pL.get("aggregation: compute aggregate qualities")) { - Input(currentLevel, "AggregateQualities"); - } } template diff --git a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp index fdbb1106294c..386451d1cfc3 100644 --- a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp +++ b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp @@ -75,14 +75,12 @@ RCP UncoupledAggregationFactoryset>("Graph", null, "Generating factory of the graph"); validParamList->set>("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'"); - validParamList->set>("AggregateQualities", null, "Generating factory for variable \'AggregateQualities\'"); // special variables necessary for OnePtAggregationAlgorithm validParamList->set("OnePt aggregate map name", "", "Name of input map for single node aggregates. (default='')"); @@ -131,10 +129,6 @@ void UncoupledAggregationFactory::DeclareInpu Input(currentLevel, "nodeOnInterface"); } } - - if (pL.get("aggregation: compute aggregate qualities")) { - Input(currentLevel, "AggregateQualities"); - } } template @@ -375,10 +369,6 @@ void UncoupledAggregationFactory::Build(Level aggregates->ComputeAggregateSizes(true /*forceRecompute*/); Set(currentLevel, "Aggregates", aggregates); - - if (pL.get("aggregation: compute aggregate qualities")) { - RCP> aggQualities = Get>>(currentLevel, "AggregateQualities"); - } } template diff --git a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp index e46d286abb90..207791bf5b5b 100644 --- a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp +++ b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp @@ -1098,7 +1098,6 @@ void ParameterListInterpreter:: MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: preserve Dirichlet points", bool, aggParams); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: error on nodes with no on-rank neighbors", bool, aggParams); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: phase3 avoid singletons", bool, aggParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, aggParams); aggFactory->SetParameterList(aggParams); // make sure that the aggregation factory has all necessary data aggFactory->SetFactory("DofsPerNode", manager.GetFactory("Graph")); @@ -1180,7 +1179,6 @@ void ParameterListInterpreter:: MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: pairwise: tie threshold", double, aggParams); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: Dirichlet threshold", double, aggParams); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: ordering", std::string, aggParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, aggParams); aggFactory->SetParameterList(aggParams); aggFactory->SetFactory("DofsPerNode", manager.GetFactory("Graph")); aggFactory->SetFactory("Graph", manager.GetFactory("Graph")); @@ -1200,25 +1198,6 @@ void ParameterListInterpreter:: coarseMap->SetFactory("Aggregates", manager.GetFactory("Aggregates")); manager.SetFactory("CoarseMap", coarseMap); - // Aggregate qualities - if (MUELU_TEST_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, true)) { - RCP aggQualityFact = rcp(new AggregateQualityEstimateFactory()); - ParameterList aggQualityParams; - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: good aggregate threshold", double, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file output", bool, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file base", std::string, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: check symmetry", bool, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: algorithm", std::string, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: zero threshold", double, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: percentiles", Teuchos::Array, aggQualityParams); - MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: mode", std::string, aggQualityParams); - aggQualityFact->SetParameterList(aggQualityParams); - manager.SetFactory("AggregateQualities", aggQualityFact); - - assert(aggType == "uncoupled"); - aggFactory->SetFactory("AggregateQualities", aggQualityFact); - } - // Tentative P MUELU_KOKKOS_FACTORY(Ptent, TentativePFactory, TentativePFactory_kokkos); ParameterList ptentParams; @@ -1319,6 +1298,28 @@ void ParameterListInterpreter:: RAPs->SetFactory("R", manager.GetFactory("R")); } + // Aggregate qualities + if (MUELU_TEST_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, true)) { + RCP aggQualityFact = rcp(new AggregateQualityEstimateFactory()); + ParameterList aggQualityParams; + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: good aggregate threshold", double, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file output", bool, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file base", std::string, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: check symmetry", bool, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: algorithm", std::string, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: zero threshold", double, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: percentiles", Teuchos::Array, aggQualityParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: mode", std::string, aggQualityParams); + aggQualityFact->SetParameterList(aggQualityParams); + aggQualityFact->SetFactory("Aggregates", manager.GetFactory("Aggregates")); + aggQualityFact->SetFactory("CoarseMap", manager.GetFactory("CoarseMap")); + + if (!RAP.is_null()) + RAP->AddTransferFactory(aggQualityFact); + else + RAPs->AddTransferFactory(aggQualityFact); + } + if (MUELU_TEST_PARAM_2LIST(paramList, defaultList, "aggregation: export visualization data", bool, true)) { RCP aggExport = rcp(new AggregationExportFactory()); ParameterList aggExportParams; diff --git a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_decl.hpp b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_decl.hpp similarity index 88% rename from packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_decl.hpp rename to packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_decl.hpp index be87ec960139..473ad53ce0bf 100644 --- a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_decl.hpp +++ b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_decl.hpp @@ -11,7 +11,7 @@ #define MUELU_AGGREGATEQUALITYESTIMATEFACTORY_DECL_HPP #include "MueLu_ConfigDefs.hpp" -#include "MueLu_SingleLevelFactoryBase.hpp" +#include "MueLu_TwoLevelFactoryBase.hpp" #include "MueLu_AggregateQualityEstimateFactory_fwd.hpp" #include @@ -41,8 +41,11 @@ namespace MueLu { computing, 34(2), A1079-A1109. */ -template -class AggregateQualityEstimateFactory : public SingleLevelFactoryBase { +template +class AggregateQualityEstimateFactory : public TwoLevelFactoryBase { #undef MUELU_AGGREGATEQUALITYESTIMATEFACTORY_SHORT #include "MueLu_UseShortNames.hpp" @@ -70,7 +73,7 @@ class AggregateQualityEstimateFactory : public SingleLevelFactoryBase { If the Build method of this class requires some data, but the generating factory is not specified in DeclareInput, then this class will fall back to the settings in FactoryManager. */ - void DeclareInput(Level& currentLevel) const; + void DeclareInput(Level& fineLevel, Level& coarseLevel) const; //@} @@ -78,7 +81,7 @@ class AggregateQualityEstimateFactory : public SingleLevelFactoryBase { //@{ //! Build aggregate quality esimates with this factory. - void Build(Level& currentLevel) const; + void Build(Level& fineLevel, Level& coarseLevel) const; //@} diff --git a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_def.hpp b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_def.hpp similarity index 96% rename from packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_def.hpp rename to packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_def.hpp index c2c288192214..e7a2943d9969 100644 --- a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_def.hpp +++ b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_def.hpp @@ -34,10 +34,10 @@ template AggregateQualityEstimateFactory::~AggregateQualityEstimateFactory() {} template -void AggregateQualityEstimateFactory::DeclareInput(Level& currentLevel) const { - Input(currentLevel, "A"); - Input(currentLevel, "Aggregates"); - Input(currentLevel, "CoarseMap"); +void AggregateQualityEstimateFactory::DeclareInput(Level& fineLevel, Level& coarseLevel) const { + Input(fineLevel, "A"); + Input(fineLevel, "Aggregates"); + Input(fineLevel, "CoarseMap"); } template @@ -64,13 +64,13 @@ RCP AggregateQualityEstimateFactory -void AggregateQualityEstimateFactory::Build(Level& currentLevel) const { - FactoryMonitor m(*this, "Build", currentLevel); +void AggregateQualityEstimateFactory::Build(Level& fineLevel, Level& coarseLevel) const { + FactoryMonitor m(*this, "Build", fineLevel); - RCP A = Get>(currentLevel, "A"); - RCP aggregates = Get>(currentLevel, "Aggregates"); + RCP A = Get>(fineLevel, "A"); + RCP aggregates = Get>(fineLevel, "Aggregates"); - RCP map = Get>(currentLevel, "CoarseMap"); + RCP map = Get>(fineLevel, "CoarseMap"); assert(!aggregates->AggregatesCrossProcessors()); ParameterList pL = GetParameterList(); @@ -81,15 +81,15 @@ void AggregateQualityEstimateFactory: if (mode == "eigenvalue" || mode == "both") { aggregate_qualities = Xpetra::MultiVectorFactory::Build(map, 1); ComputeAggregateQualities(A, aggregates, aggregate_qualities); - OutputAggQualities(currentLevel, aggregate_qualities); + OutputAggQualities(fineLevel, aggregate_qualities); } if (mode == "size" || mode == "both") { RCP aggregate_sizes = Xpetra::VectorFactory::Build(map); ComputeAggregateSizes(A, aggregates, aggregate_sizes); - Set(currentLevel, "AggregateSizes", aggregate_sizes); - OutputAggSizes(currentLevel, aggregate_sizes); + Set(fineLevel, "AggregateSizes", aggregate_sizes); + OutputAggSizes(fineLevel, aggregate_sizes); } - Set(currentLevel, "AggregateQualities", aggregate_qualities); + Set(coarseLevel, "AggregateQualities", aggregate_qualities); } template diff --git a/packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml b/packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml new file mode 100644 index 000000000000..f732f2a3c9b5 --- /dev/null +++ b/packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml b/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml index b36abd859cdd..56565e5f4de7 100644 --- a/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml +++ b/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml @@ -47,6 +47,7 @@ + @@ -58,15 +59,14 @@ - - - + + diff --git a/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold b/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold index 3714e69e8895..5d4a2e452dab 100644 --- a/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold +++ b/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold @@ -26,13 +26,9 @@ BuildAggregatesNonKokkos (Phase 1 (main)) BuildAggregatesNonKokkos (Phase 2a (secondary)) BuildAggregatesNonKokkos (Phase 2b (expansion)) BuildAggregatesNonKokkos (Phase 3 (cleanup)) -Build (MueLu::AggregateQualityEstimateFactory) -Build (MueLu::CoarseMapFactory) Nullspace factory (MueLu::NullspaceFactory) Fine level nullspace = Nullspace -aggregate qualities: good aggregate threshold = 100 [unused] -aggregate qualities: check symmetry = 0 [unused] -aggregation: compute aggregate qualities = 1 +Build (MueLu::CoarseMapFactory) matrixmatrix: kernel params -> [empty list] matrixmatrix: kernel params -> @@ -41,6 +37,10 @@ Transpose P (MueLu::TransPFactory) matrixmatrix: kernel params -> [empty list] Computing Ac (MueLu::RAPFactory) +RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory +Build (MueLu::AggregateQualityEstimateFactory) +aggregate qualities: good aggregate threshold = 100 [unused] +aggregate qualities: check symmetry = 0 [unused] RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory Build (MueLu::CoordinatesTransferFactory) Transferring coordinates @@ -71,13 +71,9 @@ BuildAggregatesNonKokkos (Phase 1 (main)) BuildAggregatesNonKokkos (Phase 2a (secondary)) BuildAggregatesNonKokkos (Phase 2b (expansion)) BuildAggregatesNonKokkos (Phase 3 (cleanup)) -Build (MueLu::AggregateQualityEstimateFactory) -Build (MueLu::CoarseMapFactory) Nullspace factory (MueLu::NullspaceFactory) Fine level nullspace = Nullspace -aggregate qualities: good aggregate threshold = 100 [unused] -aggregate qualities: check symmetry = 0 [unused] -aggregation: compute aggregate qualities = 1 +Build (MueLu::CoarseMapFactory) matrixmatrix: kernel params -> [empty list] matrixmatrix: kernel params -> @@ -86,6 +82,10 @@ Transpose P (MueLu::TransPFactory) matrixmatrix: kernel params -> [empty list] Computing Ac (MueLu::RAPFactory) +RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory +Build (MueLu::AggregateQualityEstimateFactory) +aggregate qualities: good aggregate threshold = 100 [unused] +aggregate qualities: check symmetry = 0 [unused] RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory Build (MueLu::CoordinatesTransferFactory) Transferring coordinates diff --git a/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold b/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold index ef6897802897..4c9b7d57f952 100644 --- a/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold +++ b/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold @@ -27,13 +27,9 @@ BuildAggregatesNonKokkos (Phase 1 (main)) BuildAggregatesNonKokkos (Phase 2a (secondary)) BuildAggregatesNonKokkos (Phase 2b (expansion)) BuildAggregatesNonKokkos (Phase 3 (cleanup)) -Build (MueLu::AggregateQualityEstimateFactory) -Build (MueLu::CoarseMapFactory) Nullspace factory (MueLu::NullspaceFactory) Fine level nullspace = Nullspace -aggregate qualities: good aggregate threshold = 100 [unused] -aggregate qualities: check symmetry = 0 [unused] -aggregation: compute aggregate qualities = 1 +Build (MueLu::CoarseMapFactory) matrixmatrix: kernel params -> [empty list] matrixmatrix: kernel params -> @@ -42,6 +38,10 @@ Transpose P (MueLu::TransPFactory) matrixmatrix: kernel params -> [empty list] Computing Ac (MueLu::RAPFactory) +RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory +Build (MueLu::AggregateQualityEstimateFactory) +aggregate qualities: good aggregate threshold = 100 [unused] +aggregate qualities: check symmetry = 0 [unused] RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory Build (MueLu::CoordinatesTransferFactory) Transferring coordinates @@ -73,13 +73,9 @@ BuildAggregatesNonKokkos (Phase 1 (main)) BuildAggregatesNonKokkos (Phase 2a (secondary)) BuildAggregatesNonKokkos (Phase 2b (expansion)) BuildAggregatesNonKokkos (Phase 3 (cleanup)) -Build (MueLu::AggregateQualityEstimateFactory) -Build (MueLu::CoarseMapFactory) Nullspace factory (MueLu::NullspaceFactory) Fine level nullspace = Nullspace -aggregate qualities: good aggregate threshold = 100 [unused] -aggregate qualities: check symmetry = 0 [unused] -aggregation: compute aggregate qualities = 1 +Build (MueLu::CoarseMapFactory) matrixmatrix: kernel params -> [empty list] matrixmatrix: kernel params -> @@ -88,6 +84,10 @@ Transpose P (MueLu::TransPFactory) matrixmatrix: kernel params -> [empty list] Computing Ac (MueLu::RAPFactory) +RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory +Build (MueLu::AggregateQualityEstimateFactory) +aggregate qualities: good aggregate threshold = 100 [unused] +aggregate qualities: check symmetry = 0 [unused] RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory Build (MueLu::CoordinatesTransferFactory) Transferring coordinates diff --git a/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp b/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp index 769b47c77c19..dd095e626038 100644 --- a/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp +++ b/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp @@ -90,26 +90,40 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(AggregateQualityEstimateFactory, Poisson2D, Sc RCP> comm = Parameters::getDefaultComm(); - Level level; - TestHelpers::TestFactory::createSingleLevelHierarchy(level); + Level fineLevel, coarseLevel; + TestHelpers::TestFactory::createTwoLevelHierarchy(fineLevel, coarseLevel); GO nx = 20 * comm->getSize(); GO ny = nx; RCP Op = TestHelpers::TestFactory::Build2DPoisson(nx, ny); - level.Set("A", Op); + fineLevel.Set("A", Op); - AggregateQualityEstimateFactory aggQualityEstimateFactory; - std::cout << *(aggQualityEstimateFactory.GetValidParameterList()) << std::endl; - aggQualityEstimateFactory.SetParameter("aggregate qualities: check symmetry", Teuchos::ParameterEntry(false)); - aggQualityEstimateFactory.SetParameter("aggregate qualities: good aggregate threshold", Teuchos::ParameterEntry(100.0)); - aggQualityEstimateFactory.SetParameter("aggregate qualities: file output", Teuchos::ParameterEntry(false)); + RCP aggQualityEstimateFactory = rcp(new AggregateQualityEstimateFactory()); + aggQualityEstimateFactory->SetParameter("aggregate qualities: check symmetry", Teuchos::ParameterEntry(false)); + aggQualityEstimateFactory->SetParameter("aggregate qualities: good aggregate threshold", Teuchos::ParameterEntry(100.0)); + aggQualityEstimateFactory->SetParameter("aggregate qualities: file output", Teuchos::ParameterEntry(false)); - level.Request("AggregateQualities", &aggQualityEstimateFactory); - level.Request(aggQualityEstimateFactory); + RCP amalgFact = rcp(new AmalgamationFactory()); + RCP dropFact = rcp(new CoalesceDropFactory()); + dropFact->SetFactory("UnAmalgamationInfo", amalgFact); + RCP aggFact = rcp(new UncoupledAggregationFactory()); + aggFact->SetFactory("Graph", dropFact); + RCP coarsemapFact = Teuchos::rcp(new CoarseMapFactory()); + coarsemapFact->SetFactory("Aggregates", aggFact); + aggQualityEstimateFactory->SetFactory("Aggregates", aggFact); + aggQualityEstimateFactory->SetFactory("CoarseMap", coarsemapFact); + + coarseLevel.Request(*aggQualityEstimateFactory); + fineLevel.Request(*aggFact); + fineLevel.Request(*coarsemapFact); + + aggQualityEstimateFactory->Build(fineLevel, coarseLevel); + + coarseLevel.Request("AggregateQualities", aggQualityEstimateFactory.get()); out << "Getting aggregate qualities...\n\n"; - RCP aggQualities = level.Get>("AggregateQualities", &aggQualityEstimateFactory); + RCP aggQualities = coarseLevel.Get>("AggregateQualities", aggQualityEstimateFactory.get()); out << "Testing aggregate qualities to make sure all aggregates are of good quality...\n\n"; @@ -536,7 +550,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(AggregateQualityEstimateFactory, ConvectionDif TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory, Constructor, Scalar, LO, GO, Node) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory, Poisson2D, Scalar, LO, GO, Node) // TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory,AnisotropicDiffusion2D,Scalar,LO,GO,Node) - // TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory,ConvectionDiffusion2D,Scalar,LO,GO,Node) #include From 4e6e2ca89df63eb0050d822a60b575e2c0ddcd9c Mon Sep 17 00:00:00 2001 From: reuterb Date: Wed, 6 Nov 2024 16:40:02 -0700 Subject: [PATCH 24/25] Panzer tangent unit tests (Blocked Gather) (#13576) Refresh Gather_BlockedTpetra evaluator, put tangent capability on device, and update the unit test. --------- Signed-off-by: Bryan Reuter --- .../test/evaluator_tests/CMakeLists.txt | 7 + .../tpetra_blocked_gather_solution.cpp | 721 ++++++++++++++++++ .../Panzer_GatherSolution_BlockedTpetra.hpp | 21 +- ...nzer_GatherSolution_BlockedTpetra_impl.hpp | 178 +++-- 4 files changed, 853 insertions(+), 74 deletions(-) create mode 100644 packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp diff --git a/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt b/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt index 3b2202563a32..d871d0375cb0 100644 --- a/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt +++ b/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt @@ -43,6 +43,13 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( NUM_MPI_PROCS 2 ) +TRIBITS_ADD_EXECUTABLE_AND_TEST( + tGatherSolution_BlockedTpetra + SOURCES tpetra_blocked_gather_solution.cpp ${UNIT_TEST_DRIVER} + COMM serial mpi + NUM_MPI_PROCS 2 + ) + TRIBITS_ADD_EXECUTABLE_AND_TEST( tScatterResidual_Tpetra SOURCES tpetra_scatter_residual.cpp ${UNIT_TEST_DRIVER} diff --git a/packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp b/packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp new file mode 100644 index 000000000000..279956f8d6eb --- /dev/null +++ b/packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp @@ -0,0 +1,721 @@ +// @HEADER +// ***************************************************************************** +// Panzer: A partial differential equation assembly +// engine for strongly coupled complex multiphysics systems +// +// Copyright 2011 NTESS and the Panzer contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +/////////////////////////////////////////////////////////////////////////////// +// +// Include Files +// +/////////////////////////////////////////////////////////////////////////////// + +// C++ +#include +#include +#include + +// Kokkos +#include "Kokkos_View_Fad.hpp" + +// Panzer +#include "PanzerAdaptersSTK_config.hpp" +#include "Panzer_BasisIRLayout.hpp" +#include "Panzer_BlockedTpetraLinearObjFactory.hpp" +#include "Panzer_BlockedDOFManager.hpp" +#include "Panzer_DOFManager.hpp" +#include "Panzer_Evaluator_WithBaseImpl.hpp" +#include "Panzer_FieldManagerBuilder.hpp" +#include "Panzer_GatherOrientation.hpp" +#include "Panzer_PureBasis.hpp" +#include "Panzer_STKConnManager.hpp" +#include "Panzer_STK_Interface.hpp" +#include "Panzer_STK_SetupUtilities.hpp" +#include "Panzer_STK_SquareQuadMeshFactory.hpp" +#include "Panzer_STK_Version.hpp" +#include "Panzer_Workset.hpp" +#include "Panzer_LOCPair_GlobalEvaluationData.hpp" +#include "Panzer_GlobalEvaluationDataContainer.hpp" + +// Teuchos +#include "Teuchos_DefaultMpiComm.hpp" +#include "Teuchos_GlobalMPISession.hpp" +#include "Teuchos_OpaqueWrapper.hpp" +#include "Teuchos_RCP.hpp" +#include "Teuchos_TimeMonitor.hpp" +#include "Teuchos_UnitTestHarness.hpp" + +// Thyra +#include "Thyra_ProductVectorBase.hpp" +#include "Thyra_VectorStdOps.hpp" + +// Tpetra +#include "Tpetra_Vector.hpp" + +// user_app +#include "user_app_EquationSetFactory.hpp" + +typedef double ScalarT; +using LocalOrdinalT = panzer::LocalOrdinal; +using GlobalOrdinalT = panzer::GlobalOrdinal; + +typedef Tpetra::Vector VectorType; +typedef Tpetra::Operator OperatorType; +typedef Tpetra::CrsMatrix CrsMatrixType; +typedef Tpetra::CrsGraph CrsGraphType; +typedef Tpetra::Map MapType; +typedef Tpetra::Import ImportType; +typedef Tpetra::Export ExportType; + +typedef Thyra::TpetraLinearOp ThyraLinearOp; + +typedef panzer::BlockedTpetraLinearObjFactory BlockedTpetraLinObjFactoryType; +typedef panzer::TpetraLinearObjFactory TpetraLinObjFactoryType; +typedef panzer::BlockedTpetraLinearObjContainer BlockedTpetraLinObjContainerType; +typedef panzer::TpetraLinearObjContainer TpetraLinObjContainerType; + +namespace panzer +{ + + Teuchos::RCP buildBasis(std::size_t worksetSize, const std::string &basisName); + void testInitialization(const Teuchos::RCP &ipb); + Teuchos::RCP buildMesh(int elemX, int elemY); + void testGatherScatter(const bool enable_tangents, Teuchos::FancyOStream &out, bool &success); + + // Test without tangent fields in gather evaluator + TEUCHOS_UNIT_TEST(tpetra_assembly, gather_solution_no_tangents) + { + testGatherScatter(false, out, success); + } + + // Test with tangent fields in gather evaluator + TEUCHOS_UNIT_TEST(tpetra_assembly, gather_solution_tangents) + { + testGatherScatter(true, out, success); + } + + // enable_tangents determines whether tangent fields dx/dp are added to gather evaluator. + // These are used when computing df/dx*dx/dp with the tangent evaluation type + void testGatherScatter(const bool enable_tangents, Teuchos::FancyOStream &out, bool &success) + { +#ifdef HAVE_MPI + Teuchos::RCP> tComm = Teuchos::rcp(new Teuchos::MpiComm(MPI_COMM_WORLD)); +#else + Teuchos::RCP> tComm = Teuchos::rcp(new Teuchos::SerialComm(MPI_COMM_WORLD)); +#endif + + int myRank = tComm->getRank(); + int numProcs = tComm->getSize(); + + const std::size_t workset_size = 4 / numProcs; + const std::string fieldName1_q1 = "U"; + const std::string fieldName2_q1 = "V"; + const std::string fieldName_qedge1 = "B"; + const int num_tangent = enable_tangents ? 5 : 0; + + Teuchos::RCP mesh = buildMesh(2, 2); + + // build input physics block + Teuchos::RCP basis_q1 = buildBasis(workset_size, "Q1"); + Teuchos::RCP basis_qedge1 = buildBasis(workset_size, "QEdge1"); + + Teuchos::RCP ipb = Teuchos::parameterList(); + testInitialization(ipb); + + const int default_int_order = 1; + std::string eBlockID = "eblock-0_0"; + Teuchos::RCP eqset_factory = Teuchos::rcp(new user_app::MyFactory); + panzer::CellData cellData(workset_size, mesh->getCellTopology("eblock-0_0")); + Teuchos::RCP gd = panzer::createGlobalData(); + Teuchos::RCP physicsBlock = + Teuchos::rcp(new PhysicsBlock(ipb, eBlockID, default_int_order, cellData, eqset_factory, gd, false)); + + Teuchos::RCP> work_sets = panzer_stk::buildWorksets(*mesh, physicsBlock->elementBlockID(), + physicsBlock->getWorksetNeeds()); + TEST_EQUALITY(work_sets->size(), 1); + + // build connection manager and field manager + const Teuchos::RCP conn_manager = Teuchos::rcp(new panzer_stk::STKConnManager(mesh)); + Teuchos::RCP blocked_dofManager = Teuchos::rcp(new panzer::BlockedDOFManager(conn_manager, MPI_COMM_WORLD)); + + blocked_dofManager->addField(fieldName1_q1, Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis()))); + blocked_dofManager->addField(fieldName2_q1, Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis()))); + blocked_dofManager->addField(fieldName_qedge1, Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_qedge1->getIntrepid2Basis()))); + + std::vector > fieldOrder(3); + fieldOrder[0].push_back(fieldName1_q1); + fieldOrder[1].push_back(fieldName_qedge1); + fieldOrder[2].push_back(fieldName2_q1); + blocked_dofManager->setFieldOrder(fieldOrder); + + blocked_dofManager->buildGlobalUnknowns(); + + // setup linear object factory + ///////////////////////////////////////////////////////////// + + Teuchos::RCP t_lof = Teuchos::rcp(new BlockedTpetraLinObjFactoryType(tComm.getConst(), blocked_dofManager)); + Teuchos::RCP> lof = t_lof; + Teuchos::RCP loc = t_lof->buildGhostedLinearObjContainer(); + t_lof->initializeGhostedContainer(LinearObjContainer::X, *loc); + loc->initialize(); + + Teuchos::RCP t_loc = Teuchos::rcp_dynamic_cast(loc); + Teuchos::RCP> x_vec = t_loc->get_x_th(); + Thyra::assign(x_vec.ptr(), 123.0 + myRank); + + // need a place to evaluate the tangent fields, so we create a + // unblocked DOFManager and LOF and set up if needed + std::vector> tangentContainers; + Teuchos::RCP dofManager = Teuchos::rcp(new panzer::DOFManager(conn_manager, MPI_COMM_WORLD)); + Teuchos::RCP tangent_lof = Teuchos::rcp(new TpetraLinObjFactoryType(tComm.getConst(), dofManager)); + Teuchos::RCP> parent_tangent_lof = tangent_lof; + + if (enable_tangents) + { + using Teuchos::RCP; + using Teuchos::rcp; + using Teuchos::rcp_dynamic_cast; + using Thyra::ProductVectorBase; + using LOCPair = panzer::LOCPair_GlobalEvaluationData; + + std::vector tangent_fieldOrder; + for (int i(0); i < num_tangent; ++i) { + std::stringstream ssedge; + ssedge << fieldName_qedge1 << " Tangent " << i; + std::stringstream ss1, ss2; + ss1 << fieldName1_q1 << " Tangent " << i; + ss2 << fieldName2_q1 << " Tangent " << i; + + dofManager->addField(ss1.str(), Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis()))); + dofManager->addField(ss2.str(), Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis()))); + dofManager->addField(ssedge.str(), Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_qedge1->getIntrepid2Basis()))); + tangent_fieldOrder.push_back(ss1.str()); + tangent_fieldOrder.push_back(ss2.str()); + tangent_fieldOrder.push_back(ssedge.str()); + } + dofManager->setFieldOrder(tangent_fieldOrder); + dofManager->buildGlobalUnknowns(); + + // generate and evaluate some fields + Teuchos::RCP tangent_loc = tangent_lof->buildGhostedLinearObjContainer(); + tangent_lof->initializeGhostedContainer(LinearObjContainer::X, *tangent_loc); + tangent_loc->initialize(); + + for (int i(0); i < num_tangent; ++i) + { + auto locPair = Teuchos::rcp(new LOCPair(tangent_lof, panzer::LinearObjContainer::X)); + + auto global_t_loc = rcp_dynamic_cast(locPair->getGlobalLOC()); + Teuchos::RCP> global_x_vec = global_t_loc->get_x_th(); + Thyra::assign(global_x_vec.ptr(), 0.123 + myRank + i); + + auto ghosted_t_loc = rcp_dynamic_cast(locPair->getGhostedLOC()); + Teuchos::RCP> ghosted_x_vec = ghosted_t_loc->get_x_th(); + Thyra::assign(ghosted_x_vec.ptr(), 0.123 + myRank + i); + + tangentContainers.push_back(locPair); + } // end loop over the tangents + } // end if (enable_tangents) + + // setup field manager, add evaluator under test + ///////////////////////////////////////////////////////////// + + PHX::FieldManager fm; + + std::vector derivative_dimensions; + derivative_dimensions.push_back(12); + fm.setKokkosExtendedDataTypeDimensions(derivative_dimensions); + + std::vector tan_derivative_dimensions; + if (enable_tangents) + tan_derivative_dimensions.push_back(num_tangent); + else + tan_derivative_dimensions.push_back(0); + fm.setKokkosExtendedDataTypeDimensions(tan_derivative_dimensions); + + Teuchos::RCP evalField_q1, evalField_qedge1; + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName1_q1); + names->push_back(fieldName2_q1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_q1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 2); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName_qedge1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_qedge1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 1); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName1_q1); + names->push_back(fieldName2_q1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_q1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 2); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName_qedge1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_qedge1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 1); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName1_q1); + names->push_back(fieldName2_q1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_q1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + if (enable_tangents) + { + RCP>> tangent_names = + rcp(new std::vector>(2)); + for (int i = 0; i < num_tangent; ++i) + { + std::stringstream ss1, ss2; + ss1 << fieldName1_q1 << " Tangent " << i; + ss2 << fieldName2_q1 << " Tangent " << i; + (*tangent_names)[0].push_back(ss1.str()); + (*tangent_names)[1].push_back(ss2.str()); + } + pl.set("Tangent Names", tangent_names); + } + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 2); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + names->push_back(fieldName_qedge1); + + Teuchos::ParameterList pl; + pl.set("Basis", basis_qedge1); + pl.set("DOF Names", names); + pl.set("Indexer Names", names); + + if (enable_tangents) + { + RCP>> tangent_names = + rcp(new std::vector>(1)); + for (int i = 0; i < num_tangent; ++i) + { + std::stringstream ss; + ss << fieldName_qedge1 << " Tangent " << i; + (*tangent_names)[0].push_back(ss.str()); + } + pl.set("Tangent Names", tangent_names); + } + + Teuchos::RCP> evaluator = lof->buildGather(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 1); + + fm.registerEvaluator(evaluator); + fm.requireField(*evaluator->evaluatedFields()[0]); + } + + if (enable_tangents) + { + for (int i = 0; i < num_tangent; ++i) + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + RCP> tangent_names = rcp(new std::vector); + names->push_back(fieldName1_q1); + names->push_back(fieldName2_q1); + { + std::stringstream ss1, ss2; + ss1 << fieldName1_q1 << " Tangent " << i; + ss2 << fieldName2_q1 << " Tangent " << i; + tangent_names->push_back(ss1.str()); + tangent_names->push_back(ss2.str()); + } + + Teuchos::ParameterList pl; + pl.set("Basis", basis_q1); + pl.set("DOF Names", tangent_names); + pl.set("Indexer Names", tangent_names); + + { + std::stringstream ss; + ss << "Tangent Container " << i; + pl.set("Global Data Key", ss.str()); + } + + Teuchos::RCP> evaluator = + parent_tangent_lof->buildGatherTangent(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 2); + + fm.registerEvaluator(evaluator); + } + for (int i = 0; i < num_tangent; ++i) + { + using Teuchos::RCP; + using Teuchos::rcp; + RCP> names = rcp(new std::vector); + RCP> tangent_names = rcp(new std::vector); + names->push_back(fieldName_qedge1); + { + std::stringstream ss; + ss << fieldName_qedge1 << " Tangent " << i; + tangent_names->push_back(ss.str()); + } + + Teuchos::ParameterList pl; + pl.set("Basis", basis_qedge1); + pl.set("DOF Names", tangent_names); + pl.set("Indexer Names", tangent_names); + + { + std::stringstream ss; + ss << "Tangent Container " << i; + pl.set("Global Data Key", ss.str()); + } + + Teuchos::RCP> evaluator = + parent_tangent_lof->buildGatherTangent(pl); + + TEST_EQUALITY(evaluator->evaluatedFields().size(), 1); + + fm.registerEvaluator(evaluator); + } + } + + panzer::Traits::SD sd; + + panzer::Workset &workset = (*work_sets)[0]; + workset.alpha = 0.0; + workset.beta = 2.0; // derivatives multiplied by 2 + workset.time = 0.0; + workset.evaluate_transient_terms = false; + + sd.worksets_ = work_sets; + + fm.postRegistrationSetup(sd); + + panzer::Traits::PED ped; + ped.gedc->addDataObject("Solution Gather Container", loc); + if (enable_tangents) + { + for (int i(0); i < num_tangent; ++i) + { + std::stringstream ss; + ss << "Tangent Container " << i; + ped.gedc->addDataObject(ss.str(), tangentContainers[i]); + } + } + + fm.preEvaluate(ped); + fm.evaluateFields(workset); + fm.postEvaluate(0); + + fm.preEvaluate(ped); + fm.evaluateFields(workset); + fm.postEvaluate(0); + + fm.preEvaluate(ped); + fm.evaluateFields(workset); + fm.postEvaluate(0); + + // test Residual fields + { + PHX::MDField + fieldData1_q1(fieldName1_q1, basis_q1->functional); + PHX::MDField + fieldData2_q1(fieldName2_q1, basis_qedge1->functional); + + fm.getFieldData(fieldData1_q1); + fm.getFieldData(fieldData2_q1); + + TEST_EQUALITY(fieldData1_q1.extent(0), Teuchos::as(4 / numProcs)); + TEST_EQUALITY(fieldData1_q1.extent(1), 4); + TEST_EQUALITY(fieldData2_q1.extent(0), Teuchos::as(4 / numProcs)); + TEST_EQUALITY(fieldData2_q1.extent(1), 4); + TEST_EQUALITY(fieldData1_q1.size(), Teuchos::as(4 * 4 / numProcs)); + TEST_EQUALITY(fieldData2_q1.size(), Teuchos::as(4 * 4 / numProcs)); + + auto fieldData1_q1_h = Kokkos::create_mirror_view(fieldData1_q1.get_static_view()); + auto fieldData2_q1_h = Kokkos::create_mirror_view(fieldData2_q1.get_static_view()); + Kokkos::deep_copy(fieldData1_q1_h, fieldData1_q1.get_static_view()); + Kokkos::deep_copy(fieldData2_q1_h, fieldData2_q1.get_static_view()); + + for (unsigned int i = 0; i < fieldData1_q1.extent(0); i++) + for (unsigned int j = 0; j < fieldData1_q1.extent(1); j++) + TEST_EQUALITY(fieldData1_q1_h(i, j), 123.0 + myRank); + + for (unsigned int i = 0; i < fieldData2_q1.extent(0); i++) + for (unsigned int j = 0; j < fieldData2_q1.extent(1); j++) + TEST_EQUALITY(fieldData2_q1_h(i, j), 123.0 + myRank); + } + { + PHX::MDField + fieldData_qedge1(fieldName_qedge1, basis_qedge1->functional); + + fm.getFieldData(fieldData_qedge1); + + auto fieldData_qedge1_h = Kokkos::create_mirror_view(fieldData_qedge1.get_static_view()); + Kokkos::deep_copy(fieldData_qedge1_h, fieldData_qedge1.get_static_view()); + + TEST_EQUALITY(fieldData_qedge1.extent(0), Teuchos::as(4 / numProcs)); + TEST_EQUALITY(fieldData_qedge1.extent(1), 4); + TEST_EQUALITY(fieldData_qedge1.size(), Teuchos::as(4 * 4 / numProcs)); + + for (unsigned int cell = 0; cell < fieldData_qedge1.extent(0); ++cell) + for (unsigned int pt = 0; pt < fieldData_qedge1.extent(1); pt++) + TEST_EQUALITY(fieldData_qedge1_h(cell, pt), 123.0 + myRank); + } + + // test Jacobian fields + { + PHX::MDField + fieldData1_q1(fieldName1_q1, basis_q1->functional); + PHX::MDField + fieldData2_q1(fieldName2_q1, basis_qedge1->functional); + + fm.getFieldData(fieldData1_q1); + fm.getFieldData(fieldData2_q1); + + auto fieldData1_q1_h = Kokkos::create_mirror_view(fieldData1_q1.get_static_view()); + auto fieldData2_q1_h = Kokkos::create_mirror_view(fieldData2_q1.get_static_view()); + Kokkos::deep_copy(fieldData1_q1_h, fieldData1_q1.get_static_view()); + Kokkos::deep_copy(fieldData2_q1_h, fieldData2_q1.get_static_view()); + + for (unsigned int cell = 0; cell < fieldData1_q1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData1_q1.extent(1); pt++) + { + TEST_EQUALITY(fieldData1_q1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData1_q1_h(cell, pt).availableSize(), 12); + } + } + for (unsigned int cell = 0; cell < fieldData2_q1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData2_q1.extent(1); pt++) + { + TEST_EQUALITY(fieldData2_q1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData2_q1_h(cell, pt).availableSize(), 12); + } + } + } + { + PHX::MDField + fieldData_qedge1(fieldName_qedge1, basis_qedge1->functional); + + fm.getFieldData(fieldData_qedge1); + + auto fieldData_qedge1_h = Kokkos::create_mirror_view(fieldData_qedge1.get_static_view()); + Kokkos::deep_copy(fieldData_qedge1_h, fieldData_qedge1.get_static_view()); + + for (unsigned int cell = 0; cell < fieldData_qedge1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData_qedge1.extent(1); ++pt) + { + TEST_EQUALITY(fieldData_qedge1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData_qedge1_h(cell, pt).availableSize(), 12); + } + } + } + + // test Tangent fields + { + PHX::MDField + fieldData1_q1(fieldName1_q1, basis_q1->functional); + PHX::MDField + fieldData2_q1(fieldName2_q1, basis_qedge1->functional); + + fm.getFieldData(fieldData1_q1); + fm.getFieldData(fieldData2_q1); + + auto fieldData1_q1_h = Kokkos::create_mirror_view(fieldData1_q1.get_static_view()); + auto fieldData2_q1_h = Kokkos::create_mirror_view(fieldData2_q1.get_static_view()); + Kokkos::deep_copy(fieldData1_q1_h, fieldData1_q1.get_static_view()); + Kokkos::deep_copy(fieldData2_q1_h, fieldData2_q1.get_static_view()); + + for (unsigned int cell = 0; cell < fieldData1_q1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData1_q1.extent(1); pt++) + { + if (enable_tangents) + { + TEST_EQUALITY(fieldData1_q1_h(cell, pt).val(), 123.0 + myRank); + TEST_EQUALITY(fieldData1_q1_h(cell, pt).availableSize(), num_tangent); + for (int i = 0; i < num_tangent; ++i) + TEST_EQUALITY(fieldData1_q1_h(cell, pt).dx(i), 0.123 + myRank + i); + } + else + { + TEST_EQUALITY(fieldData1_q1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData1_q1_h(cell, pt).availableSize(), 0); + } + } + } + for (unsigned int cell = 0; cell < fieldData2_q1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData2_q1.extent(1); pt++) + { + if (enable_tangents) + { + TEST_EQUALITY(fieldData2_q1_h(cell, pt).val(), 123.0 + myRank); + TEST_EQUALITY(fieldData2_q1_h(cell, pt).availableSize(), num_tangent); + for (int i = 0; i < num_tangent; ++i) + { + TEST_EQUALITY(fieldData2_q1_h(cell, pt).dx(i), 0.123 + myRank + i); + TEST_EQUALITY(fieldData2_q1_h(cell, pt).dx(i), 0.123 + myRank + i); + } + } + else + { + TEST_EQUALITY(fieldData2_q1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData2_q1_h(cell, pt).availableSize(), 0); + } + } + } + } + { + PHX::MDField + fieldData_qedge1(fieldName_qedge1, basis_qedge1->functional); + + fm.getFieldData(fieldData_qedge1); + + auto fieldData_qedge1_h = Kokkos::create_mirror_view(fieldData_qedge1.get_static_view()); + Kokkos::deep_copy(fieldData_qedge1_h, fieldData_qedge1.get_static_view()); + + for (unsigned int cell = 0; cell < fieldData_qedge1.extent(0); ++cell) + { + for (unsigned int pt = 0; pt < fieldData_qedge1.extent(1); ++pt) + { + if (enable_tangents) + { + TEST_EQUALITY(fieldData_qedge1_h(cell, pt).val(), 123.0 + myRank); + TEST_EQUALITY(fieldData_qedge1_h(cell, pt).availableSize(), num_tangent); + for (int i = 0; i < num_tangent; ++i) + TEST_EQUALITY(fieldData_qedge1_h(cell, pt).dx(i), 0.123 + myRank + i); + } + else + { + TEST_EQUALITY(fieldData_qedge1_h(cell, pt), 123.0 + myRank); + TEST_EQUALITY(fieldData_qedge1_h(cell, pt).availableSize(), 0); + } + } + } + } + } + + Teuchos::RCP buildBasis(std::size_t worksetSize, const std::string &basisName) + { + Teuchos::RCP topo = + Teuchos::rcp(new shards::CellTopology(shards::getCellTopologyData>())); + + panzer::CellData cellData(worksetSize, topo); + return Teuchos::rcp(new panzer::PureBasis(basisName, 1, cellData)); + } + + Teuchos::RCP buildMesh(int elemX, int elemY) + { + Teuchos::RCP pl = rcp(new Teuchos::ParameterList); + pl->set("X Blocks", 1); + pl->set("Y Blocks", 1); + pl->set("X Elements", elemX); + pl->set("Y Elements", elemY); + + panzer_stk::SquareQuadMeshFactory factory; + factory.setParameterList(pl); + Teuchos::RCP mesh = factory.buildUncommitedMesh(MPI_COMM_WORLD); + factory.completeMeshConstruction(*mesh, MPI_COMM_WORLD); + + return mesh; + } + + void testInitialization(const Teuchos::RCP &ipb) + { + // Physics block + ipb->setName("test physics"); + { + Teuchos::ParameterList &p = ipb->sublist("a"); + p.set("Type", "Energy"); + p.set("Prefix", ""); + p.set("Model ID", "solid"); + p.set("Basis Type", "HGrad"); + p.set("Basis Order", 1); + p.set("Integration Order", 1); + } + { + Teuchos::ParameterList &p = ipb->sublist("b"); + p.set("Type", "Energy"); + p.set("Prefix", "ION_"); + p.set("Model ID", "solid"); + p.set("Basis Type", "HCurl"); + p.set("Basis Order", 1); + p.set("Integration Order", 1); + } + } + +} diff --git a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp index aec43b41dfbc..6d9bde9d1a3b 100644 --- a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp +++ b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp @@ -163,7 +163,7 @@ class GatherSolution_BlockedTpetra public: GatherSolution_BlockedTpetra(const Teuchos::RCP & indexer) - : gidIndexer_(indexer) {} + : globalIndexer_(indexer) {} GatherSolution_BlockedTpetra(const Teuchos::RCP & indexer, const Teuchos::ParameterList& p); @@ -176,13 +176,13 @@ class GatherSolution_BlockedTpetra void evaluateFields(typename TRAITS::EvalData d); virtual Teuchos::RCP clone(const Teuchos::ParameterList & pl) const - { return Teuchos::rcp(new GatherSolution_BlockedTpetra(gidIndexer_,pl)); } + { return Teuchos::rcp(new GatherSolution_BlockedTpetra(globalIndexer_,pl)); } private: typedef typename panzer::Traits::Tangent EvalT; typedef typename panzer::Traits::Tangent::ScalarT ScalarT; - //typedef typename panzer::Traits::RealType RealT; + typedef typename panzer::Traits::RealType RealT; typedef BlockedTpetraLinearObjContainer ContainerType; typedef Tpetra::Vector VectorType; @@ -194,10 +194,14 @@ class GatherSolution_BlockedTpetra // maps the local (field,element,basis) triplet to a global ID // for scattering - Teuchos::RCP gidIndexer_; + Teuchos::RCP globalIndexer_; std::vector fieldIds_; // field IDs needing mapping + //! Returns the index into the Thyra ProductVector sub-block. Size + //! of number of fields to scatter + std::vector productVectorBlockIndex_; + std::vector< PHX::MDField > gatherFields_; std::vector indexerNames_; @@ -206,9 +210,16 @@ class GatherSolution_BlockedTpetra Teuchos::RCP > blockedContainer_; + //! Local indices for unknowns + PHX::View worksetLIDs_; + + //! Offset into the cell lids for each field. Size of number of fields to scatter. + std::vector> fieldOffsets_; + // Fields for storing tangent components dx/dp of solution vector x bool has_tangent_fields_; - std::vector< std::vector< PHX::MDField > > tangentFields_; + std::vector< std::vector< PHX::MDField > > tangentFields_; + PHX::ViewOfViews<2,PHX::View> tangentFieldsVoV_; GatherSolution_BlockedTpetra(); }; diff --git a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp index b0ef54fdd70b..52488585d37e 100644 --- a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp +++ b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp @@ -8,8 +8,8 @@ // ***************************************************************************** // @HEADER -#ifndef PANZER_GATHER_SOLUTION_BLOCKED_EPETRA_IMPL_HPP -#define PANZER_GATHER_SOLUTION_BLOCKED_EPETRA_IMPL_HPP +#ifndef PANZER_GATHER_SOLUTION_BLOCKED_TPETRA_IMPL_HPP +#define PANZER_GATHER_SOLUTION_BLOCKED_TPETRA_IMPL_HPP #include "Teuchos_Assert.hpp" #include "Phalanx_DataLayout.hpp" @@ -216,7 +216,7 @@ panzer::GatherSolution_BlockedTpetra & indexer, const Teuchos::ParameterList& p) - : gidIndexer_(indexer) + : globalIndexer_(indexer) , has_tangent_fields_(false) { typedef std::vector< std::vector > vvstring; @@ -250,7 +250,7 @@ GatherSolution_BlockedTpetra( tangentFields_[fd].resize(tangent_field_names[fd].size()); for (std::size_t i=0; i(tangent_field_names[fd][i],basis->functional); + PHX::MDField(tangent_field_names[fd][i],basis->functional); this->addDependentField(tangentFields_[fd][i]); } } @@ -268,17 +268,60 @@ GatherSolution_BlockedTpetra( // ********************************************************************** template void panzer::GatherSolution_BlockedTpetra:: -postRegistrationSetup(typename TRAITS::SetupData /* d */, +postRegistrationSetup(typename TRAITS::SetupData d, PHX::FieldManager& /* fm */) { TEUCHOS_ASSERT(gatherFields_.size() == indexerNames_.size()); - fieldIds_.resize(gatherFields_.size()); + const Workset & workset_0 = (*d.worksets_)[0]; + const std::string blockId = this->wda(workset_0).block_id; + fieldIds_.resize(gatherFields_.size()); + fieldOffsets_.resize(gatherFields_.size()); + productVectorBlockIndex_.resize(gatherFields_.size()); + int maxElementBlockGIDCount = -1; for (std::size_t fd = 0; fd < gatherFields_.size(); ++fd) { - // get field ID from DOF manager - const std::string& fieldName = indexerNames_[fd]; - fieldIds_[fd] = gidIndexer_->getFieldNum(fieldName); + + const std::string fieldName = indexerNames_[fd]; + const int globalFieldNum = globalIndexer_->getFieldNum(fieldName); // Field number in the aggregate BlockDOFManager + productVectorBlockIndex_[fd] = globalIndexer_->getFieldBlock(globalFieldNum); + const auto& subGlobalIndexer = globalIndexer_->getFieldDOFManagers()[productVectorBlockIndex_[fd]]; + fieldIds_[fd] = subGlobalIndexer->getFieldNum(fieldName); // Field number in the sub-global-indexer + + const std::vector& offsets = subGlobalIndexer->getGIDFieldOffsets(blockId,fieldIds_[fd]); + fieldOffsets_[fd] = PHX::View("GatherSolution_BlockedTpetra(Tangent):fieldOffsets",offsets.size()); + auto hostOffsets = Kokkos::create_mirror_view(fieldOffsets_[fd]); + for (std::size_t i=0; i < offsets.size(); ++i) + hostOffsets(i) = offsets[i]; + Kokkos::deep_copy(fieldOffsets_[fd], hostOffsets); + maxElementBlockGIDCount = std::max(subGlobalIndexer->getElementBlockGIDCount(blockId),maxElementBlockGIDCount); + } + + // We will use one workset lid view for all fields, but has to be + // sized big enough to hold the largest elementBlockGIDCount in the + // ProductVector. + worksetLIDs_ = PHX::View("ScatterResidual_BlockedTpetra(Tangent):worksetLIDs", + gatherFields_[0].extent(0), + maxElementBlockGIDCount); + + // Set up storage for tangentFields using view of views + // We also need storage for the number of tangent fields associated with + // each gatherField + + if (has_tangent_fields_) { + + size_t inner_vector_max_size = 0; + for (std::size_t fd = 0; fd < tangentFields_.size(); ++fd) + inner_vector_max_size = std::max(inner_vector_max_size,tangentFields_[fd].size()); + tangentFieldsVoV_.initialize("GatherSolution_BlockedTpetra::tangentFieldsVoV_",gatherFields_.size(),inner_vector_max_size); + + for (std::size_t fd = 0; fd < gatherFields_.size(); ++fd) { + for (std::size_t i=0; i void panzer::GatherSolution_BlockedTpetra:: evaluateFields(typename TRAITS::EvalData workset) { - using Teuchos::RCP; - using Teuchos::ArrayRCP; - using Teuchos::ptrFromRef; - using Teuchos::rcp_dynamic_cast; - - using Thyra::VectorBase; - using Thyra::SpmdVectorBase; - using Thyra::ProductVectorBase; + using Teuchos::RCP; + using Teuchos::ArrayRCP; + using Teuchos::ptrFromRef; + using Teuchos::rcp_dynamic_cast; - Teuchos::FancyOStream out(Teuchos::rcpFromRef(std::cout)); - out.setShowProcRank(true); - out.setOutputToRootOnly(-1); + using Thyra::VectorBase; + using Thyra::SpmdVectorBase; + using Thyra::ProductVectorBase; - std::vector > GIDs; - std::vector LIDs; + Teuchos::FancyOStream out(Teuchos::rcpFromRef(std::cout)); + out.setShowProcRank(true); + out.setOutputToRootOnly(-1); - // for convenience pull out some objects from workset - std::string blockId = this->wda(workset).block_id; - const std::vector & localCellIds = this->wda(workset).cell_local_ids; + const PHX::View & localCellIds = this->wda(workset).getLocalCellIDs(); - Teuchos::RCP > x; - if (useTimeDerivativeSolutionVector_) - x = rcp_dynamic_cast >(blockedContainer_->get_dxdt()); - else - x = rcp_dynamic_cast >(blockedContainer_->get_x()); + Teuchos::RCP > blockedSolution; + if (useTimeDerivativeSolutionVector_) + blockedSolution = rcp_dynamic_cast >(blockedContainer_->get_dxdt()); + else + blockedSolution = rcp_dynamic_cast >(blockedContainer_->get_x()); - // gather operation for each cell in workset - for(std::size_t worksetCellIndex=0;worksetCellIndexgetFieldDOFManagers()[productVectorBlockIndex_[fieldIndex]]; + const std::string blockId = this->wda(workset).block_id; + const int num_dofs = globalIndexer_->getFieldDOFManagers()[productVectorBlockIndex_[fieldIndex]]->getElementBlockGIDCount(blockId); + blockIndexer->getElementLIDs(localCellIds,worksetLIDs_,num_dofs); + currentWorksetLIDSubBlock = productVectorBlockIndex_[fieldIndex]; + } - gidIndexer_->getElementGIDsPair(cellLocalId,GIDs,blockId); + const int blockRowIndex = productVectorBlockIndex_[fieldIndex]; + const auto& subblockSolution = *((rcp_dynamic_cast>(blockedSolution->getNonconstVectorBlock(blockRowIndex),true))->getTpetraVector()); + const auto kokkosSolution = subblockSolution.getLocalViewDevice(Tpetra::Access::ReadOnly); - // caculate the local IDs for this element - LIDs.resize(GIDs.size()); - for(std::size_t i=0;i x_map = blockedContainer_->getMapForBlock(GIDs[i].first); + // Class data fields for lambda capture + const PHX::View fieldOffsets = fieldOffsets_[fieldIndex]; + const PHX::View worksetLIDs = worksetLIDs_; + const PHX::View fieldValues = gatherFields_[fieldIndex].get_static_view(); - LIDs[i] = x_map->getLocalElement(GIDs[i].second); - } + if (has_tangent_fields_) { + const int numTangents = tangentFields_[fieldIndex].size(); + const auto tangentFieldsDevice = tangentFieldsVoV_.getViewDevice(); + const auto kokkosTangents = Kokkos::subview(tangentFieldsDevice,fieldIndex,Kokkos::ALL()); + Kokkos::parallel_for(Kokkos::RangePolicy(0,workset.num_cells), KOKKOS_LAMBDA (const int& cell) { + for (int basis=0; basis < static_cast(fieldOffsets.size()); ++basis) { + const int rowLID = worksetLIDs(cell,fieldOffsets(basis)); + fieldValues(cell,basis).zero(); + fieldValues(cell,basis).val() = kokkosSolution(rowLID,0); + for (int i_tangent=0; i_tangent(0,workset.num_cells), KOKKOS_LAMBDA (const int& cell) { + for (int basis=0; basis < static_cast(fieldOffsets.size()); ++basis) { + const int rowLID = worksetLIDs(cell,fieldOffsets(basis)); + fieldValues(cell,basis).zero(); + fieldValues(cell,basis) = kokkosSolution(rowLID,0); + } + }); + } + } - // loop over the fields to be gathered - Teuchos::ArrayRCP local_x; - for (std::size_t fieldIndex=0; fieldIndexgetFieldBlock(fieldNum); - - // grab local data for inputing - RCP > block_x = rcp_dynamic_cast >(x->getNonconstVectorBlock(indexerId)); - block_x->getLocalData(ptrFromRef(local_x)); - - const std::vector & elmtOffset = gidIndexer_->getGIDFieldOffsets(blockId,fieldNum); - - // loop over basis functions and fill the fields - for(std::size_t basis=0;basis Date: Thu, 7 Nov 2024 10:48:29 -0500 Subject: [PATCH 25/25] Remove use of Kokkos::Impl::DynRankViewFill Signed-off-by: Paul Zehner --- .../sacado/src/Kokkos_DynRankView_Fad.hpp | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/packages/sacado/src/Kokkos_DynRankView_Fad.hpp b/packages/sacado/src/Kokkos_DynRankView_Fad.hpp index e2c1d78aa81b..7e413cbc9393 100644 --- a/packages/sacado/src/Kokkos_DynRankView_Fad.hpp +++ b/packages/sacado/src/Kokkos_DynRankView_Fad.hpp @@ -988,9 +988,16 @@ void deep_copy( typename ViewTraits::non_const_value_type >::value , "Can only deep copy into non-const type" ); - Kokkos::fence(); - Kokkos::Impl::DynRankViewFill< DynRankView >( view , value ); - Kokkos::fence(); + switch(view.rank()) { + case 0: deep_copy(Impl::as_view_of_rank_n<0>(view), value); break; + case 1: deep_copy(Impl::as_view_of_rank_n<1>(view), value); break; + case 2: deep_copy(Impl::as_view_of_rank_n<2>(view), value); break; + case 3: deep_copy(Impl::as_view_of_rank_n<3>(view), value); break; + case 4: deep_copy(Impl::as_view_of_rank_n<4>(view), value); break; + case 5: deep_copy(Impl::as_view_of_rank_n<5>(view), value); break; + case 6: deep_copy(Impl::as_view_of_rank_n<6>(view), value); break; + case 7: deep_copy(Impl::as_view_of_rank_n<7>(view), value); break; + } } // Overload of deep_copy for Fad views intializing to a constant Fad @@ -1010,9 +1017,16 @@ void deep_copy( typename ViewTraits::non_const_value_type >::value , "Can only deep copy into non-const type" ); - Kokkos::fence(); - Kokkos::Impl::DynRankViewFill< DynRankView >( view , value ); - Kokkos::fence(); + switch(view.rank()) { + case 0: deep_copy(Impl::as_view_of_rank_n<0>(view), value); break; + case 1: deep_copy(Impl::as_view_of_rank_n<1>(view), value); break; + case 2: deep_copy(Impl::as_view_of_rank_n<2>(view), value); break; + case 3: deep_copy(Impl::as_view_of_rank_n<3>(view), value); break; + case 4: deep_copy(Impl::as_view_of_rank_n<4>(view), value); break; + case 5: deep_copy(Impl::as_view_of_rank_n<5>(view), value); break; + case 6: deep_copy(Impl::as_view_of_rank_n<6>(view), value); break; + case 7: deep_copy(Impl::as_view_of_rank_n<7>(view), value); break; + } } template< class DstType , class SrcType >