From e67173ac2cec396bf91655f5b3b9f7cec7e4a16f Mon Sep 17 00:00:00 2001
From: Ian Halim <ihalim@ascicgpu031.sandia.gov>
Date: Mon, 15 Jul 2024 17:15:36 -0600
Subject: [PATCH 01/25] MueLu: Cut Drop Converted to Use Kokkos

Original code within ORIGINAL ifdef. New code within NEW ifdef.
DropTol constructors are marked with KOKKOS_INLINE_FUNCTION, and its default member values are hard-coded because the Teuchos traits calls cannot run on device.
Default Algorithm and Cut Drop Algorithm split into separate for loops in NEW code.
Cut Drop converted to use Kokkos nested parallel loops.
Timers placed in new code and are commented out.
Code passes current unit tests.
Saw a speedup of about 1.5x with Cuda and 1.2x with Serial when running unit tests with 10,000,000 rows.

Signed-off-by: Ian Halim <ihalim@ascicgpu031.sandia.gov>
---
 .../MueLu_CoalesceDropFactory_decl.hpp        |    1 +
 .../MueLu_CoalesceDropFactory_def.hpp         | 1663 ++++++++++++++++-
 2 files changed, 1657 insertions(+), 7 deletions(-)
diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp
index 96b5e778f6bc..db5e9a291313 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp
@@ -160,6 +160,7 @@ class CoalesceDropFactory : public SingleLevelFactoryBase {
   //@}
 
   void Build(Level& currentLevel) const;  // Build
+  void BuildKokkos(Level& currentLevel) const;
 
  private:
   // pre-drop function
diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
index 2c421c477bde..a8befaea592b 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
@@ -61,6 +61,8 @@
 
 #include <Xpetra_IO.hpp>
 
+#include <Kokkos_NestedSort.hpp> //NEW
+#include <Kokkos_StdAlgorithms.hpp> //NEW
 #include "MueLu_CoalesceDropFactory_decl.hpp"
 
 #include "MueLu_AmalgamationFactory.hpp"
@@ -92,22 +94,30 @@ namespace MueLu {
 namespace Details {
 template <class real_type, class LO>
 struct DropTol {
+  KOKKOS_INLINE_FUNCTION //NEW
   DropTol()               = default;
+  KOKKOS_INLINE_FUNCTION //NEW
   DropTol(DropTol const&) = default;
+  KOKKOS_INLINE_FUNCTION //NEW
   DropTol(DropTol&&)      = default;
 
   DropTol& operator=(DropTol const&) = default;
   DropTol& operator=(DropTol&&)      = default;
 
+  KOKKOS_INLINE_FUNCTION //NEW
   DropTol(real_type val_, real_type diag_, LO col_, bool drop_)
     : val{val_}
     , diag{diag_}
     , col{col_}
     , drop{drop_} {}
 
-  real_type val{Teuchos::ScalarTraits<real_type>::zero()};
-  real_type diag{Teuchos::ScalarTraits<real_type>::zero()};
-  LO col{Teuchos::OrdinalTraits<LO>::invalid()};
+  real_type val{0};
+  real_type diag{0};
+  LO col{-1};
+  //NEW Can't run these host functions on device
+  //real_type val{Teuchos::ScalarTraits<real_type>::zero()};
+  //real_type diag{Teuchos::ScalarTraits<real_type>::zero()};
+  //LO col{Teuchos::OrdinalTraits<LO>::invalid()};
   bool drop{true};
 
   // CMS: Auxillary information for debugging info
@@ -414,6 +424,1645 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
     TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
     const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
 
+    /************************** RS or SA-style Classical Dropping (and variants) **************************/
+    if (algo == "classical") {
+      if (predrop_ == null) {
+        // ap: this is a hack: had to declare predrop_ as mutable
+        predrop_ = rcp(new PreDropFunctionConstVal(threshold));
+      }
+
+      if (predrop_ != null) {
+        RCP<PreDropFunctionConstVal> predropConstVal = rcp_dynamic_cast<PreDropFunctionConstVal>(predrop_);
+        TEUCHOS_TEST_FOR_EXCEPTION(predropConstVal == Teuchos::null, Exceptions::BadCast,
+                                   "MueLu::CoalesceFactory::Build: cast to PreDropFunctionConstVal failed.");
+        // If a user provided a predrop function, it overwrites the XML threshold parameter
+        SC newt = predropConstVal->GetThreshold();
+        if (newt != threshold) {
+          GetOStream(Warnings0) << "switching threshold parameter from " << threshold << " (list) to " << newt << " (user function" << std::endl;
+          threshold = newt;
+        }
+      }
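+      // At this point predrop_ is guaranteed to be non-null (it was either supplied by
+      // the user or created above), and threshold already reflects its value.
+      // Therefore, it is sufficient to check only threshold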
+      if (BlockSize == 1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) {
+        // Case 1:  scalar problem, no dropping => just use matrix graph
+        RCP<LWGraph> graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A"));
+        // Detect and record rows that correspond to Dirichlet boundary conditions
+        auto boundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
+        if (rowSumTol > 0.)
+          Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes);
+
+        graph->SetBoundaryNodeMap(boundaryNodes);
+        numTotal = A->getLocalNumEntries();
+
+        if (GetVerbLevel() & Statistics1) {
+          GO numLocalBoundaryNodes  = 0;
+          GO numGlobalBoundaryNodes = 0;
+          for (size_t i = 0; i < boundaryNodes.size(); ++i)
+            if (boundaryNodes[i])
+              numLocalBoundaryNodes++;
+          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
+          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
+          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
+        }
+
+        Set(currentLevel, "DofsPerNode", 1);
+        Set(currentLevel, "Graph", graph);
+
+      } else if ((BlockSize == 1 && threshold != STS::zero()) ||
+                 (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) ||
+                 (BlockSize == 1 && useSignedClassicalRS) ||
+                 (BlockSize == 1 && useSignedClassicalSA)) {
+        // Case 2:  scalar problem with dropping => record the column indices of undropped entries, but still use original
+        //                                          graph's map information, e.g., whether index is local
+        // OR a matrix without a CrsGraph
+
+        // allocate space for the local graph
+        typename LWGraph::row_type::non_const_type rows("rows", A->getLocalNumRows() + 1);
+        typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
+
+        using MT = typename STS::magnitudeType;
+	RCP<Vector> ghostedDiag;
+        ArrayRCP<const SC> ghostedDiagVals;
+        ArrayRCP<const MT> negMaxOffDiagonal;
+	// RS style needs the max negative off-diagonal, SA style needs the diagonal
+        if (useSignedClassicalRS) {
+          if (ghostedBlockNumber.is_null()) {
+            negMaxOffDiagonal = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixMaxMinusOffDiagonal(*A);
+            if (GetVerbLevel() & Statistics1)
+              GetOStream(Statistics1) << "Calculated max point off-diagonal" << std::endl;
+          } else {
+            negMaxOffDiagonal = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixMaxMinusOffDiagonal(*A, *ghostedBlockNumber);
+            if (GetVerbLevel() & Statistics1)
+              GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl;
+          }
+        } else {
+          ghostedDiag     = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixOverlappedDiagonal(*A);
+          ghostedDiagVals = ghostedDiag->getData(0);
+	}
+        auto boundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
+	if (rowSumTol > 0.) {
+          if (ghostedBlockNumber.is_null()) {
+            if (GetVerbLevel() & Statistics1)
+              GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl;
+            Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes);
+          } else {
+            if (GetVerbLevel() & Statistics1)
+              GetOStream(Statistics1) << "Applying block row sum criterion." << std::endl;
+            Utilities::ApplyRowSumCriterionHost(*A, *ghostedBlockNumber, rowSumTol, boundaryNodes);
+          }
+        }
+
+        LO realnnz = 0;
+        rows(0)    = 0;
+#define NEW 
+#ifdef ORIGINAL
+	for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
+	  size_t nnz          = A->getNumEntriesInLocalRow(row);
+          bool rowIsDirichlet = boundaryNodes[row];
+          ArrayView<const LO> indices;
+          ArrayView<const SC> vals;
+          A->getLocalRowView(row, indices, vals);
+
+          if (classicalAlgo == defaultAlgo) {
+            // FIXME the current predrop function uses the following
+            // FIXME    if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid )
+            // FIXME but the threshold doesn't take into account the rows' diagonal entries
+            // FIXME For now, hardwiring the dropping in here
+
+            LO rownnz = 0;
+            if (useSignedClassicalRS) {
+              // Signed classical RS style
+              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                LO col         = indices[colID];
+                MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]);
+                MT neg_aij     = -STS::real(vals[colID]);
+                /*                  if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID],
+                                     g_block_id.is_null() ? -1 :  g_block_id[row],
+                                     g_block_id.is_null() ? -1 :  g_block_id[col],
+                                     neg_aij, max_neg_aik);*/
+                if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) {
+                  columns[realnnz++] = col;
+                  rownnz++;
+                } else
+                  numDropped++;
+              }
+              rows(row + 1) = realnnz;
+            } else if (useSignedClassicalSA) {
+              // Signed classical SA style
+              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                LO col = indices[colID];
+
+                bool is_nonpositive = STS::real(vals[colID]) <= 0;
+                MT aiiajj           = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);                        // eps^2*|a_ii|*|a_jj|
+                MT aij              = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID]));  // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0
+                /*
+                if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID],
+                                     vals[colID],aij, aiiajj);
+                */
+
+                if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
+                  columns(realnnz++) = col;
+                  rownnz++;
+                } else
+                  numDropped++;
+              }
+              rows[row + 1] = realnnz;
+            } else {
+              // Standard abs classical
+              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                LO col    = indices[colID];
+                MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
+                MT aij    = STS::magnitude(vals[colID] * vals[colID]);                                            // |a_ij|^2
+
+                if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
+                  columns(realnnz++) = col;
+                  rownnz++;
+                } else
+                  numDropped++;
+              }
+              rows(row + 1) = realnnz;
+            }
+          } else {
+	    /* Cut Algorithm */
+            // CMS
+            using DropTol = Details::DropTol<real_type, LO>;
+            std::vector<DropTol> drop_vec;
+	    drop_vec.reserve(nnz);
+            const real_type zero = Teuchos::ScalarTraits<real_type>::zero();
+            const real_type one  = Teuchos::ScalarTraits<real_type>::one();
+            LO rownnz            = 0;
+            // NOTE: This probably needs to be fixed for rowsum
+
+            // find magnitudes
+	    for (LO colID = 0; colID < (LO)nnz; colID++) {
+              LO col = indices[colID];
+              if (row == col) {
+                drop_vec.emplace_back(zero, one, colID, false);
+                continue;
+              }
+
+              // Don't aggregate boundaries
+              if (boundaryNodes[colID]) continue;
+              typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
+              typename STS::magnitudeType aij    = STS::magnitude(vals[colID] * vals[colID]);                                            // |a_ij|^2
+              drop_vec.emplace_back(aij, aiiajj, colID, false);
+            }
+
+            const size_t n = drop_vec.size();
+
+            if (classicalAlgo == unscaled_cut) {
+              std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
+                return a.val > b.val;
+              });
+
+              bool drop = false;
+              for (size_t i = 1; i < n; ++i) {
+                if (!drop) {
+                  auto const& x = drop_vec[i - 1];
+                  auto const& y = drop_vec[i];
+                  auto a        = x.val;
+                  auto b        = y.val;
+                  if (a > realThreshold * b) {
+                    drop = true;
+#ifdef HAVE_MUELU_DEBUG
+                    if (distanceLaplacianCutVerbose) {
+                      std::cout << "DJS: KEEP, N, ROW:  " << i + 1 << ", " << n << ", " << row << std::endl;
+                    }
+#endif
+                  }
+                }
+                drop_vec[i].drop = drop;
+              }
+            } else if (classicalAlgo == scaled_cut) {
+              std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
+                return a.val / a.diag > b.val / b.diag;
+              });
+              bool drop = false;
+              //                  printf("[%d] Scaled Cut: ",(int)row);
+              //                  printf("%3d(%4s) ",indices[drop_vec[0].col],"keep");
+              for (size_t i = 1; i < n; ++i) {
+                if (!drop) {
+                  auto const& x = drop_vec[i - 1];
+                  auto const& y = drop_vec[i];
+                  auto a        = x.val / x.diag;
+                  auto b        = y.val / y.diag;
+                  if (a > realThreshold * b) {
+                    drop = true;
+
+#ifdef HAVE_MUELU_DEBUG
+                    if (distanceLaplacianCutVerbose) {
+                      std::cout << "DJS: KEEP, N, ROW:  " << i + 1 << ", " << n << ", " << row << std::endl;
+                    }
+#endif
+                  }
+                  //                      printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep");
+                }
+                drop_vec[i].drop = drop;
+              }
+              //                  printf("\n");
+            }
+            std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
+              return a.col < b.col;
+            });
+
+            for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) {
+              LO col = indices[drop_vec[idxID].col];
+              // don't drop diagonal
+              if (row == col) {
+                columns[realnnz++] = col;
+                rownnz++;
+                continue;
+              }
+
+              if (!drop_vec[idxID].drop) {
+                columns[realnnz++] = col;
+                rownnz++;
+              } else {
+                numDropped++;
+              }
+            }
+            // CMS
+            rows[row + 1] = realnnz;
+          }
+        }  // end for row
+#endif
+
+#ifdef NEW
+	if(classicalAlgo == defaultAlgo) {
+            	SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel);
+		for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
+        		size_t nnz          = A->getNumEntriesInLocalRow(row);
+        		bool rowIsDirichlet = boundaryNodes[row];
+        		ArrayView<const LO> indices;
+        		ArrayView<const SC> vals;
+        		A->getLocalRowView(row, indices, vals);
+
+        		// FIXME the current predrop function uses the following
+        		// FIXME    if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid )
+        		// FIXME but the threshold doesn't take into account the rows' diagonal entries
+        		// FIXME For now, hardwiring the dropping in here
+
+        		LO rownnz = 0;
+        		if (useSignedClassicalRS) {
+        			// Signed classical RS style
+            			for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                			LO col         = indices[colID];
+                			MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]);
+                			MT neg_aij     = -STS::real(vals[colID]);
+                			/*                  if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID],
+                               		 		    g_block_id.is_null() ? -1 :  g_block_id[row],
+                              		 		    g_block_id.is_null() ? -1 :  g_block_id[col],
+                               			     	    neg_aij, max_neg_aik);*/
+                			if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) {
+                  				columns[realnnz++] = col;
+                  				rownnz++;
+                			} else
+                  			numDropped++;
+              			}
+              			rows(row + 1) = realnnz;
+            		} else if (useSignedClassicalSA) {
+              			// Signed classical SA style
+              			for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                			LO col = indices[colID];
+
+			                bool is_nonpositive = STS::real(vals[colID]) <= 0;
+                			MT aiiajj           = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);                        // eps^2*|a_ii|*|a_jj|
+                			MT aij              = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID]));  // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0
+                			/*
+                			if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID],
+                                			     vals[colID],aij, aiiajj);
+                			*/
+
+                			if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
+                  				columns(realnnz++) = col;
+                  				rownnz++;
+                			} else
+                			  numDropped++;
+              			}
+              			rows[row + 1] = realnnz;
+            		} else {
+              			// Standard abs classical
+              			for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                			LO col    = indices[colID];
+                			MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
+                			MT aij    = STS::magnitude(vals[colID] * vals[colID]);                                            // |a_ij|^2
+
+                			if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
+                  				columns(realnnz++) = col;
+                  				rownnz++;
+                			} else
+                  		 	  numDropped++;
+              			}
+              			rows(row + 1) = realnnz;
+            		}
+        	}  // end for row
+	}
+	else { //NEW START
+		// Cut-drop variants: one Kokkos team per local row of A. Per row:
+		//   1. score each candidate entry (diagonal + entries not skipped as boundary),
+		//   2. sort the candidates by decreasing score,
+		//   3. cut at the first position whose predecessor exceeds realThreshold times
+		//      its own score; that position and everything after it is dropped,
+		//   4. count kept / dropped candidates (the diagonal is always kept).
+		// Rejected entries are tagged with -1 in a scratch copy of A's column indices,
+		// which is compacted into columns after the kernel. A itself is not modified.
+		//auto stackedTimer = rcp(new Teuchos::StackedTimer("timer"));
+		//Teuchos::TimeMonitor::setStackedTimer(stackedTimer);
+		//stackedTimer->start("init");
+		SubFactoryMonitor m1(*this, "Cut Drop", currentLevel);
+		using ExecSpace = typename Node::execution_space;
+		using TeamPol = Kokkos::TeamPolicy<ExecSpace>;
+		using TeamMem = typename TeamPol::member_type;
+		using DropTol = Details::DropTol<real_type, LO>;
+
+		//move from host to device
+		ArrayView<const SC>  ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size());
+		Kokkos::View<const SC*, ExecSpace> ghostedDiagValsView = Kokkos::Compat::getKokkosViewDeepCopy<ExecSpace>(ghostedDiagValsArrayView);
+		// create_mirror_view() would only allocate; the boundary flags must be copied too.
+		auto boundaryNodesDevice = Kokkos::create_mirror_view_and_copy(ExecSpace(), boundaryNodes);
+
+		auto At = Utilities::Op2TpetraCrs(A);
+		auto A_device = At->getLocalMatrixDevice();
+
+		// Scratch column indices, seeded from A; rejected entries are overwritten with -1.
+		auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns);
+		Kokkos::deep_copy(columnsDevice, A_device.graph.entries);
+
+		int algorithm = classicalAlgo;
+		Kokkos::View<LO*, ExecSpace>rownnzView("rownnzView", A_device.numRows());
+		auto drop_views = Kokkos::View<DropTol*, ExecSpace>("drop_views", A_device.nnz());
+		//stackedTimer->stop("init");
+
+		//stackedTimer->start("loop");
+		Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) {
+			const LO row = teamMember.league_rank();
+			const auto rowView = A_device.rowConst(row);
+			const size_t nnz = rowView.length;
+			const auto rowStart = A_device.graph.row_map(row);
+			auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(rowStart, A_device.graph.row_map(row+1)));
+
+			//find magnitudes; n counts the candidates of this row
+			size_t n = 0;
+			Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) {
+				LO col = rowView.colidx(colID);
+				if(row == col) {
+					drop_view(colID) = DropTol(0, 1, colID, false);
+					count++;
+				}
+				//Don't aggregate boundaries
+				// NOTE(review): indexed with colID exactly as in the ORIGINAL loop; col may be intended - confirm.
+				else if(!boundaryNodesDevice(colID)) {
+					// NOTE(review): the round trip through double assumes a real-valued SC - confirm.
+					typename STS::magnitudeType aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(col) * ghostedDiagValsView(row))));  // eps^2*|a_ii|*|a_jj|
+					typename STS::magnitudeType aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(colID) * rowView.value(colID))));                                            // |a_ij|^2
+					drop_view(colID) = DropTol(aij, aiiajj, colID, false);
+					count++;
+				}
+				else {
+					// Skipped entry: leave a placeholder (col == -1) and remove the edge.
+					drop_view(colID) = DropTol();
+					columnsDevice(rowStart + colID) = -1;
+				}
+			}, n);
+
+			const bool scaled = (algorithm == scaled_cut);
+			const bool doCut  = scaled || (algorithm == unscaled_cut);
+			// Score that drives both the ordering and the cut test.
+			auto score = [=](DropTol const& x) { return scaled ? x.val / x.diag : x.val; };
+
+			// Decreasing score. Placeholders order after every candidate, so [0, n)
+			// holds exactly the candidates and a placeholder's score is never evaluated.
+			Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTol const& a, DropTol const& b) {
+				return a.col >= 0 && (b.col < 0 || (doCut && score(a) > score(b)));
+			});
+
+			if (doCut) {
+				//find index where dropping starts
+				size_t dropStart;
+				// Keep the range [1, cutEnd) well-formed for rows without candidates.
+				const size_t cutEnd = (n > 0) ? n : 1;
+				Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, size_t(1), cutEnd), [=](size_t i, size_t& min) {
+					if(score(drop_view(i - 1)) > realThreshold * score(drop_view(i))) {
+						if(i < min) min = i;
+					}
+				}, Kokkos::Min<size_t>(dropStart));
+
+				if(dropStart < n) {
+					Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) {
+						drop_view(i).drop = true;
+					});
+				}
+				// A TeamThreadRange parallel_for has no implicit barrier: publish the flags before reading them.
+				teamMember.team_barrier();
+			}
+
+			LO rownnz = 0;
+			GO rowDropped = 0;
+			Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) {
+				const LO colID = drop_view(idxID).col;
+				//don't drop diagonal
+				if(row == rowView.colidx(colID) || !drop_view(idxID).drop) {
+					keep++;
+				}
+				else {
+					columnsDevice(rowStart + colID) = -1;
+					drop++;
+				}
+			}, rownnz, rowDropped);
+			// Every thread of the team executes this scope: contribute once per row.
+			Kokkos::single(Kokkos::PerTeam(teamMember), [&]() {
+				globalnnz += rownnz;
+				totalDropped += rowDropped;
+				rownnzView(row) = rownnz;
+			});
+		}, realnnz, numDropped);
+		//stackedTimer->stop("loop");
+
+		//stackedTimer->start("remove");
+		// Squeeze out the -1 tags; the leading realnnz entries are the kept columns in row order.
+		Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1);
+		Kokkos::deep_copy(columns, columnsDevice);
+		//stackedTimer->stop("remove");
+
+		//update row indices
+		//stackedTimer->start("scan");
+		auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows);
+		Kokkos::parallel_scan(Kokkos::RangePolicy<ExecSpace>(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) {
+			partial_sum += rownnzView(i);
+			if(is_final) rowsDevice(i+1) = partial_sum;
+		});
+		Kokkos::deep_copy(rows, rowsDevice);
+		//stackedTimer->stop("scan");
+		//stackedTimer->stop("timer");
+		//stackedTimer->report(std::cout, Teuchos::DefaultComm<int>::getComm());
+	} //NEW END
+#endif
+
+        numTotal = A->getLocalNumEntries();
+
+        if (aggregationMayCreateDirichlet) {
+          // If the only element remaining after filtering is diagonal, mark node as boundary
+          for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
+            if (rows[row + 1] - rows[row] <= 1)
+              boundaryNodes[row] = true;
+          }
+        }
+
+        RCP<LWGraph> graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), A->getRowMap(), A->getColMap(), "thresholded graph of A"));
+        graph->SetBoundaryNodeMap(boundaryNodes);
+        if (GetVerbLevel() & Statistics1) {
+          GO numLocalBoundaryNodes  = 0;
+          GO numGlobalBoundaryNodes = 0;
+          for (size_t i = 0; i < boundaryNodes.size(); ++i)
+            if (boundaryNodes(i))
+              numLocalBoundaryNodes++;
+          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
+          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
+          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
+        }
+        Set(currentLevel, "Graph", graph);
+        Set(currentLevel, "DofsPerNode", 1);
+
+        // If we're doing signed classical, we might want to block-diagonalize *after* the dropping
+        if (generateColoringGraph) {
+          RCP<LWGraph> colorGraph;
+          RCP<const Import> importer = A->getCrsGraph()->getImporter();
+          BlockDiagonalizeGraph(graph, ghostedBlockNumber, colorGraph, importer);
+          Set(currentLevel, "Coloring Graph", colorGraph);
+          // #define CMS_DUMP
+#ifdef CMS_DUMP
+          {
+            Xpetra::IO<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Write("m_regular_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast<LWGraph>(graph)->GetCrsGraph());
+            Xpetra::IO<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Write("m_color_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast<LWGraph>(colorGraph)->GetCrsGraph());
+            // int rank = graph->GetDomainMap()->getComm()->getRank();
+            // {
+            //   std::ofstream ofs(std::string("m_color_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out);
+            //   RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs));
+            //   colorGraph->print(*fancy,Debug);
+            // }
+            // {
+            //   std::ofstream ofs(std::string("m_regular_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out);
+            //   RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs));
+            //   graph->print(*fancy,Debug);
+            // }
+          }
+#endif
+        }  // end generateColoringGraph
+      } else if (BlockSize > 1 && threshold == STS::zero()) {
+        // Case 3:  Multiple DOF/node problem without dropping
+        const RCP<const Map> rowMap = A->getRowMap();
+        const RCP<const Map> colMap = A->getColMap();
+
+        graphType = "amalgamated";
+
+        // build node row map (uniqueMap) and node column map (nonUniqueMap)
+        // the arrays rowTranslation and colTranslation contain the local node id
+        // given a local dof id. The data is calculated by the AmalgamationFactory and
+        // stored in the variable container "UnAmalgamationInfo"
+        RCP<const Map> uniqueMap    = amalInfo->getNodeRowMap();
+        RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
+        Array<LO> rowTranslation    = *(amalInfo->getRowTranslation());
+        Array<LO> colTranslation    = *(amalInfo->getColTranslation());
+
+        // get number of local nodes
+        LO numRows = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
+
+        // Allocate space for the local graph
+        typename LWGraph::row_type::non_const_type rows("rows", numRows + 1);
+        typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
+
+        typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows);
+        Kokkos::deep_copy(amalgBoundaryNodes, false);
+
+        // Detect and record rows that correspond to Dirichlet boundary conditions
+        // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual.  Size
+        // TODO the array one bigger than the number of local rows, and the last entry can
+        // TODO hold the actual number of boundary nodes.  Clever, huh?
+        ArrayRCP<bool> pointBoundaryNodes;
+        pointBoundaryNodes = Teuchos::arcp_const_cast<bool>(MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows(*A, dirichletThreshold));
+        if (rowSumTol > 0.)
+          Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes);
+
+        // extract striding information
+        LO blkSize     = A->GetFixedBlockSize();  //< the full block size (number of dofs per node in strided map)
+        LO blkId       = -1;                      //< the block id within the strided map (or -1 if it is a full block map)
+        LO blkPartSize = A->GetFixedBlockSize();  //< stores the size of the block within the strided map
+        if (A->IsView("stridedMaps") == true) {
+          Teuchos::RCP<const Map> myMap         = A->getRowMap("stridedMaps");
+          Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
+          TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
+          blkSize = Teuchos::as<const LO>(strMap->getFixedBlockSize());
+          blkId   = strMap->getStridedBlockId();
+          if (blkId > -1)
+            blkPartSize = Teuchos::as<LO>(strMap->getStridingData()[blkId]);
+        }
+
+        // loop over all local nodes
+        LO realnnz = 0;
+        rows(0)    = 0;
+        Array<LO> indicesExtra;
+        for (LO row = 0; row < numRows; row++) {
+          ArrayView<const LO> indices;
+          indicesExtra.resize(0);
+
+          // By default the amalgamated row is marked as Dirichlet iff all of its point rows are
+          // Dirichlet; with "aggregation: greedy Dirichlet" a single Dirichlet point row suffices.
+          // Note that pointBoundaryNodes lives on the dof map (and not the node map), so looping
+          // over all dofs is fine here. We use blkPartSize as we work with local ids.
+          // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet)
+          // node.
+          bool isBoundary = false;
+          if (pL.get<bool>("aggregation: greedy Dirichlet") == true) {
+            for (LO j = 0; j < blkPartSize; j++) {
+              if (pointBoundaryNodes[row * blkPartSize + j]) {
+                isBoundary = true;
+                break;
+              }
+            }
+          } else {
+            isBoundary = true;
+            for (LO j = 0; j < blkPartSize; j++) {
+              if (!pointBoundaryNodes[row * blkPartSize + j]) {
+                isBoundary = false;
+                break;
+              }
+            }
+          }
+
+          // Merge rows of A
+          // The array indicesExtra contains local column node ids for the current local node "row"
+          if (!isBoundary)
+            MergeRows(*A, row, indicesExtra, colTranslation);
+          else
+            indicesExtra.push_back(row);
+          indices = indicesExtra;
+          numTotal += indices.size();
+
+          // add the local column node ids to the full columns array which
+          // contains the local column node ids for all local node rows
+          LO nnz = indices.size(), rownnz = 0;
+          for (LO colID = 0; colID < nnz; colID++) {
+            LO col             = indices[colID];
+            columns(realnnz++) = col;
+            rownnz++;
+          }
+
+          if (rownnz == 1) {
+            // If the only element remaining after filtering is diagonal, mark node as boundary
+            // FIXME: this should really be replaced by the following
+            //    if (indices.size() == 1 && indices[0] == row)
+            //        boundaryNodes[row] = true;
+            // We do not do it this way now because there is no framework for distinguishing isolated
+            // and boundary nodes in the aggregation algorithms
+            amalgBoundaryNodes[row] = true;
+          }
+          rows(row + 1) = realnnz;
+        }  // for (LO row = 0; row < numRows; row++)
+
+        RCP<LWGraph> graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A"));
+        graph->SetBoundaryNodeMap(amalgBoundaryNodes);
+
+        if (GetVerbLevel() & Statistics1) {
+          GO numLocalBoundaryNodes  = 0;
+          GO numGlobalBoundaryNodes = 0;
+
+          for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
+            if (amalgBoundaryNodes(i))
+              numLocalBoundaryNodes++;
+
+          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
+          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
+          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes
+                                  << " agglomerated Dirichlet nodes" << std::endl;
+        }
+
+        Set(currentLevel, "Graph", graph);
+        Set(currentLevel, "DofsPerNode", blkSize);  // full block size
+
+      } else if (BlockSize > 1 && threshold != STS::zero()) {
+        // Case 4:  Multiple DOF/node problem with dropping
+        const RCP<const Map> rowMap = A->getRowMap();
+        const RCP<const Map> colMap = A->getColMap();
+        graphType                   = "amalgamated";
+
+        // build node row map (uniqueMap) and node column map (nonUniqueMap)
+        // the arrays rowTranslation and colTranslation contain the local node id
+        // given a local dof id. The data is calculated by the AmalgamationFactory and
+        // stored in the variable container "UnAmalgamationInfo"
+        RCP<const Map> uniqueMap    = amalInfo->getNodeRowMap();
+        RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
+        Array<LO> rowTranslation    = *(amalInfo->getRowTranslation());
+        Array<LO> colTranslation    = *(amalInfo->getColTranslation());
+
+        // get number of local nodes
+        LO numRows = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
+
+        // Allocate space for the local graph
+        typename LWGraph::row_type::non_const_type rows("rows", numRows + 1);
+        typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
+
+        typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows);
+        Kokkos::deep_copy(amalgBoundaryNodes, false);
+
+        // Detect and record rows that correspond to Dirichlet boundary conditions
+        // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual.  Size
+        // TODO the array one bigger than the number of local rows, and the last entry can
+        // TODO hold the actual number of boundary nodes.  Clever, huh?
+        auto pointBoundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
+        if (rowSumTol > 0.)
+          Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes);
+
+        // extract striding information
+        LO blkSize     = A->GetFixedBlockSize();  //< the full block size (number of dofs per node in strided map)
+        LO blkId       = -1;                      //< the block id within the strided map (or -1 if it is a full block map)
+        LO blkPartSize = A->GetFixedBlockSize();  //< stores the size of the block within the strided map
+        if (A->IsView("stridedMaps") == true) {
+          Teuchos::RCP<const Map> myMap         = A->getRowMap("stridedMaps");
+          Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
+          TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
+          blkSize = Teuchos::as<const LO>(strMap->getFixedBlockSize());
+          blkId   = strMap->getStridedBlockId();
+          if (blkId > -1)
+            blkPartSize = Teuchos::as<LO>(strMap->getStridingData()[blkId]);
+        }
+
+        // extract diagonal data for dropping strategy
+        RCP<Vector> ghostedDiag                  = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixOverlappedDiagonal(*A);
+        const ArrayRCP<const SC> ghostedDiagVals = ghostedDiag->getData(0);
+
+        // loop over all local nodes
+        LO realnnz = 0;
+        rows[0]    = 0;
+        Array<LO> indicesExtra;
+        for (LO row = 0; row < numRows; row++) {
+          ArrayView<const LO> indices;
+          indicesExtra.resize(0);
+
+          // By default the amalgamated row is marked as Dirichlet iff all of its point rows are
+          // Dirichlet; with "aggregation: greedy Dirichlet" a single Dirichlet point row suffices.
+          // Note that pointBoundaryNodes lives on the dof map (and not the node map), so looping
+          // over all dofs is fine here. We use blkPartSize as we work with local ids.
+          // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet)
+          // node.
+          bool isBoundary = false;
+          if (pL.get<bool>("aggregation: greedy Dirichlet") == true) {
+            for (LO j = 0; j < blkPartSize; j++) {
+              if (pointBoundaryNodes[row * blkPartSize + j]) {
+                isBoundary = true;
+                break;
+              }
+            }
+          } else {
+            isBoundary = true;
+            for (LO j = 0; j < blkPartSize; j++) {
+              if (!pointBoundaryNodes[row * blkPartSize + j]) {
+                isBoundary = false;
+                break;
+              }
+            }
+          }
+
+          // Merge rows of A
+          // The array indicesExtra contains local column node ids for the current local node "row"
+          if (!isBoundary)
+            MergeRowsWithDropping(*A, row, ghostedDiagVals, threshold, indicesExtra, colTranslation);
+          else
+            indicesExtra.push_back(row);
+          indices = indicesExtra;
+          numTotal += indices.size();
+
+          // add the local column node ids to the full columns array which
+          // contains the local column node ids for all local node rows
+          LO nnz = indices.size(), rownnz = 0;
+          for (LO colID = 0; colID < nnz; colID++) {
+            LO col             = indices[colID];
+            columns[realnnz++] = col;
+            rownnz++;
+          }
+
+          if (rownnz == 1) {
+            // If the only element remaining after filtering is diagonal, mark node as boundary
+            // FIXME: this should really be replaced by the following
+            //    if (indices.size() == 1 && indices[0] == row)
+            //        boundaryNodes[row] = true;
+            // We do not do it this way now because there is no framework for distinguishing isolated
+            // and boundary nodes in the aggregation algorithms
+            amalgBoundaryNodes[row] = true;
+          }
+          rows[row + 1] = realnnz;
+        }  // for (LO row = 0; row < numRows; row++)
+        // columns.resize(realnnz);
+
+        RCP<LWGraph> graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A"));
+        graph->SetBoundaryNodeMap(amalgBoundaryNodes);
+
+        if (GetVerbLevel() & Statistics1) {
+          GO numLocalBoundaryNodes  = 0;
+          GO numGlobalBoundaryNodes = 0;
+
+          for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
+            if (amalgBoundaryNodes(i))
+              numLocalBoundaryNodes++;
+
+          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
+          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
+          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes
+                                  << " agglomerated Dirichlet nodes" << std::endl;
+        }
+
+        Set(currentLevel, "Graph", graph);
+        Set(currentLevel, "DofsPerNode", blkSize);  // full block size
+      }
+
+    } else if (algo == "distance laplacian") {
+      LO blkSize   = A->GetFixedBlockSize();
+      GO indexBase = A->getRowMap()->getIndexBase();
+      // [*0*] : FIXME
+      // ap: somehow, if I move this line to [*1*], Belos throws an error
+      // I'm not sure what's going on. Do we always have to Get data, if we did
+      // DeclareInput for it?
+      //        RCP<RealValuedMultiVector> Coords = Get< RCP<RealValuedMultiVector > >(currentLevel, "Coordinates");
+
+      // Detect and record rows that correspond to Dirichlet boundary conditions
+      // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual.  Size
+      // TODO the array one bigger than the number of local rows, and the last entry can
+      // TODO hold the actual number of boundary nodes.  Clever, huh?
+      auto pointBoundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
+      if (rowSumTol > 0.)
+        Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes);
+
+      if ((blkSize == 1) && (threshold == STS::zero())) {
+        // Trivial case: scalar problem, no dropping. Can return original graph
+        RCP<LWGraph> graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A"));
+        graph->SetBoundaryNodeMap(pointBoundaryNodes);
+        graphType = "unamalgamated";
+        numTotal  = A->getLocalNumEntries();
+
+        if (GetVerbLevel() & Statistics1) {
+          GO numLocalBoundaryNodes  = 0;
+          GO numGlobalBoundaryNodes = 0;
+          for (size_t i = 0; i < pointBoundaryNodes.size(); ++i)
+            if (pointBoundaryNodes(i))
+              numLocalBoundaryNodes++;
+          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
+          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
+          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
+        }
+
+        Set(currentLevel, "DofsPerNode", blkSize);
+        Set(currentLevel, "Graph", graph);
+
+      } else {
+        // ap: We make quite a few assumptions here; general case may be a lot different,
+        // but much much harder to implement. We assume that:
+        //  1) all maps are standard maps, not strided maps
+        //  2) global indices of dofs in A are related to dofs in coordinates in a simple arithmetic
+        //     way: rows i*blkSize, i*blkSize+1, ..., i*blkSize + (blkSize-1) correspond to node i
+        //
+        // NOTE: Potentially, some of the code below could be simplified with UnAmalgamationInfo,
+        // but as I totally don't understand that code, here is my solution
+
+        // [*1*]: see [*0*]
+
+        // Check that the number of local coordinates is consistent with the #rows in A
+        TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() / blkSize != Coords->getLocalLength(), Exceptions::Incompatible,
+                                   "Coordinate vector length (" << Coords->getLocalLength() << ") is incompatible with number of rows in A (" << A->getRowMap()->getLocalNumElements() << ") by modulo block size (" << blkSize << ").");
+
+        const RCP<const Map> colMap = A->getColMap();
+        RCP<const Map> uniqueMap, nonUniqueMap;
+        Array<LO> colTranslation;
+        if (blkSize == 1) {
+          uniqueMap    = A->getRowMap();
+          nonUniqueMap = A->getColMap();
+          graphType    = "unamalgamated";
+
+        } else {
+          uniqueMap = Coords->getMap();
+          TEUCHOS_TEST_FOR_EXCEPTION(uniqueMap->getIndexBase() != indexBase, Exceptions::Incompatible,
+                                     "Different index bases for matrix and coordinates");
+
+          AmalgamationFactory::AmalgamateMap(*(A->getColMap()), *A, nonUniqueMap, colTranslation);
+
+          graphType = "amalgamated";
+        }
+        LO numRows = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
+
+        RCP<RealValuedMultiVector> ghostedCoords;
+        RCP<Vector> ghostedLaplDiag;
+        Teuchos::ArrayRCP<SC> ghostedLaplDiagData;
+        if (threshold != STS::zero()) {
+          // Get ghost coordinates
+          RCP<const Import> importer;
+          {
+            SubFactoryMonitor m1(*this, "Import construction", currentLevel);
+            if (blkSize == 1 && realA->getCrsGraph()->getImporter() != Teuchos::null) {
+              GetOStream(Warnings1) << "Using existing importer from matrix graph" << std::endl;
+              importer = realA->getCrsGraph()->getImporter();
+            } else {
+              GetOStream(Warnings0) << "Constructing new importer instance" << std::endl;
+              importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
+            }
+          }  // subtimer
+          ghostedCoords = Xpetra::MultiVectorFactory<real_type, LO, GO, NO>::Build(nonUniqueMap, Coords->getNumVectors());
+          {
+            SubFactoryMonitor m1(*this, "Coordinate import", currentLevel);
+            ghostedCoords->doImport(*Coords, *importer, Xpetra::INSERT);
+          }  // subtimer
+
+          // Construct Distance Laplacian diagonal
+          RCP<Vector> localLaplDiag = VectorFactory::Build(uniqueMap);
+          Array<LO> indicesExtra;
+          Teuchos::Array<Teuchos::ArrayRCP<const real_type>> coordData;
+          if (threshold != STS::zero()) {
+            const size_t numVectors = ghostedCoords->getNumVectors();
+            coordData.reserve(numVectors);
+            for (size_t j = 0; j < numVectors; j++) {
+              Teuchos::ArrayRCP<const real_type> tmpData = ghostedCoords->getData(j);
+              coordData.push_back(tmpData);
+            }
+          }
+          {
+            SubFactoryMonitor m1(*this, "Laplacian local diagonal", currentLevel);
+            ArrayRCP<SC> localLaplDiagData = localLaplDiag->getDataNonConst(0);
+            for (LO row = 0; row < numRows; row++) {
+              ArrayView<const LO> indices;
+
+              if (blkSize == 1) {
+                ArrayView<const SC> vals;
+                A->getLocalRowView(row, indices, vals);
+
+              } else {
+                // Merge rows of A
+                indicesExtra.resize(0);
+                MergeRows(*A, row, indicesExtra, colTranslation);
+                indices = indicesExtra;
+              }
+
+              LO nnz               = indices.size();
+              bool haveAddedToDiag = false;
+              for (LO colID = 0; colID < nnz; colID++) {
+                const LO col = indices[colID];
+
+                if (row != col) {
+                  if (use_dlap_weights == SINGLE_WEIGHTS) {
+                    /*printf("[%d,%d] Unweighted Distance = %6.4e Weighted Distance = %6.4e\n",row,col,
+                           MueLu::Utilities<real_type,LO,GO,NO>::Distance2(coordData, row, col),
+                           MueLu::Utilities<real_type,LO,GO,NO>::Distance2(dlap_weights(),coordData, row, col));*/
+                    localLaplDiagData[row] += STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(), coordData, row, col);
+                  } else if (use_dlap_weights == BLOCK_WEIGHTS) {
+                    int block_id    = row % interleaved_blocksize;
+                    int block_start = block_id * interleaved_blocksize;
+                    localLaplDiagData[row] += STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col);
+                  } else {
+                    //                    printf("[%d,%d] Unweighted Distance = %6.4e\n",row,col,MueLu::Utilities<real_type,LO,GO,NO>::Distance2(coordData, row, col));
+                    localLaplDiagData[row] += STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(coordData, row, col);
+                  }
+                  haveAddedToDiag = true;
+                }
+              }
+              // Deal with the situation where boundary conditions have only been enforced on rows, but not on columns.
+              // We enforce dropping of these entries by assigning a very large number to the diagonal entries corresponding to BCs.
+              if (!haveAddedToDiag)
+                localLaplDiagData[row] = STS::rmax();
+            }
+          }  // subtimer
+          {
+            SubFactoryMonitor m1(*this, "Laplacian distributed diagonal", currentLevel);
+            ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
+            ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
+            ghostedLaplDiagData = ghostedLaplDiag->getDataNonConst(0);
+          }  // subtimer
+
+        } else {
+          GetOStream(Runtime0) << "Skipping distance laplacian construction due to 0 threshold" << std::endl;
+        }
+
+        // NOTE: ghostedLaplDiagData is never assigned (stays empty) if the threshold is zero and the Laplacian is skipped
+
+        // allocate space for the local graph
+        typename LWGraph::row_type::non_const_type rows("rows", numRows + 1);
+        typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
+
+#ifdef HAVE_MUELU_DEBUG
+        // DEBUGGING
+        for (LO i = 0; i < (LO)columns.size(); i++) columns[i] = -666;
+#endif
+
+        // Extra array of per-row end offsets, used only when symmetrizing the cut graph (scaled_cut_symmetric)
+        ArrayRCP<LO> rows_stop;
+        bool use_stop_array = threshold != STS::zero() && distanceLaplacianAlgo == scaled_cut_symmetric;
+        if (use_stop_array)
+          // rows_stop = typename LWGraph::row_type::non_const_type("rows_stop", numRows);
+          rows_stop.resize(numRows);
+
+        typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows);
+        Kokkos::deep_copy(amalgBoundaryNodes, false);
+
+        LO realnnz = 0;
+        rows(0)    = 0;
+
+        Array<LO> indicesExtra;
+        {
+          SubFactoryMonitor m1(*this, "Laplacian dropping", currentLevel);
+          Teuchos::Array<Teuchos::ArrayRCP<const real_type>> coordData;
+          if (threshold != STS::zero()) {
+            const size_t numVectors = ghostedCoords->getNumVectors();
+            coordData.reserve(numVectors);
+            for (size_t j = 0; j < numVectors; j++) {
+              Teuchos::ArrayRCP<const real_type> tmpData = ghostedCoords->getData(j);
+              coordData.push_back(tmpData);
+            }
+          }
+
+          ArrayView<const SC> vals;  // CMS hackery
+          for (LO row = 0; row < numRows; row++) {
+            ArrayView<const LO> indices;
+            indicesExtra.resize(0);
+            bool isBoundary = false;
+
+            if (blkSize == 1) {
+              //	      ArrayView<const SC>     vals;//CMS uncomment
+              A->getLocalRowView(row, indices, vals);
+              isBoundary = pointBoundaryNodes[row];
+            } else {
+              // Intent: mark the amalgamated row as Dirichlet iff all point rows are Dirichlet. NOTE(review): isBoundary starts false and is never set to true here - confirm
+              for (LO j = 0; j < blkSize; j++) {
+                if (!pointBoundaryNodes[row * blkSize + j]) {
+                  isBoundary = false;
+                  break;
+                }
+              }
+
+              // Merge rows of A
+              if (!isBoundary)
+                MergeRows(*A, row, indicesExtra, colTranslation);
+              else
+                indicesExtra.push_back(row);
+              indices = indicesExtra;
+            }
+            numTotal += indices.size();
+
+            LO nnz = indices.size(), rownnz = 0;
+
+            if (use_stop_array) {
+              rows(row + 1) = rows(row) + nnz;
+              realnnz       = rows(row);
+            }
+
+            if (threshold != STS::zero()) {
+              // default
+              if (distanceLaplacianAlgo == defaultAlgo) {
+                /* Standard Distance Laplacian */
+                for (LO colID = 0; colID < nnz; colID++) {
+                  LO col = indices[colID];
+
+                  if (row == col) {
+                    columns(realnnz++) = col;
+                    rownnz++;
+                    continue;
+                  }
+
+                  // We do not want the distance Laplacian aggregating boundary nodes
+                  if (isBoundary) continue;
+
+                  SC laplVal;
+                  if (use_dlap_weights == SINGLE_WEIGHTS) {
+                    laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(), coordData, row, col);
+                  } else if (use_dlap_weights == BLOCK_WEIGHTS) {
+                    int block_id    = row % interleaved_blocksize;
+                    int block_start = block_id * interleaved_blocksize;
+                    laplVal         = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col);
+                  } else {
+                    laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(coordData, row, col);
+                  }
+                  real_type aiiajj = STS::magnitude(realThreshold * realThreshold * ghostedLaplDiagData[row] * ghostedLaplDiagData[col]);
+                  real_type aij    = STS::magnitude(laplVal * laplVal);
+
+                  if (aij > aiiajj) {
+                    columns(realnnz++) = col;
+                    rownnz++;
+                  } else {
+                    numDropped++;
+                  }
+                }
+              } else {
+                /* Cut Algorithm */
+                using DropTol = Details::DropTol<real_type, LO>;
+                std::vector<DropTol> drop_vec;
+                drop_vec.reserve(nnz);
+                const real_type zero = Teuchos::ScalarTraits<real_type>::zero();
+                const real_type one  = Teuchos::ScalarTraits<real_type>::one();
+
+                // find magnitudes
+                for (LO colID = 0; colID < nnz; colID++) {
+                  LO col = indices[colID];
+
+                  if (row == col) {
+                    drop_vec.emplace_back(zero, one, colID, false);
+                    continue;
+                  }
+                  // We do not want the distance Laplacian aggregating boundary nodes
+                  if (isBoundary) continue;
+
+                  SC laplVal;
+                  if (use_dlap_weights == SINGLE_WEIGHTS) {
+                    laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(), coordData, row, col);
+                  } else if (use_dlap_weights == BLOCK_WEIGHTS) {
+                    int block_id    = row % interleaved_blocksize;
+                    int block_start = block_id * interleaved_blocksize;
+                    laplVal         = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col);
+                  } else {
+                    laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(coordData, row, col);
+                  }
+
+                  real_type aiiajj = STS::magnitude(ghostedLaplDiagData[row] * ghostedLaplDiagData[col]);
+                  real_type aij    = STS::magnitude(laplVal * laplVal);
+
+                  drop_vec.emplace_back(aij, aiiajj, colID, false);
+                }
+
+                const size_t n = drop_vec.size();
+
+                if (distanceLaplacianAlgo == unscaled_cut) {
+                  std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
+                    return a.val > b.val;
+                  });
+
+                  bool drop = false;
+                  for (size_t i = 1; i < n; ++i) {
+                    if (!drop) {
+                      auto const& x = drop_vec[i - 1];
+                      auto const& y = drop_vec[i];
+                      auto a        = x.val;
+                      auto b        = y.val;
+                      if (a > realThreshold * b) {
+                        drop = true;
+#ifdef HAVE_MUELU_DEBUG
+                        if (distanceLaplacianCutVerbose) {
+                          std::cout << "DJS: KEEP, N, ROW:  " << i + 1 << ", " << n << ", " << row << std::endl;
+                        }
+#endif
+                      }
+                    }
+                    drop_vec[i].drop = drop;
+                  }
+                } else if (distanceLaplacianAlgo == scaled_cut || distanceLaplacianAlgo == scaled_cut_symmetric) {
+                  std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
+                    return a.val / a.diag > b.val / b.diag;
+                  });
+
+                  bool drop = false;
+                  for (size_t i = 1; i < n; ++i) {
+                    if (!drop) {
+                      auto const& x = drop_vec[i - 1];
+                      auto const& y = drop_vec[i];
+                      auto a        = x.val / x.diag;
+                      auto b        = y.val / y.diag;
+                      if (a > realThreshold * b) {
+                        drop = true;
+#ifdef HAVE_MUELU_DEBUG
+                        if (distanceLaplacianCutVerbose) {
+                          std::cout << "DJS: KEEP, N, ROW:  " << i + 1 << ", " << n << ", " << row << std::endl;
+                        }
+#endif
+                      }
+                    }
+                    drop_vec[i].drop = drop;
+                  }
+                }
+
+                std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
+                  return a.col < b.col;
+                });
+
+                for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) {
+                  LO col = indices[drop_vec[idxID].col];
+
+                  // don't drop diagonal
+                  if (row == col) {
+                    columns(realnnz++) = col;
+                    rownnz++;
+                    //		    printf("(%d,%d) KEEP %13s matrix = %6.4e\n",row,row,"DIAGONAL",drop_vec[idxID].aux_val);
+                    continue;
+                  }
+
+                  if (!drop_vec[idxID].drop) {
+                    columns(realnnz++) = col;
+                    //		    printf("(%d,%d) KEEP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val);
+                    rownnz++;
+                  } else {
+                    //		    printf("(%d,%d) DROP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val);
+                    numDropped++;
+                  }
+                }
+              }
+            } else {
+              // Skip laplace calculation and threshold comparison for zero threshold
+              for (LO colID = 0; colID < nnz; colID++) {
+                LO col             = indices[colID];
+                columns(realnnz++) = col;
+                rownnz++;
+              }
+            }
+
+            if (rownnz == 1) {
+              // If the only element remaining after filtering is diagonal, mark node as boundary
+              // FIXME: this should really be replaced by the following
+              //    if (indices.size() == 1 && indices[0] == row)
+              //        boundaryNodes[row] = true;
+              // We do not do it this way now because there is no framework for distinguishing isolated
+              // and boundary nodes in the aggregation algorithms
+              amalgBoundaryNodes[row] = true;
+            }
+
+            if (use_stop_array)
+              rows_stop[row] = rownnz + rows[row];
+            else
+              rows[row + 1] = realnnz;
+          }  // for (LO row = 0; row < numRows; row++)
+
+        }  // subtimer
+
+        if (use_stop_array) {
+          // Do symmetrization of the cut matrix
+          // NOTE: We assume nested row/column maps here
+          for (LO row = 0; row < numRows; row++) {
+            for (LO colidx = rows[row]; colidx < rows_stop[row]; colidx++) {
+              LO col = columns[colidx];
+              if (col >= numRows) continue;
+
+              bool found = false;
+              for (LO t_col = rows(col); !found && t_col < rows_stop[col]; t_col++) {
+                if (columns[t_col] == row)
+                  found = true;
+              }
+              // We didn't find the transpose buddy, so let's symmetrize, unless we'd be symmetrizing
+              // into a Dirichlet unknown.  In that case don't.
+              if (!found && !pointBoundaryNodes[col] && Teuchos::as<typename LWGraph::row_type::value_type>(rows_stop[col]) < rows[col + 1]) {
+                LO new_idx = rows_stop[col];
+                //		  printf("(%d,%d) SYMADD entry\n",col,row);
+                columns[new_idx] = row;
+                rows_stop[col]++;
+                numDropped--;
+              }
+            }
+          }
+
+          // Condense everything down
+          LO current_start = 0;
+          for (LO row = 0; row < numRows; row++) {
+            LO old_start = current_start;
+            for (LO col = rows(row); col < rows_stop[row]; col++) {
+              if (current_start != col) {
+                columns(current_start) = columns(col);
+              }
+              current_start++;
+            }
+            rows[row] = old_start;
+          }
+          rows(numRows) = realnnz = current_start;
+        }
+
+        RCP<LWGraph> graph;
+        {
+          SubFactoryMonitor m1(*this, "Build amalgamated graph", currentLevel);
+          graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A"));
+          graph->SetBoundaryNodeMap(amalgBoundaryNodes);
+        }  // subtimer
+
+        if (GetVerbLevel() & Statistics1) {
+          GO numLocalBoundaryNodes  = 0;
+          GO numGlobalBoundaryNodes = 0;
+
+          for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
+            if (amalgBoundaryNodes(i))
+              numLocalBoundaryNodes++;
+
+          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
+          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
+          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " agglomerated Dirichlet nodes"
+                                  << " using threshold " << dirichletThreshold << std::endl;
+        }
+
+        Set(currentLevel, "Graph", graph);
+        Set(currentLevel, "DofsPerNode", blkSize);
+      }
+    }
+
+    if ((GetVerbLevel() & Statistics1) && !(A->GetFixedBlockSize() > 1 && threshold != STS::zero())) {
+      RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
+      GO numGlobalTotal, numGlobalDropped;
+      MueLu_sumAll(comm, numTotal, numGlobalTotal);
+      MueLu_sumAll(comm, numDropped, numGlobalDropped);
+      GetOStream(Statistics1) << "Number of dropped entries in " << graphType << " matrix graph: " << numGlobalDropped << "/" << numGlobalTotal;
+      if (numGlobalTotal != 0)
+        GetOStream(Statistics1) << " (" << 100 * Teuchos::as<double>(numGlobalDropped) / Teuchos::as<double>(numGlobalTotal) << "%)";
+      GetOStream(Statistics1) << std::endl;
+    }
+
+  } else {
+    // what Tobias has implemented
+
+    SC threshold = as<SC>(pL.get<double>("aggregation: drop tol"));
+    // GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
+    GetOStream(Runtime0) << "algorithm = \""
+                         << "failsafe"
+                         << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
+    Set<bool>(currentLevel, "Filtering", (threshold != STS::zero()));
+
+    RCP<const Map> rowMap = A->getRowMap();
+    RCP<const Map> colMap = A->getColMap();
+
+    LO blockdim  = 1;                       // block dim for fixed size blocks
+    GO indexBase = rowMap->getIndexBase();  // index base of maps
+    GO offset    = 0;
+
+    // 1) check for blocking/striding information
+    if (A->IsView("stridedMaps") &&
+        Teuchos::rcp_dynamic_cast<const StridedMap>(A->getRowMap("stridedMaps")) != Teuchos::null) {
+      Xpetra::viewLabel_t oldView  = A->SwitchToView("stridedMaps");  // note: "stridedMaps are always non-overlapping (correspond to range and domain maps!)
+      RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(A->getRowMap());
+      TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null, Exceptions::BadCast, "MueLu::CoalesceFactory::Build: cast to strided row map failed.");
+      blockdim = strMap->getFixedBlockSize();
+      offset   = strMap->getOffset();
+      oldView  = A->SwitchToView(oldView);
+      GetOStream(Statistics1) << "CoalesceDropFactory::Build():"
+                              << " found blockdim=" << blockdim << " from strided maps. offset=" << offset << std::endl;
+    } else
+      GetOStream(Statistics1) << "CoalesceDropFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl;
+
+    // 2) get row map for amalgamated matrix (graph of A)
+    //    with same distribution over all procs as row map of A
+    RCP<const Map> nodeMap = amalInfo->getNodeRowMap();
+    GetOStream(Statistics1) << "CoalesceDropFactory: nodeMap " << nodeMap->getLocalNumElements() << "/" << nodeMap->getGlobalNumElements() << " elements" << std::endl;
+
+    // 3) create graph of amalgamated matrix
+    RCP<CrsGraph> crsGraph = CrsGraphFactory::Build(nodeMap, A->getLocalMaxNumRowEntries() * blockdim);
+
+    LO numRows  = A->getRowMap()->getLocalNumElements();
+    LO numNodes = nodeMap->getLocalNumElements();
+    typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numNodes);
+    Kokkos::deep_copy(amalgBoundaryNodes, false);
+    const ArrayRCP<int> numberDirichletRowsPerNode(numNodes, 0);  // helper array counting the number of Dirichlet nodes associated with node
+    bool bIsDiagonalEntry = false;                                // boolean flag stating that grid==gcid
+
+    // 4) do amalgamation. generate graph of amalgamated matrix
+    //    Note, this code is much more inefficient than the leightwight implementation
+    //    Most of the work has already been done in the AmalgamationFactory
+    for (LO row = 0; row < numRows; row++) {
+      // get global DOF id
+      GO grid = rowMap->getGlobalElement(row);
+
+      // reinitialize boolean helper variable
+      bIsDiagonalEntry = false;
+
+      // translate grid to nodeid
+      GO nodeId = AmalgamationFactory::DOFGid2NodeId(grid, blockdim, offset, indexBase);
+
+      size_t nnz = A->getNumEntriesInLocalRow(row);
+      Teuchos::ArrayView<const LO> indices;
+      Teuchos::ArrayView<const SC> vals;
+      A->getLocalRowView(row, indices, vals);
+
+      RCP<std::vector<GO>> cnodeIds = Teuchos::rcp(new std::vector<GO>);  // global column block ids
+      LO realnnz                    = 0;
+      for (LO col = 0; col < Teuchos::as<LO>(nnz); col++) {
+        GO gcid = colMap->getGlobalElement(indices[col]);  // global column id
+
+        if (vals[col] != STS::zero()) {
+          GO cnodeId = AmalgamationFactory::DOFGid2NodeId(gcid, blockdim, offset, indexBase);
+          cnodeIds->push_back(cnodeId);
+          realnnz++;  // increment number of nnz in matrix row
+          if (grid == gcid) bIsDiagonalEntry = true;
+        }
+      }
+
+      if (realnnz == 1 && bIsDiagonalEntry == true) {
+        LO lNodeId = nodeMap->getLocalElement(nodeId);
+        numberDirichletRowsPerNode[lNodeId] += 1;             // increment Dirichlet row counter associated with lNodeId
+        if (numberDirichletRowsPerNode[lNodeId] == blockdim)  // mark full Dirichlet nodes
+          amalgBoundaryNodes[lNodeId] = true;
+      }
+
+      Teuchos::ArrayRCP<GO> arr_cnodeIds = Teuchos::arcp(cnodeIds);
+
+      if (arr_cnodeIds.size() > 0)
+        crsGraph->insertGlobalIndices(nodeId, arr_cnodeIds());
+    }
+    // fill matrix graph
+    crsGraph->fillComplete(nodeMap, nodeMap);
+
+    // 5) create MueLu Graph object
+    RCP<LWGraph> graph = rcp(new LWGraph(crsGraph, "amalgamated graph of A"));
+
+    // Detect and record rows that correspond to Dirichlet boundary conditions
+    graph->SetBoundaryNodeMap(amalgBoundaryNodes);
+
+    if (GetVerbLevel() & Statistics1) {
+      GO numLocalBoundaryNodes  = 0;
+      GO numGlobalBoundaryNodes = 0;
+      for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
+        if (amalgBoundaryNodes(i))
+          numLocalBoundaryNodes++;
+      RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
+      MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
+      GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
+    }
+
+    // 6) store results in Level
+    // graph->SetBoundaryNodeMap(gBoundaryNodeMap);
+    Set(currentLevel, "DofsPerNode", blockdim);
+    Set(currentLevel, "Graph", graph);
+
+  }  // if (doExperimentalWrap) ... else ...
+
+}  // Build
+
+template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildKokkos(Level& currentLevel) const {
+  FactoryMonitor m(*this, "BuildKokkos", currentLevel);
+
+  typedef Teuchos::ScalarTraits<SC> STS;
+  typedef typename STS::magnitudeType real_type;
+  typedef Xpetra::MultiVector<real_type, LO, GO, NO> RealValuedMultiVector;
+  typedef Xpetra::MultiVectorFactory<real_type, LO, GO, NO> RealValuedMultiVectorFactory;
+
+  if (predrop_ != Teuchos::null)
+    GetOStream(Parameters0) << predrop_->description();
+
+  RCP<Matrix> realA              = Get<RCP<Matrix>>(currentLevel, "A");
+  RCP<AmalgamationInfo> amalInfo = Get<RCP<AmalgamationInfo>>(currentLevel, "UnAmalgamationInfo");
+  const ParameterList& pL        = GetParameterList();
+  bool doExperimentalWrap        = pL.get<bool>("lightweight wrap");
+
+  GetOStream(Parameters0) << "lightweight wrap = " << doExperimentalWrap << std::endl;
+  std::string algo                         = pL.get<std::string>("aggregation: drop scheme");
+  const bool aggregationMayCreateDirichlet = pL.get<bool>("aggregation: dropping may create Dirichlet");
+
+  RCP<RealValuedMultiVector> Coords;
+  RCP<Matrix> A;
+
+  bool use_block_algorithm   = false;
+  LO interleaved_blocksize   = as<LO>(pL.get<int>("aggregation: block diagonal: interleaved blocksize"));
+  bool useSignedClassicalRS  = false;
+  bool useSignedClassicalSA  = false;
+  bool generateColoringGraph = false;
+
+  // NOTE:  If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it
+  // in the block diagonalization). So we'll clobber the rowSumTol with -1.0 in this case
+  typename STS::magnitudeType rowSumTol = as<typename STS::magnitudeType>(pL.get<double>("aggregation: row sum drop tol"));
+
+  RCP<LocalOrdinalVector> ghostedBlockNumber;
+  ArrayRCP<const LO> g_block_id;
+
+  if (algo == "distance laplacian") {
+    // Grab the coordinates for distance laplacian
+    Coords = Get<RCP<RealValuedMultiVector>>(currentLevel, "Coordinates");
+    A      = realA;
+  } else if (algo == "signed classical sa") {
+    useSignedClassicalSA = true;
+    algo                 = "classical";
+    A                    = realA;
+  } else if (algo == "signed classical" || algo == "block diagonal colored signed classical" || algo == "block diagonal signed classical") {
+    useSignedClassicalRS = true;
+    //      if(realA->GetFixedBlockSize() > 1) {
+    RCP<LocalOrdinalVector> BlockNumber = Get<RCP<LocalOrdinalVector>>(currentLevel, "BlockNumber");
+    // Ghost the column block numbers if we need to
+    RCP<const Import> importer = realA->getCrsGraph()->getImporter();
+    if (!importer.is_null()) {
+      SubFactoryMonitor m1(*this, "Block Number import", currentLevel);
+      ghostedBlockNumber = Xpetra::VectorFactory<LO, LO, GO, NO>::Build(importer->getTargetMap());
+      ghostedBlockNumber->doImport(*BlockNumber, *importer, Xpetra::INSERT);
+    } else {
+      ghostedBlockNumber = BlockNumber;
+    }
+    g_block_id = ghostedBlockNumber->getData(0);
+    //      }
+    if (algo == "block diagonal colored signed classical")
+      generateColoringGraph = true;
+    algo = "classical";
+    A    = realA;
+
+  } else if (algo == "block diagonal") {
+    // Handle the "block diagonal" filtering and then leave
+    BlockDiagonalize(currentLevel, realA, false);
+    return;
+  } else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian") {
+    // Handle the "block diagonal" filtering, and then continue onward
+    use_block_algorithm        = true;
+    RCP<Matrix> filteredMatrix = BlockDiagonalize(currentLevel, realA, true);
+    if (algo == "block diagonal distance laplacian") {
+      // We now need to expand the coordinates by the interleaved blocksize
+      RCP<RealValuedMultiVector> OldCoords = Get<RCP<RealValuedMultiVector>>(currentLevel, "Coordinates");
+      if (OldCoords->getLocalLength() != realA->getLocalNumRows()) {
+        LO dim = (LO)OldCoords->getNumVectors();
+        Coords = RealValuedMultiVectorFactory::Build(realA->getRowMap(), dim);
+        for (LO k = 0; k < dim; k++) {
+          ArrayRCP<const real_type> old_vec = OldCoords->getData(k);
+          ArrayRCP<real_type> new_vec       = Coords->getDataNonConst(k);
+          for (LO i = 0; i < (LO)OldCoords->getLocalLength(); i++) {
+            LO new_base = i * dim;
+            for (LO j = 0; j < interleaved_blocksize; j++)
+              new_vec[new_base + j] = old_vec[i];
+          }
+        }
+      } else {
+        Coords = OldCoords;
+      }
+      algo = "distance laplacian";
+    } else if (algo == "block diagonal classical") {
+      algo = "classical";
+    }
+    // All cases
+    A         = filteredMatrix;
+    rowSumTol = -1.0;
+  } else {
+    A = realA;
+  }
+
+  // Distance Laplacian weights
+  Array<double> dlap_weights = pL.get<Array<double>>("aggregation: distance laplacian directional weights");
+  enum { NO_WEIGHTS = 0,
+         SINGLE_WEIGHTS,
+         BLOCK_WEIGHTS };
+  int use_dlap_weights = NO_WEIGHTS;
+  if (algo == "distance laplacian") {
+    LO dim = (LO)Coords->getNumVectors();
+    // If anything isn't 1.0 we need to turn on the weighting
+    bool non_unity = false;
+    for (LO i = 0; !non_unity && i < (LO)dlap_weights.size(); i++) {
+      if (dlap_weights[i] != 1.0) {
+        non_unity = true;
+      }
+    }
+    if (non_unity) {
+      LO blocksize = use_block_algorithm ? as<LO>(pL.get<int>("aggregation: block diagonal: interleaved blocksize")) : 1;
+      if ((LO)dlap_weights.size() == dim)
+        use_dlap_weights = SINGLE_WEIGHTS;
+      else if ((LO)dlap_weights.size() == blocksize * dim)
+        use_dlap_weights = BLOCK_WEIGHTS;
+      else {
+        TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError,
+                                   "length of 'aggregation: distance laplacian directional weights' must equal the coordinate dimension OR the coordinate dimension times the blocksize");
+      }
+      if (GetVerbLevel() & Statistics1)
+        GetOStream(Statistics1) << "Using distance laplacian weights: " << dlap_weights << std::endl;
+    }
+  }
+
+  // decide wether to use the fast-track code path for standard maps or the somewhat slower
+  // code path for non-standard maps
+  /*bool bNonStandardMaps = false;
+  if (A->IsView("stridedMaps") == true) {
+    Teuchos::RCP<const Map> myMap = A->getRowMap("stridedMaps");
+    Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
+    TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
+    if (strMap->getStridedBlockId() != -1 || strMap->getOffset() > 0)
+      bNonStandardMaps = true;
+  }*/
+
+  if (doExperimentalWrap) {
+    TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm");
+    TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)");
+
+    SC threshold;
+    // If we're doing the ML-style halving of the drop tol at each level, we do that here.
+    if (pL.get<bool>("aggregation: use ml scaling of drop tol"))
+      threshold = pL.get<double>("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID());
+    else
+      threshold = as<SC>(pL.get<double>("aggregation: drop tol"));
+
+    std::string distanceLaplacianAlgoStr = pL.get<std::string>("aggregation: distance laplacian algo");
+    std::string classicalAlgoStr         = pL.get<std::string>("aggregation: classical algo");
+    real_type realThreshold              = STS::magnitude(threshold);  // CMS: Rename this to "magnitude threshold" sometime
+
+    ////////////////////////////////////////////////////
+    // Remove this bit once we are confident that cut-based dropping works.
+#ifdef HAVE_MUELU_DEBUG
+    int distanceLaplacianCutVerbose = 0;
+#endif
+#ifdef DJS_READ_ENV_VARIABLES
+    if (getenv("MUELU_DROP_TOLERANCE_MODE")) {
+      distanceLaplacianAlgoStr = std::string(getenv("MUELU_DROP_TOLERANCE_MODE"));
+    }
+
+    if (getenv("MUELU_DROP_TOLERANCE_THRESHOLD")) {
+      auto tmp      = atoi(getenv("MUELU_DROP_TOLERANCE_THRESHOLD"));
+      realThreshold = 1e-4 * tmp;
+    }
+
+#ifdef HAVE_MUELU_DEBUG
+    if (getenv("MUELU_DROP_TOLERANCE_VERBOSE")) {
+      distanceLaplacianCutVerbose = atoi(getenv("MUELU_DROP_TOLERANCE_VERBOSE"));
+    }
+#endif
+#endif
+    ////////////////////////////////////////////////////
+
+    enum decisionAlgoType { defaultAlgo,
+                            unscaled_cut,
+                            scaled_cut,
+                            scaled_cut_symmetric };
+
+    decisionAlgoType distanceLaplacianAlgo = defaultAlgo;
+    decisionAlgoType classicalAlgo         = defaultAlgo;
+    if (algo == "distance laplacian") {
+      if (distanceLaplacianAlgoStr == "default")
+        distanceLaplacianAlgo = defaultAlgo;
+      else if (distanceLaplacianAlgoStr == "unscaled cut")
+        distanceLaplacianAlgo = unscaled_cut;
+      else if (distanceLaplacianAlgoStr == "scaled cut")
+        distanceLaplacianAlgo = scaled_cut;
+      else if (distanceLaplacianAlgoStr == "scaled cut symmetric")
+        distanceLaplacianAlgo = scaled_cut_symmetric;
+      else
+        TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" << distanceLaplacianAlgoStr << "\"");
+      GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
+    } else if (algo == "classical") {
+      if (classicalAlgoStr == "default")
+        classicalAlgo = defaultAlgo;
+      else if (classicalAlgoStr == "unscaled cut")
+        classicalAlgo = unscaled_cut;
+      else if (classicalAlgoStr == "scaled cut")
+        classicalAlgo = scaled_cut;
+      else
+        TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: classical algo\" must be one of (default|unscaled cut|scaled cut), not \"" << classicalAlgoStr << "\"");
+      GetOStream(Runtime0) << "algorithm = \"" << algo << "\" classical algorithm = \"" << classicalAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
+
+    } else
+      GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
+    Set<bool>(currentLevel, "Filtering", (threshold != STS::zero()));
+
+    const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
+
+    // NOTE: We don't support signed classical RS or SA with cut drop at present
+    TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalRS && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical aggregation");
+    TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalSA && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical sa aggregation");
+
+    GO numDropped = 0, numTotal = 0;
+    std::string graphType = "unamalgamated";  // for description purposes only
+
+    /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
+     BlockSize is the number of storage blocks that must kept together during the amalgamation process.
+
+     Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
+
+     numPDEs = BlockSize * storageblocksize.
+
+     If numPDEs==1
+       Matrix is point storage (classical CRS storage).  storageblocksize=1 and BlockSize=1
+       No other values makes sense.
+
+     If numPDEs>1
+       If matrix uses point storage, then storageblocksize=1  and BlockSize=numPDEs.
+       If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n.
+       Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested.
+    */
+    TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
+    const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
+
     /************************** RS or SA-style Classical Dropping (and variants) **************************/
     if (algo == "classical") {
       if (predrop_ == null) {
@@ -506,7 +2155,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 
         LO realnnz = 0;
         rows(0)    = 0;
-        for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
+	for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
           size_t nnz          = A->getNumEntriesInLocalRow(row);
           bool rowIsDirichlet = boundaryNodes[row];
           ArrayView<const LO> indices;
@@ -573,11 +2222,11 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
               rows(row + 1) = realnnz;
             }
           } else {
-            /* Cut Algorithm */
+	    /* Cut Algorithm */
             // CMS
             using DropTol = Details::DropTol<real_type, LO>;
             std::vector<DropTol> drop_vec;
-            drop_vec.reserve(nnz);
+	    drop_vec.reserve(nnz);
             const real_type zero = Teuchos::ScalarTraits<real_type>::zero();
             const real_type one  = Teuchos::ScalarTraits<real_type>::one();
             LO rownnz            = 0;
@@ -1594,7 +3243,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 
   }  // if (doExperimentalWrap) ... else ...
 
-}  // Build
+}  // BuildKokkos
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::MergeRows(const Matrix& A, const LO row, Array<LO>& cols, const Array<LO>& translation) const {

From 105e33e71f5ab985b6559fcc047b4b596336efb1 Mon Sep 17 00:00:00 2001
From: Ian Halim <ihalim@ascicgpu031.sandia.gov>
Date: Mon, 22 Jul 2024 12:19:34 -0600
Subject: [PATCH 02/25] MueLu: Cut Drop Memory Optimization

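The DropTol structure used by the Kokkos cut-drop algorithm is replaced with a new, smaller DropTolKokkos structure that stores only an entry index and a drop flag.
The magnitudes (|a_ij|^2 and eps^2*|a_ii|*|a_jj|) are no longer stored per entry; they are now computed on the fly inside the sort comparators and the search for the drop start index.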
Code passes current unit tests.
No significant change in speed.

Signed-off-by: Ian Halim <ihalim@ascicgpu031.sandia.gov>
---
 .../MueLu_CoalesceDropFactory_def.hpp         | 130 ++++++++++++------
 1 file changed, 86 insertions(+), 44 deletions(-)

diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
index a8befaea592b..eeb3f91dbfd6 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
@@ -94,35 +94,48 @@ namespace MueLu {
 namespace Details {
 template <class real_type, class LO>
 struct DropTol {
-  KOKKOS_INLINE_FUNCTION //NEW
   DropTol()               = default;
-  KOKKOS_INLINE_FUNCTION //NEW
   DropTol(DropTol const&) = default;
-  KOKKOS_INLINE_FUNCTION //NEW
   DropTol(DropTol&&)      = default;
 
   DropTol& operator=(DropTol const&) = default;
   DropTol& operator=(DropTol&&)      = default;
 
-  KOKKOS_INLINE_FUNCTION //NEW
   DropTol(real_type val_, real_type diag_, LO col_, bool drop_)
     : val{val_}
     , diag{diag_}
     , col{col_}
     , drop{drop_} {}
 
-  real_type val{0};
-  real_type diag{0};
-  LO col{-1};
-  //NEW Can't run these host functions on device
-  //real_type val{Teuchos::ScalarTraits<real_type>::zero()};
-  //real_type diag{Teuchos::ScalarTraits<real_type>::zero()};
-  //LO col{Teuchos::OrdinalTraits<LO>::invalid()};
+  real_type val{Teuchos::ScalarTraits<real_type>::zero()};
+  real_type diag{Teuchos::ScalarTraits<real_type>::zero()};
+  LO col{Teuchos::OrdinalTraits<LO>::invalid()};
   bool drop{true};
 
   // CMS: Auxillary information for debugging info
   //      real_type aux_val {Teuchos::ScalarTraits<real_type>::nan()};
 };
+
+template <class real_type, class LO>
+struct DropTolKokkos {
+  KOKKOS_INLINE_FUNCTION //NEW
+  DropTolKokkos()               = default;
+  KOKKOS_INLINE_FUNCTION //NEW
+  DropTolKokkos(DropTolKokkos const&) = default;
+  KOKKOS_INLINE_FUNCTION //NEW
+  DropTolKokkos(DropTolKokkos&&)      = default;
+
+  DropTolKokkos& operator=(DropTolKokkos const&) = default;
+  DropTolKokkos& operator=(DropTolKokkos&&)      = default;
+
+  KOKKOS_INLINE_FUNCTION //NEW
+  DropTolKokkos(LO col_, bool drop_)
+    : col{col_}
+    , drop{drop_} {}
+
+  LO col{-1};
+  LO drop{true};
+};
 }  // namespace Details
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -767,7 +780,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 		using ExecSpace = typename Node::execution_space;
 		using TeamPol = Kokkos::TeamPolicy<ExecSpace>;
 		using TeamMem = typename TeamPol::member_type;
-		using DropTol = Details::DropTol<real_type, LO>;
+		using DropTolKokkos = Details::DropTolKokkos<real_type, LO>;
 		
 		//move from host to device
 		ArrayView<const SC>  ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size());
@@ -779,7 +792,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 		
 		int algorithm = classicalAlgo;
 		Kokkos::View<LO*, ExecSpace>rownnzView("rownnzView", A_device.numRows());
-		auto drop_views = Kokkos::View<DropTol*, ExecSpace>("drop_views", A_device.nnz());
+		auto drop_views = Kokkos::View<DropTolKokkos*, ExecSpace>("drop_views", A_device.nnz());
 		//stackedTimer->stop("init");
 
 		//stackedTimer->start("loop");
@@ -790,74 +803,103 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 
 			size_t n = 0;
 			auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
+			
 			//find magnitudes
 			Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) {
 				LO col = rowView.colidx(colID);
 				if(row == col) {
-					drop_view(colID) = DropTol(0, 1, colID, false);
+					drop_view(colID) = DropTolKokkos(colID, true);
 					count++;
 				}
 				//Don't aggregate boundaries
 				else if(!boundaryNodesDevice(colID)) {
-					typename STS::magnitudeType aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(col) * ghostedDiagValsView(row))));  // eps^2*|a     _ii|*|a_jj|
-					typename STS::magnitudeType aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(colID) * rowView.value(colID))));                                            // |a_i     j|^2
-					drop_view(colID) = DropTol(aij, aiiajj, colID, false);
+					drop_view(colID) = DropTolKokkos(colID, false);
 					count++;
 				}
 			}, n);
+
+			size_t dropStart = n;
 			if (algorithm == unscaled_cut) {
-				Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) {
-					return a.val > b.val;
+				Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) {
+					if(x.drop || y.drop) {
+						return x.drop < y.drop;
+					}
+					else {
+						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x.col) * rowView.value(x.col))));                                            // |a_i     j|^2
+						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y.col) * rowView.value(y.col))));                                            // |a_i     j|^2
+						return x_aij > y_aij;
+					}
 				});
 
 				//find index where dropping starts
-				size_t dropStart;
 				Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) {
 					auto const& x = drop_view(i - 1);
 					auto const& y = drop_view(i);
-					auto a = x.val;
-					auto b = y.val;
-					if(a > realThreshold * b) {
+					typename STS::magnitudeType x_aij = 0;
+					typename STS::magnitudeType y_aij = 0;
+					if(!x.drop) {
+						x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x.col) * rowView.value(x.col))));                                            // |a_i     j|^2
+					}
+					if(!y.drop) {
+						y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y.col) * rowView.value(y.col))));                                            // |a_i     j|^2
+					}
+
+					if(x_aij > realThreshold * y_aij) {
 						if(i < min) {
 							min = i;
 						}
 					}
 				}, Kokkos::Min<size_t>(dropStart));
-
-				if(dropStart < n) {
-					Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) {
-						drop_view(i).drop = true;
-					});
-				}
           	 	} else if (algorithm == scaled_cut) {
-				Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) {
-					return a.val / a.diag > b.val / b.diag;
+				Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) {
+					if(x.drop || y.drop) {
+						return x.drop < y.drop;
+					}
+					else {
+						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x.col) * rowView.value(x.col))));                                            // |a_i     j|^2
+						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y.col) * rowView.value(y.col))));                                            // |a_i     j|^2
+						typename STS::magnitudeType x_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row))));  // eps^2*|a     _ii|*|a_jj|
+						typename STS::magnitudeType y_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row))));  // eps^2*|a     _ii|*|a_jj|
+						return x_aij / x_aiiajj > y_aij / y_aiiajj;
+					}
 				});
 
+
 				//find index where dropping starts
-				size_t dropStart;
 				Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) {
 					auto const& x = drop_view(i - 1);
 					auto const& y = drop_view(i);
-					auto a = x.val / x.diag;
-					auto b = y.val / y.diag;
-					if(a > realThreshold * b) {
+					typename STS::magnitudeType x_val = 0;
+					typename STS::magnitudeType y_val = 0;
+					if(!x.drop) {
+						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x.col) * rowView.value(x.col))));                                            // |a_i     j|^2
+						typename STS::magnitudeType x_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row))));  // eps^2*|a     _ii|*|a_jj|
+						x_val = x_aij / x_aiiajj;
+					}
+					if(!y.drop) {
+						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y.col) * rowView.value(y.col))));                                            // |a_i     j|^2
+						typename STS::magnitudeType y_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row))));  // eps^2*|a     _ii|*|a_jj|
+						y_val = y_aij / y_aiiajj;
+					}
+
+					if(x_val > realThreshold * y_val) {
 						if(i < min) {
 							min = i;
 						}
 					}
 				}, Kokkos::Min<size_t>(dropStart));
-
-				if(dropStart < n) {
-					Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) {
-						drop_view(i).drop = true;
-					});
-				}
 	  	 	}
-			Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTol const& a, DropTol const& b) {
-				return a.col < b.col;
+
+			if(dropStart < n) {
+				Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) {
+					drop_view(i).drop = true;
+				});
+			}
+
+			Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTolKokkos const& a, DropTolKokkos const& b) {
+		 		return a.col < b.col;
 			});
-		 
+
 		  	LO rownnz = 0;
 		  	GO rowDropped = 0;
 		  	Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) {

From cdee728089dcf993f67cf194dbc51575a4a766e5 Mon Sep 17 00:00:00 2001
From: Ian Halim <ihalim@ascicgpu031.sandia.gov>
Date: Mon, 22 Jul 2024 19:00:22 -0600
Subject: [PATCH 03/25] MueLu: Sorting Now Resembles numpy.argsort

Per Christian's request.
DropTolKokkos structure removed and replaced with a view of indices and a view of drop flags.
ORIGINAL code removed.
BuildKokkos removed.
Removed commented out timers.
Added comments.

Signed-off-by: Ian Halim <ihalim@ascicgpu031.sandia.gov>
---
 .../MueLu_CoalesceDropFactory_decl.hpp        |    1 -
 .../MueLu_CoalesceDropFactory_def.hpp         | 1736 +----------------
 2 files changed, 53 insertions(+), 1684 deletions(-)

diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp
index db5e9a291313..96b5e778f6bc 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_decl.hpp
@@ -160,7 +160,6 @@ class CoalesceDropFactory : public SingleLevelFactoryBase {
   //@}
 
   void Build(Level& currentLevel) const;  // Build
-  void BuildKokkos(Level& currentLevel) const;
 
  private:
   // pre-drop function
diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
index eeb3f91dbfd6..da606ab20ff6 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
@@ -61,8 +61,8 @@
 
 #include <Xpetra_IO.hpp>
 
-#include <Kokkos_NestedSort.hpp> //NEW
-#include <Kokkos_StdAlgorithms.hpp> //NEW
+#include <Kokkos_NestedSort.hpp>
+#include <Kokkos_StdAlgorithms.hpp>
 #include "MueLu_CoalesceDropFactory_decl.hpp"
 
 #include "MueLu_AmalgamationFactory.hpp"
@@ -115,27 +115,6 @@ struct DropTol {
   // CMS: Auxillary information for debugging info
   //      real_type aux_val {Teuchos::ScalarTraits<real_type>::nan()};
 };
-
-template <class real_type, class LO>
-struct DropTolKokkos {
-  KOKKOS_INLINE_FUNCTION //NEW
-  DropTolKokkos()               = default;
-  KOKKOS_INLINE_FUNCTION //NEW
-  DropTolKokkos(DropTolKokkos const&) = default;
-  KOKKOS_INLINE_FUNCTION //NEW
-  DropTolKokkos(DropTolKokkos&&)      = default;
-
-  DropTolKokkos& operator=(DropTolKokkos const&) = default;
-  DropTolKokkos& operator=(DropTolKokkos&&)      = default;
-
-  KOKKOS_INLINE_FUNCTION //NEW
-  DropTolKokkos(LO col_, bool drop_)
-    : col{col_}
-    , drop{drop_} {}
-
-  LO col{-1};
-  LO drop{true};
-};
 }  // namespace Details
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -529,180 +508,6 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 
         LO realnnz = 0;
         rows(0)    = 0;
-#define NEW 
-#ifdef ORIGINAL
-	for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
-	  size_t nnz          = A->getNumEntriesInLocalRow(row);
-          bool rowIsDirichlet = boundaryNodes[row];
-          ArrayView<const LO> indices;
-          ArrayView<const SC> vals;
-          A->getLocalRowView(row, indices, vals);
-
-          if (classicalAlgo == defaultAlgo) {
-            // FIXME the current predrop function uses the following
-            // FIXME    if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid )
-            // FIXME but the threshold doesn't take into account the rows' diagonal entries
-            // FIXME For now, hardwiring the dropping in here
-
-            LO rownnz = 0;
-            if (useSignedClassicalRS) {
-              // Signed classical RS style
-              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
-                LO col         = indices[colID];
-                MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]);
-                MT neg_aij     = -STS::real(vals[colID]);
-                /*                  if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID],
-                                     g_block_id.is_null() ? -1 :  g_block_id[row],
-                                     g_block_id.is_null() ? -1 :  g_block_id[col],
-                                     neg_aij, max_neg_aik);*/
-                if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) {
-                  columns[realnnz++] = col;
-                  rownnz++;
-                } else
-                  numDropped++;
-              }
-              rows(row + 1) = realnnz;
-            } else if (useSignedClassicalSA) {
-              // Signed classical SA style
-              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
-                LO col = indices[colID];
-
-                bool is_nonpositive = STS::real(vals[colID]) <= 0;
-                MT aiiajj           = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);                        // eps^2*|a_ii|*|a_jj|
-                MT aij              = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID]));  // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0
-                /*
-                if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID],
-                                     vals[colID],aij, aiiajj);
-                */
-
-                if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
-                  columns(realnnz++) = col;
-                  rownnz++;
-                } else
-                  numDropped++;
-              }
-              rows[row + 1] = realnnz;
-            } else {
-              // Standard abs classical
-              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
-                LO col    = indices[colID];
-                MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
-                MT aij    = STS::magnitude(vals[colID] * vals[colID]);                                            // |a_ij|^2
-
-                if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
-                  columns(realnnz++) = col;
-                  rownnz++;
-                } else
-                  numDropped++;
-              }
-              rows(row + 1) = realnnz;
-            }
-          } else {
-	    /* Cut Algorithm */
-            // CMS
-            using DropTol = Details::DropTol<real_type, LO>;
-            std::vector<DropTol> drop_vec;
-	    drop_vec.reserve(nnz);
-            const real_type zero = Teuchos::ScalarTraits<real_type>::zero();
-            const real_type one  = Teuchos::ScalarTraits<real_type>::one();
-            LO rownnz            = 0;
-            // NOTE: This probably needs to be fixed for rowsum
-
-            // find magnitudes
-	    for (LO colID = 0; colID < (LO)nnz; colID++) {
-              LO col = indices[colID];
-              if (row == col) {
-                drop_vec.emplace_back(zero, one, colID, false);
-                continue;
-              }
-
-              // Don't aggregate boundaries
-              if (boundaryNodes[colID]) continue;
-              typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
-              typename STS::magnitudeType aij    = STS::magnitude(vals[colID] * vals[colID]);                                            // |a_ij|^2
-              drop_vec.emplace_back(aij, aiiajj, colID, false);
-            }
-
-            const size_t n = drop_vec.size();
-
-            if (classicalAlgo == unscaled_cut) {
-              std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
-                return a.val > b.val;
-              });
-
-              bool drop = false;
-              for (size_t i = 1; i < n; ++i) {
-                if (!drop) {
-                  auto const& x = drop_vec[i - 1];
-                  auto const& y = drop_vec[i];
-                  auto a        = x.val;
-                  auto b        = y.val;
-                  if (a > realThreshold * b) {
-                    drop = true;
-#ifdef HAVE_MUELU_DEBUG
-                    if (distanceLaplacianCutVerbose) {
-                      std::cout << "DJS: KEEP, N, ROW:  " << i + 1 << ", " << n << ", " << row << std::endl;
-                    }
-#endif
-                  }
-                }
-                drop_vec[i].drop = drop;
-              }
-            } else if (classicalAlgo == scaled_cut) {
-              std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
-                return a.val / a.diag > b.val / b.diag;
-              });
-              bool drop = false;
-              //                  printf("[%d] Scaled Cut: ",(int)row);
-              //                  printf("%3d(%4s) ",indices[drop_vec[0].col],"keep");
-              for (size_t i = 1; i < n; ++i) {
-                if (!drop) {
-                  auto const& x = drop_vec[i - 1];
-                  auto const& y = drop_vec[i];
-                  auto a        = x.val / x.diag;
-                  auto b        = y.val / y.diag;
-                  if (a > realThreshold * b) {
-                    drop = true;
-
-#ifdef HAVE_MUELU_DEBUG
-                    if (distanceLaplacianCutVerbose) {
-                      std::cout << "DJS: KEEP, N, ROW:  " << i + 1 << ", " << n << ", " << row << std::endl;
-                    }
-#endif
-                  }
-                  //                      printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep");
-                }
-                drop_vec[i].drop = drop;
-              }
-              //                  printf("\n");
-            }
-            std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
-              return a.col < b.col;
-            });
-
-            for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) {
-              LO col = indices[drop_vec[idxID].col];
-              // don't drop diagonal
-              if (row == col) {
-                columns[realnnz++] = col;
-                rownnz++;
-                continue;
-              }
-
-              if (!drop_vec[idxID].drop) {
-                columns[realnnz++] = col;
-                rownnz++;
-              } else {
-                numDropped++;
-              }
-            }
-            // CMS
-            rows[row + 1] = realnnz;
-          }
-        }  // end for row
-#endif
-
-#ifdef NEW
 	if(classicalAlgo == defaultAlgo) {
             	SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel);
 		for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
@@ -772,15 +577,11 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
             		}
         	}  // end for row
 	}
-	else { //NEW START
-		//auto stackedTimer = rcp(new Teuchos::StackedTimer("timer"));
-		//Teuchos::TimeMonitor::setStackedTimer(stackedTimer);
-		//stackedTimer->start("init");
+	else {
             	SubFactoryMonitor m1(*this, "Cut Drop", currentLevel);
 		using ExecSpace = typename Node::execution_space;
 		using TeamPol = Kokkos::TeamPolicy<ExecSpace>;
 		using TeamMem = typename TeamPol::member_type;
-		using DropTolKokkos = Details::DropTolKokkos<real_type, LO>;
 		
 		//move from host to device
 		ArrayView<const SC>  ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size());
@@ -792,10 +593,9 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 		
 		int algorithm = classicalAlgo;
 		Kokkos::View<LO*, ExecSpace>rownnzView("rownnzView", A_device.numRows());
-		auto drop_views = Kokkos::View<DropTolKokkos*, ExecSpace>("drop_views", A_device.nnz());
-		//stackedTimer->stop("init");
+		auto drop_views = Kokkos::View<bool*, ExecSpace>("drop_views", A_device.nnz());
+		auto index_views = Kokkos::View<size_t*, ExecSpace>("index_views", A_device.nnz());
 
-		//stackedTimer->start("loop");
 		Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) {
 			LO row = teamMember.league_rank();
 			auto rowView = A_device.row(row);
@@ -803,45 +603,52 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 
 			size_t n = 0;
 			auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
-			
+			auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
+
 			//find magnitudes
-			Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&] (const LO colID, size_t &count) {
+			Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) {
+				index_view(colID) = colID;
 				LO col = rowView.colidx(colID);
+				//ignore diagonals for now, they are checked again later
 				if(row == col) {
-					drop_view(colID) = DropTolKokkos(colID, true);
+					drop_view(colID) = true;
 					count++;
 				}
 				//Don't aggregate boundaries
-				else if(!boundaryNodesDevice(colID)) {
-					drop_view(colID) = DropTolKokkos(colID, false);
+				else if(boundaryNodesDevice(colID)) {
+					drop_view(colID) = true;
+				}
+				else {
+					drop_view(colID) = false;
 					count++;
 				}
 			}, n);
 
 			size_t dropStart = n;
 			if (algorithm == unscaled_cut) {
-				Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) {
-					if(x.drop || y.drop) {
-						return x.drop < y.drop;
+				//push diagonals and boundaries to the right, sort everything else by aij on the left
+				Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) {
+					if(drop_view(x) || drop_view(y)) {
+						return drop_view(x) < drop_view(y);
 					}
 					else {
-						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x.col) * rowView.value(x.col))));                                            // |a_i     j|^2
-						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y.col) * rowView.value(y.col))));                                            // |a_i     j|^2
+						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x) * rowView.value(x))));
+						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y) * rowView.value(y))));
 						return x_aij > y_aij;
 					}
 				});
 
 				//find index where dropping starts
 				Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) {
-					auto const& x = drop_view(i - 1);
-					auto const& y = drop_view(i);
+					auto const& x = index_view(i - 1);
+					auto const& y = index_view(i);
 					typename STS::magnitudeType x_aij = 0;
 					typename STS::magnitudeType y_aij = 0;
-					if(!x.drop) {
-						x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x.col) * rowView.value(x.col))));                                            // |a_i     j|^2
+					if(!drop_view(x)) {
+						x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x) * rowView.value(x))));
 					}
-					if(!y.drop) {
-						y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y.col) * rowView.value(y.col))));                                            // |a_i     j|^2
+					if(!drop_view(y)) {
+						y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y) * rowView.value(y))));
 					}
 
 					if(x_aij > realThreshold * y_aij) {
@@ -851,34 +658,34 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 					}
 				}, Kokkos::Min<size_t>(dropStart));
           	 	} else if (algorithm == scaled_cut) {
-				Kokkos::Experimental::sort_team(teamMember, drop_view, [=](DropTolKokkos const& x, DropTolKokkos const& y) {
-					if(x.drop || y.drop) {
-						return x.drop < y.drop;
+				//push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left
+				Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) {
+					if(drop_view(x) || drop_view(y)) {
+						return drop_view(x) < drop_view(y);
 					}
 					else {
-						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x.col) * rowView.value(x.col))));                                            // |a_i     j|^2
-						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y.col) * rowView.value(y.col))));                                            // |a_i     j|^2
-						typename STS::magnitudeType x_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row))));  // eps^2*|a     _ii|*|a_jj|
-						typename STS::magnitudeType y_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row))));  // eps^2*|a     _ii|*|a_jj|
+						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x) * rowView.value(x))));
+						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y) * rowView.value(y))));
+						typename STS::magnitudeType x_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row))));
+						typename STS::magnitudeType y_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row))));
 						return x_aij / x_aiiajj > y_aij / y_aiiajj;
 					}
 				});
 
-
 				//find index where dropping starts
 				Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) {
-					auto const& x = drop_view(i - 1);
-					auto const& y = drop_view(i);
+					auto const& x = index_view(i - 1);
+					auto const& y = index_view(i);
 					typename STS::magnitudeType x_val = 0;
 					typename STS::magnitudeType y_val = 0;
-					if(!x.drop) {
-						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x.col) * rowView.value(x.col))));                                            // |a_i     j|^2
-						typename STS::magnitudeType x_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(x.col)) * ghostedDiagValsView(row))));  // eps^2*|a     _ii|*|a_jj|
+					if(!drop_view(x)) {
+						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x) * rowView.value(x))));
+						typename STS::magnitudeType x_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row))));
 						x_val = x_aij / x_aiiajj;
 					}
-					if(!y.drop) {
-						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y.col) * rowView.value(y.col))));                                            // |a_i     j|^2
-						typename STS::magnitudeType y_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(y.col)) * ghostedDiagValsView(row))));  // eps^2*|a     _ii|*|a_jj|
+					if(!drop_view(y)) {
+						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y) * rowView.value(y))));
+						typename STS::magnitudeType y_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row))));
 						y_val = y_aij / y_aiiajj;
 					}
 
@@ -890,22 +697,19 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 				}, Kokkos::Min<size_t>(dropStart));
 	  	 	}
 
+			//drop everything to the right of where values stop passing threshold 
 			if(dropStart < n) {
 				Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) {
-					drop_view(i).drop = true;
+					drop_view(index_view(i)) = true;
 				});
 			}
 
-			Kokkos::Experimental::sort_team(teamMember, drop_view, [](DropTolKokkos const& a, DropTolKokkos const& b) {
-		 		return a.col < b.col;
-			});
-
 		  	LO rownnz = 0;
 		  	GO rowDropped = 0;
 		  	Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) {
 				LO col = rowView.colidx(idxID);
 				//don't drop diagonal
-				if(row == col || !drop_view(idxID).drop) {
+				if(row == col || !drop_view(idxID)) {
 					keep++;
 				}
 				else {
@@ -913,1459 +717,25 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 					drop++;
 				}
 	  	 	}, rownnz, rowDropped);
+
 		  	globalnnz += rownnz;
 		  	totalDropped += rowDropped;
 			rownnzView(row) = rownnz;
 		}, realnnz, numDropped);
-		//stackedTimer->stop("loop");
-
-		//stackedTimer->start("remove");
-		
+	
+		//update column indices so that kept indices are aligned to the left for subview that happens later on
 		auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns);
 		Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1);
 		Kokkos::deep_copy(columns, columnsDevice);
 		
-		//stackedTimer->stop("remove");
-	
-		//update row indices
-		//stackedTimer->start("scan");
+		//update row indices by adding up new # of nnz in each row
 		auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows);
 		Kokkos::parallel_scan(Kokkos::RangePolicy<ExecSpace>(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) {
 			partial_sum += rownnzView(i);
 			if(is_final) rowsDevice(i+1) = partial_sum;
 		});
 		Kokkos::deep_copy(rows, rowsDevice);
-		//stackedTimer->stop("scan");
-	
-		//stackedTimer->stop("timer");
-		//stackedTimer->report(std::cout, Teuchos::DefaultComm<int>::getComm());
-	} //NEW END
-#endif
-
-        numTotal = A->getLocalNumEntries();
-
-        if (aggregationMayCreateDirichlet) {
-          // If the only element remaining after filtering is diagonal, mark node as boundary
-          for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
-            if (rows[row + 1] - rows[row] <= 1)
-              boundaryNodes[row] = true;
-          }
-        }
-
-        RCP<LWGraph> graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), A->getRowMap(), A->getColMap(), "thresholded graph of A"));
-        graph->SetBoundaryNodeMap(boundaryNodes);
-        if (GetVerbLevel() & Statistics1) {
-          GO numLocalBoundaryNodes  = 0;
-          GO numGlobalBoundaryNodes = 0;
-          for (size_t i = 0; i < boundaryNodes.size(); ++i)
-            if (boundaryNodes(i))
-              numLocalBoundaryNodes++;
-          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
-          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
-          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
-        }
-        Set(currentLevel, "Graph", graph);
-        Set(currentLevel, "DofsPerNode", 1);
-
-        // If we're doing signed classical, we might want to block-diagonalize *after* the dropping
-        if (generateColoringGraph) {
-          RCP<LWGraph> colorGraph;
-          RCP<const Import> importer = A->getCrsGraph()->getImporter();
-          BlockDiagonalizeGraph(graph, ghostedBlockNumber, colorGraph, importer);
-          Set(currentLevel, "Coloring Graph", colorGraph);
-          // #define CMS_DUMP
-#ifdef CMS_DUMP
-          {
-            Xpetra::IO<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Write("m_regular_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast<LWGraph>(graph)->GetCrsGraph());
-            Xpetra::IO<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Write("m_color_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast<LWGraph>(colorGraph)->GetCrsGraph());
-            // int rank = graph->GetDomainMap()->getComm()->getRank();
-            // {
-            //   std::ofstream ofs(std::string("m_color_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out);
-            //   RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs));
-            //   colorGraph->print(*fancy,Debug);
-            // }
-            // {
-            //   std::ofstream ofs(std::string("m_regular_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out);
-            //   RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs));
-            //   graph->print(*fancy,Debug);
-            // }
-          }
-#endif
-        }  // end generateColoringGraph
-      } else if (BlockSize > 1 && threshold == STS::zero()) {
-        // Case 3:  Multiple DOF/node problem without dropping
-        const RCP<const Map> rowMap = A->getRowMap();
-        const RCP<const Map> colMap = A->getColMap();
-
-        graphType = "amalgamated";
-
-        // build node row map (uniqueMap) and node column map (nonUniqueMap)
-        // the arrays rowTranslation and colTranslation contain the local node id
-        // given a local dof id. The data is calculated by the AmalgamationFactory and
-        // stored in the variable container "UnAmalgamationInfo"
-        RCP<const Map> uniqueMap    = amalInfo->getNodeRowMap();
-        RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
-        Array<LO> rowTranslation    = *(amalInfo->getRowTranslation());
-        Array<LO> colTranslation    = *(amalInfo->getColTranslation());
-
-        // get number of local nodes
-        LO numRows = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
-
-        // Allocate space for the local graph
-        typename LWGraph::row_type::non_const_type rows("rows", numRows + 1);
-        typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
-
-        typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows);
-        Kokkos::deep_copy(amalgBoundaryNodes, false);
-
-        // Detect and record rows that correspond to Dirichlet boundary conditions
-        // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual.  Size
-        // TODO the array one bigger than the number of local rows, and the last entry can
-        // TODO hold the actual number of boundary nodes.  Clever, huh?
-        ArrayRCP<bool> pointBoundaryNodes;
-        pointBoundaryNodes = Teuchos::arcp_const_cast<bool>(MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows(*A, dirichletThreshold));
-        if (rowSumTol > 0.)
-          Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes);
-
-        // extract striding information
-        LO blkSize     = A->GetFixedBlockSize();  //< the full block size (number of dofs per node in strided map)
-        LO blkId       = -1;                      //< the block id within the strided map (or -1 if it is a full block map)
-        LO blkPartSize = A->GetFixedBlockSize();  //< stores the size of the block within the strided map
-        if (A->IsView("stridedMaps") == true) {
-          Teuchos::RCP<const Map> myMap         = A->getRowMap("stridedMaps");
-          Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
-          TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
-          blkSize = Teuchos::as<const LO>(strMap->getFixedBlockSize());
-          blkId   = strMap->getStridedBlockId();
-          if (blkId > -1)
-            blkPartSize = Teuchos::as<LO>(strMap->getStridingData()[blkId]);
-        }
-
-        // loop over all local nodes
-        LO realnnz = 0;
-        rows(0)    = 0;
-        Array<LO> indicesExtra;
-        for (LO row = 0; row < numRows; row++) {
-          ArrayView<const LO> indices;
-          indicesExtra.resize(0);
-
-          // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet
-          // Note, that pointBoundaryNodes lives on the dofmap (and not the node map).
-          // Therefore, looping over all dofs is fine here. We use blkPartSize as we work
-          // with local ids.
-          // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet)
-          // node.
-          bool isBoundary = false;
-          if (pL.get<bool>("aggregation: greedy Dirichlet") == true) {
-            for (LO j = 0; j < blkPartSize; j++) {
-              if (pointBoundaryNodes[row * blkPartSize + j]) {
-                isBoundary = true;
-                break;
-              }
-            }
-          } else {
-            isBoundary = true;
-            for (LO j = 0; j < blkPartSize; j++) {
-              if (!pointBoundaryNodes[row * blkPartSize + j]) {
-                isBoundary = false;
-                break;
-              }
-            }
-          }
-
-          // Merge rows of A
-          // The array indicesExtra contains local column node ids for the current local node "row"
-          if (!isBoundary)
-            MergeRows(*A, row, indicesExtra, colTranslation);
-          else
-            indicesExtra.push_back(row);
-          indices = indicesExtra;
-          numTotal += indices.size();
-
-          // add the local column node ids to the full columns array which
-          // contains the local column node ids for all local node rows
-          LO nnz = indices.size(), rownnz = 0;
-          for (LO colID = 0; colID < nnz; colID++) {
-            LO col             = indices[colID];
-            columns(realnnz++) = col;
-            rownnz++;
-          }
-
-          if (rownnz == 1) {
-            // If the only element remaining after filtering is diagonal, mark node as boundary
-            // FIXME: this should really be replaced by the following
-            //    if (indices.size() == 1 && indices[0] == row)
-            //        boundaryNodes[row] = true;
-            // We do not do it this way now because there is no framework for distinguishing isolated
-            // and boundary nodes in the aggregation algorithms
-            amalgBoundaryNodes[row] = true;
-          }
-          rows(row + 1) = realnnz;
-        }  // for (LO row = 0; row < numRows; row++)
-
-        RCP<LWGraph> graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A"));
-        graph->SetBoundaryNodeMap(amalgBoundaryNodes);
-
-        if (GetVerbLevel() & Statistics1) {
-          GO numLocalBoundaryNodes  = 0;
-          GO numGlobalBoundaryNodes = 0;
-
-          for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
-            if (amalgBoundaryNodes(i))
-              numLocalBoundaryNodes++;
-
-          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
-          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
-          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes
-                                  << " agglomerated Dirichlet nodes" << std::endl;
-        }
-
-        Set(currentLevel, "Graph", graph);
-        Set(currentLevel, "DofsPerNode", blkSize);  // full block size
-
-      } else if (BlockSize > 1 && threshold != STS::zero()) {
-        // Case 4:  Multiple DOF/node problem with dropping
-        const RCP<const Map> rowMap = A->getRowMap();
-        const RCP<const Map> colMap = A->getColMap();
-        graphType                   = "amalgamated";
-
-        // build node row map (uniqueMap) and node column map (nonUniqueMap)
-        // the arrays rowTranslation and colTranslation contain the local node id
-        // given a local dof id. The data is calculated by the AmalgamationFactory and
-        // stored in the variable container "UnAmalgamationInfo"
-        RCP<const Map> uniqueMap    = amalInfo->getNodeRowMap();
-        RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
-        Array<LO> rowTranslation    = *(amalInfo->getRowTranslation());
-        Array<LO> colTranslation    = *(amalInfo->getColTranslation());
-
-        // get number of local nodes
-        LO numRows = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
-
-        // Allocate space for the local graph
-        typename LWGraph::row_type::non_const_type rows("rows", numRows + 1);
-        typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
-
-        typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows);
-        Kokkos::deep_copy(amalgBoundaryNodes, false);
-
-        // Detect and record rows that correspond to Dirichlet boundary conditions
-        // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual.  Size
-        // TODO the array one bigger than the number of local rows, and the last entry can
-        // TODO hold the actual number of boundary nodes.  Clever, huh?
-        auto pointBoundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
-        if (rowSumTol > 0.)
-          Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes);
-
-        // extract striding information
-        LO blkSize     = A->GetFixedBlockSize();  //< the full block size (number of dofs per node in strided map)
-        LO blkId       = -1;                      //< the block id within the strided map (or -1 if it is a full block map)
-        LO blkPartSize = A->GetFixedBlockSize();  //< stores the size of the block within the strided map
-        if (A->IsView("stridedMaps") == true) {
-          Teuchos::RCP<const Map> myMap         = A->getRowMap("stridedMaps");
-          Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
-          TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
-          blkSize = Teuchos::as<const LO>(strMap->getFixedBlockSize());
-          blkId   = strMap->getStridedBlockId();
-          if (blkId > -1)
-            blkPartSize = Teuchos::as<LO>(strMap->getStridingData()[blkId]);
-        }
-
-        // extract diagonal data for dropping strategy
-        RCP<Vector> ghostedDiag                  = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixOverlappedDiagonal(*A);
-        const ArrayRCP<const SC> ghostedDiagVals = ghostedDiag->getData(0);
-
-        // loop over all local nodes
-        LO realnnz = 0;
-        rows[0]    = 0;
-        Array<LO> indicesExtra;
-        for (LO row = 0; row < numRows; row++) {
-          ArrayView<const LO> indices;
-          indicesExtra.resize(0);
-
-          // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet
-          // Note, that pointBoundaryNodes lives on the dofmap (and not the node map).
-          // Therefore, looping over all dofs is fine here. We use blkPartSize as we work
-          // with local ids.
-          // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet)
-          // node.
-          bool isBoundary = false;
-          if (pL.get<bool>("aggregation: greedy Dirichlet") == true) {
-            for (LO j = 0; j < blkPartSize; j++) {
-              if (pointBoundaryNodes[row * blkPartSize + j]) {
-                isBoundary = true;
-                break;
-              }
-            }
-          } else {
-            isBoundary = true;
-            for (LO j = 0; j < blkPartSize; j++) {
-              if (!pointBoundaryNodes[row * blkPartSize + j]) {
-                isBoundary = false;
-                break;
-              }
-            }
-          }
-
-          // Merge rows of A
-          // The array indicesExtra contains local column node ids for the current local node "row"
-          if (!isBoundary)
-            MergeRowsWithDropping(*A, row, ghostedDiagVals, threshold, indicesExtra, colTranslation);
-          else
-            indicesExtra.push_back(row);
-          indices = indicesExtra;
-          numTotal += indices.size();
-
-          // add the local column node ids to the full columns array which
-          // contains the local column node ids for all local node rows
-          LO nnz = indices.size(), rownnz = 0;
-          for (LO colID = 0; colID < nnz; colID++) {
-            LO col             = indices[colID];
-            columns[realnnz++] = col;
-            rownnz++;
-          }
-
-          if (rownnz == 1) {
-            // If the only element remaining after filtering is diagonal, mark node as boundary
-            // FIXME: this should really be replaced by the following
-            //    if (indices.size() == 1 && indices[0] == row)
-            //        boundaryNodes[row] = true;
-            // We do not do it this way now because there is no framework for distinguishing isolated
-            // and boundary nodes in the aggregation algorithms
-            amalgBoundaryNodes[row] = true;
-          }
-          rows[row + 1] = realnnz;
-        }  // for (LO row = 0; row < numRows; row++)
-        // columns.resize(realnnz);
-
-        RCP<LWGraph> graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A"));
-        graph->SetBoundaryNodeMap(amalgBoundaryNodes);
-
-        if (GetVerbLevel() & Statistics1) {
-          GO numLocalBoundaryNodes  = 0;
-          GO numGlobalBoundaryNodes = 0;
-
-          for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
-            if (amalgBoundaryNodes(i))
-              numLocalBoundaryNodes++;
-
-          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
-          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
-          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes
-                                  << " agglomerated Dirichlet nodes" << std::endl;
-        }
-
-        Set(currentLevel, "Graph", graph);
-        Set(currentLevel, "DofsPerNode", blkSize);  // full block size
-      }
-
-    } else if (algo == "distance laplacian") {
-      LO blkSize   = A->GetFixedBlockSize();
-      GO indexBase = A->getRowMap()->getIndexBase();
-      // [*0*] : FIXME
-      // ap: somehow, if I move this line to [*1*], Belos throws an error
-      // I'm not sure what's going on. Do we always have to Get data, if we did
-      // DeclareInput for it?
-      //        RCP<RealValuedMultiVector> Coords = Get< RCP<RealValuedMultiVector > >(currentLevel, "Coordinates");
-
-      // Detect and record rows that correspond to Dirichlet boundary conditions
-      // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual.  Size
-      // TODO the array one bigger than the number of local rows, and the last entry can
-      // TODO hold the actual number of boundary nodes.  Clever, huh?
-      auto pointBoundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
-      if (rowSumTol > 0.)
-        Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes);
-
-      if ((blkSize == 1) && (threshold == STS::zero())) {
-        // Trivial case: scalar problem, no dropping. Can return original graph
-        RCP<LWGraph> graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A"));
-        graph->SetBoundaryNodeMap(pointBoundaryNodes);
-        graphType = "unamalgamated";
-        numTotal  = A->getLocalNumEntries();
-
-        if (GetVerbLevel() & Statistics1) {
-          GO numLocalBoundaryNodes  = 0;
-          GO numGlobalBoundaryNodes = 0;
-          for (size_t i = 0; i < pointBoundaryNodes.size(); ++i)
-            if (pointBoundaryNodes(i))
-              numLocalBoundaryNodes++;
-          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
-          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
-          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
-        }
-
-        Set(currentLevel, "DofsPerNode", blkSize);
-        Set(currentLevel, "Graph", graph);
-
-      } else {
-        // ap: We make quite a few assumptions here; general case may be a lot different,
-        // but much much harder to implement. We assume that:
-        //  1) all maps are standard maps, not strided maps
-        //  2) global indices of dofs in A are related to dofs in coordinates in a simple arithmetic
-        //     way: rows i*blkSize, i*blkSize+1, ..., i*blkSize + (blkSize-1) correspond to node i
-        //
-        // NOTE: Potentially, some of the code below could be simplified with UnAmalgamationInfo,
-        // but as I totally don't understand that code, here is my solution
-
-        // [*1*]: see [*0*]
-
-        // Check that the number of local coordinates is consistent with the #rows in A
-        TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() / blkSize != Coords->getLocalLength(), Exceptions::Incompatible,
-                                   "Coordinate vector length (" << Coords->getLocalLength() << ") is incompatible with number of rows in A (" << A->getRowMap()->getLocalNumElements() << ") by modulo block size (" << blkSize << ").");
-
-        const RCP<const Map> colMap = A->getColMap();
-        RCP<const Map> uniqueMap, nonUniqueMap;
-        Array<LO> colTranslation;
-        if (blkSize == 1) {
-          uniqueMap    = A->getRowMap();
-          nonUniqueMap = A->getColMap();
-          graphType    = "unamalgamated";
-
-        } else {
-          uniqueMap = Coords->getMap();
-          TEUCHOS_TEST_FOR_EXCEPTION(uniqueMap->getIndexBase() != indexBase, Exceptions::Incompatible,
-                                     "Different index bases for matrix and coordinates");
-
-          AmalgamationFactory::AmalgamateMap(*(A->getColMap()), *A, nonUniqueMap, colTranslation);
-
-          graphType = "amalgamated";
-        }
-        LO numRows = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
-
-        RCP<RealValuedMultiVector> ghostedCoords;
-        RCP<Vector> ghostedLaplDiag;
-        Teuchos::ArrayRCP<SC> ghostedLaplDiagData;
-        if (threshold != STS::zero()) {
-          // Get ghost coordinates
-          RCP<const Import> importer;
-          {
-            SubFactoryMonitor m1(*this, "Import construction", currentLevel);
-            if (blkSize == 1 && realA->getCrsGraph()->getImporter() != Teuchos::null) {
-              GetOStream(Warnings1) << "Using existing importer from matrix graph" << std::endl;
-              importer = realA->getCrsGraph()->getImporter();
-            } else {
-              GetOStream(Warnings0) << "Constructing new importer instance" << std::endl;
-              importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
-            }
-          }  // subtimer
-          ghostedCoords = Xpetra::MultiVectorFactory<real_type, LO, GO, NO>::Build(nonUniqueMap, Coords->getNumVectors());
-          {
-            SubFactoryMonitor m1(*this, "Coordinate import", currentLevel);
-            ghostedCoords->doImport(*Coords, *importer, Xpetra::INSERT);
-          }  // subtimer
-
-          // Construct Distance Laplacian diagonal
-          RCP<Vector> localLaplDiag = VectorFactory::Build(uniqueMap);
-          Array<LO> indicesExtra;
-          Teuchos::Array<Teuchos::ArrayRCP<const real_type>> coordData;
-          if (threshold != STS::zero()) {
-            const size_t numVectors = ghostedCoords->getNumVectors();
-            coordData.reserve(numVectors);
-            for (size_t j = 0; j < numVectors; j++) {
-              Teuchos::ArrayRCP<const real_type> tmpData = ghostedCoords->getData(j);
-              coordData.push_back(tmpData);
-            }
-          }
-          {
-            SubFactoryMonitor m1(*this, "Laplacian local diagonal", currentLevel);
-            ArrayRCP<SC> localLaplDiagData = localLaplDiag->getDataNonConst(0);
-            for (LO row = 0; row < numRows; row++) {
-              ArrayView<const LO> indices;
-
-              if (blkSize == 1) {
-                ArrayView<const SC> vals;
-                A->getLocalRowView(row, indices, vals);
-
-              } else {
-                // Merge rows of A
-                indicesExtra.resize(0);
-                MergeRows(*A, row, indicesExtra, colTranslation);
-                indices = indicesExtra;
-              }
-
-              LO nnz               = indices.size();
-              bool haveAddedToDiag = false;
-              for (LO colID = 0; colID < nnz; colID++) {
-                const LO col = indices[colID];
-
-                if (row != col) {
-                  if (use_dlap_weights == SINGLE_WEIGHTS) {
-                    /*printf("[%d,%d] Unweighted Distance = %6.4e Weighted Distance = %6.4e\n",row,col,
-                           MueLu::Utilities<real_type,LO,GO,NO>::Distance2(coordData, row, col),
-                           MueLu::Utilities<real_type,LO,GO,NO>::Distance2(dlap_weights(),coordData, row, col));*/
-                    localLaplDiagData[row] += STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(), coordData, row, col);
-                  } else if (use_dlap_weights == BLOCK_WEIGHTS) {
-                    int block_id    = row % interleaved_blocksize;
-                    int block_start = block_id * interleaved_blocksize;
-                    localLaplDiagData[row] += STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col);
-                  } else {
-                    //                    printf("[%d,%d] Unweighted Distance = %6.4e\n",row,col,MueLu::Utilities<real_type,LO,GO,NO>::Distance2(coordData, row, col));
-                    localLaplDiagData[row] += STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(coordData, row, col);
-                  }
-                  haveAddedToDiag = true;
-                }
-              }
-              // Deal with the situation where boundary conditions have only been enforced on rows, but not on columns.
-              // We enforce dropping of these entries by assigning a very large number to the diagonal entries corresponding to BCs.
-              if (!haveAddedToDiag)
-                localLaplDiagData[row] = STS::rmax();
-            }
-          }  // subtimer
-          {
-            SubFactoryMonitor m1(*this, "Laplacian distributed diagonal", currentLevel);
-            ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
-            ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
-            ghostedLaplDiagData = ghostedLaplDiag->getDataNonConst(0);
-          }  // subtimer
-
-        } else {
-          GetOStream(Runtime0) << "Skipping distance laplacian construction due to 0 threshold" << std::endl;
-        }
-
-        // NOTE: ghostedLaplDiagData might be zero if we don't actually calculate the laplacian
-
-        // allocate space for the local graph
-        typename LWGraph::row_type::non_const_type rows("rows", numRows + 1);
-        typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
-
-#ifdef HAVE_MUELU_DEBUG
-        // DEBUGGING
-        for (LO i = 0; i < (LO)columns.size(); i++) columns[i] = -666;
-#endif
-
-        // Extra array for if we're allowing symmetrization with cutting
-        ArrayRCP<LO> rows_stop;
-        bool use_stop_array = threshold != STS::zero() && distanceLaplacianAlgo == scaled_cut_symmetric;
-        if (use_stop_array)
-          // rows_stop = typename LWGraph::row_type::non_const_type("rows_stop", numRows);
-          rows_stop.resize(numRows);
-
-        typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows);
-        Kokkos::deep_copy(amalgBoundaryNodes, false);
-
-        LO realnnz = 0;
-        rows(0)    = 0;
-
-        Array<LO> indicesExtra;
-        {
-          SubFactoryMonitor m1(*this, "Laplacian dropping", currentLevel);
-          Teuchos::Array<Teuchos::ArrayRCP<const real_type>> coordData;
-          if (threshold != STS::zero()) {
-            const size_t numVectors = ghostedCoords->getNumVectors();
-            coordData.reserve(numVectors);
-            for (size_t j = 0; j < numVectors; j++) {
-              Teuchos::ArrayRCP<const real_type> tmpData = ghostedCoords->getData(j);
-              coordData.push_back(tmpData);
-            }
-          }
-
-          ArrayView<const SC> vals;  // CMS hackery
-          for (LO row = 0; row < numRows; row++) {
-            ArrayView<const LO> indices;
-            indicesExtra.resize(0);
-            bool isBoundary = false;
-
-            if (blkSize == 1) {
-              //	      ArrayView<const SC>     vals;//CMS uncomment
-              A->getLocalRowView(row, indices, vals);
-              isBoundary = pointBoundaryNodes[row];
-            } else {
-              // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet
-              for (LO j = 0; j < blkSize; j++) {
-                if (!pointBoundaryNodes[row * blkSize + j]) {
-                  isBoundary = false;
-                  break;
-                }
-              }
-
-              // Merge rows of A
-              if (!isBoundary)
-                MergeRows(*A, row, indicesExtra, colTranslation);
-              else
-                indicesExtra.push_back(row);
-              indices = indicesExtra;
-            }
-            numTotal += indices.size();
-
-            LO nnz = indices.size(), rownnz = 0;
-
-            if (use_stop_array) {
-              rows(row + 1) = rows(row) + nnz;
-              realnnz       = rows(row);
-            }
-
-            if (threshold != STS::zero()) {
-              // default
-              if (distanceLaplacianAlgo == defaultAlgo) {
-                /* Standard Distance Laplacian */
-                for (LO colID = 0; colID < nnz; colID++) {
-                  LO col = indices[colID];
-
-                  if (row == col) {
-                    columns(realnnz++) = col;
-                    rownnz++;
-                    continue;
-                  }
-
-                  // We do not want the distance Laplacian aggregating boundary nodes
-                  if (isBoundary) continue;
-
-                  SC laplVal;
-                  if (use_dlap_weights == SINGLE_WEIGHTS) {
-                    laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(), coordData, row, col);
-                  } else if (use_dlap_weights == BLOCK_WEIGHTS) {
-                    int block_id    = row % interleaved_blocksize;
-                    int block_start = block_id * interleaved_blocksize;
-                    laplVal         = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col);
-                  } else {
-                    laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(coordData, row, col);
-                  }
-                  real_type aiiajj = STS::magnitude(realThreshold * realThreshold * ghostedLaplDiagData[row] * ghostedLaplDiagData[col]);
-                  real_type aij    = STS::magnitude(laplVal * laplVal);
-
-                  if (aij > aiiajj) {
-                    columns(realnnz++) = col;
-                    rownnz++;
-                  } else {
-                    numDropped++;
-                  }
-                }
-              } else {
-                /* Cut Algorithm */
-                using DropTol = Details::DropTol<real_type, LO>;
-                std::vector<DropTol> drop_vec;
-                drop_vec.reserve(nnz);
-                const real_type zero = Teuchos::ScalarTraits<real_type>::zero();
-                const real_type one  = Teuchos::ScalarTraits<real_type>::one();
-
-                // find magnitudes
-                for (LO colID = 0; colID < nnz; colID++) {
-                  LO col = indices[colID];
-
-                  if (row == col) {
-                    drop_vec.emplace_back(zero, one, colID, false);
-                    continue;
-                  }
-                  // We do not want the distance Laplacian aggregating boundary nodes
-                  if (isBoundary) continue;
-
-                  SC laplVal;
-                  if (use_dlap_weights == SINGLE_WEIGHTS) {
-                    laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(), coordData, row, col);
-                  } else if (use_dlap_weights == BLOCK_WEIGHTS) {
-                    int block_id    = row % interleaved_blocksize;
-                    int block_start = block_id * interleaved_blocksize;
-                    laplVal         = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col);
-                  } else {
-                    laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(coordData, row, col);
-                  }
-
-                  real_type aiiajj = STS::magnitude(ghostedLaplDiagData[row] * ghostedLaplDiagData[col]);
-                  real_type aij    = STS::magnitude(laplVal * laplVal);
-
-                  drop_vec.emplace_back(aij, aiiajj, colID, false);
-                }
-
-                const size_t n = drop_vec.size();
-
-                if (distanceLaplacianAlgo == unscaled_cut) {
-                  std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
-                    return a.val > b.val;
-                  });
-
-                  bool drop = false;
-                  for (size_t i = 1; i < n; ++i) {
-                    if (!drop) {
-                      auto const& x = drop_vec[i - 1];
-                      auto const& y = drop_vec[i];
-                      auto a        = x.val;
-                      auto b        = y.val;
-                      if (a > realThreshold * b) {
-                        drop = true;
-#ifdef HAVE_MUELU_DEBUG
-                        if (distanceLaplacianCutVerbose) {
-                          std::cout << "DJS: KEEP, N, ROW:  " << i + 1 << ", " << n << ", " << row << std::endl;
-                        }
-#endif
-                      }
-                    }
-                    drop_vec[i].drop = drop;
-                  }
-                } else if (distanceLaplacianAlgo == scaled_cut || distanceLaplacianAlgo == scaled_cut_symmetric) {
-                  std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
-                    return a.val / a.diag > b.val / b.diag;
-                  });
-
-                  bool drop = false;
-                  for (size_t i = 1; i < n; ++i) {
-                    if (!drop) {
-                      auto const& x = drop_vec[i - 1];
-                      auto const& y = drop_vec[i];
-                      auto a        = x.val / x.diag;
-                      auto b        = y.val / y.diag;
-                      if (a > realThreshold * b) {
-                        drop = true;
-#ifdef HAVE_MUELU_DEBUG
-                        if (distanceLaplacianCutVerbose) {
-                          std::cout << "DJS: KEEP, N, ROW:  " << i + 1 << ", " << n << ", " << row << std::endl;
-                        }
-#endif
-                      }
-                    }
-                    drop_vec[i].drop = drop;
-                  }
-                }
-
-                std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
-                  return a.col < b.col;
-                });
-
-                for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) {
-                  LO col = indices[drop_vec[idxID].col];
-
-                  // don't drop diagonal
-                  if (row == col) {
-                    columns(realnnz++) = col;
-                    rownnz++;
-                    //		    printf("(%d,%d) KEEP %13s matrix = %6.4e\n",row,row,"DIAGONAL",drop_vec[idxID].aux_val);
-                    continue;
-                  }
-
-                  if (!drop_vec[idxID].drop) {
-                    columns(realnnz++) = col;
-                    //		    printf("(%d,%d) KEEP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val);
-                    rownnz++;
-                  } else {
-                    //		    printf("(%d,%d) DROP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val);
-                    numDropped++;
-                  }
-                }
-              }
-            } else {
-              // Skip laplace calculation and threshold comparison for zero threshold
-              for (LO colID = 0; colID < nnz; colID++) {
-                LO col             = indices[colID];
-                columns(realnnz++) = col;
-                rownnz++;
-              }
-            }
-
-            if (rownnz == 1) {
-              // If the only element remaining after filtering is diagonal, mark node as boundary
-              // FIXME: this should really be replaced by the following
-              //    if (indices.size() == 1 && indices[0] == row)
-              //        boundaryNodes[row] = true;
-              // We do not do it this way now because there is no framework for distinguishing isolated
-              // and boundary nodes in the aggregation algorithms
-              amalgBoundaryNodes[row] = true;
-            }
-
-            if (use_stop_array)
-              rows_stop[row] = rownnz + rows[row];
-            else
-              rows[row + 1] = realnnz;
-          }  // for (LO row = 0; row < numRows; row++)
-
-        }  // subtimer
-
-        if (use_stop_array) {
-          // Do symmetrization of the cut matrix
-          // NOTE: We assume nested row/column maps here
-          for (LO row = 0; row < numRows; row++) {
-            for (LO colidx = rows[row]; colidx < rows_stop[row]; colidx++) {
-              LO col = columns[colidx];
-              if (col >= numRows) continue;
-
-              bool found = false;
-              for (LO t_col = rows(col); !found && t_col < rows_stop[col]; t_col++) {
-                if (columns[t_col] == row)
-                  found = true;
-              }
-              // We didn't find the transpose buddy, so let's symmetrize, unless we'd be symmetrizing
-              // into a Dirichlet unknown.  In that case don't.
-              if (!found && !pointBoundaryNodes[col] && Teuchos::as<typename LWGraph::row_type::value_type>(rows_stop[col]) < rows[col + 1]) {
-                LO new_idx = rows_stop[col];
-                //		  printf("(%d,%d) SYMADD entry\n",col,row);
-                columns[new_idx] = row;
-                rows_stop[col]++;
-                numDropped--;
-              }
-            }
-          }
-
-          // Condense everything down
-          LO current_start = 0;
-          for (LO row = 0; row < numRows; row++) {
-            LO old_start = current_start;
-            for (LO col = rows(row); col < rows_stop[row]; col++) {
-              if (current_start != col) {
-                columns(current_start) = columns(col);
-              }
-              current_start++;
-            }
-            rows[row] = old_start;
-          }
-          rows(numRows) = realnnz = current_start;
-        }
-
-        RCP<LWGraph> graph;
-        {
-          SubFactoryMonitor m1(*this, "Build amalgamated graph", currentLevel);
-          graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A"));
-          graph->SetBoundaryNodeMap(amalgBoundaryNodes);
-        }  // subtimer
-
-        if (GetVerbLevel() & Statistics1) {
-          GO numLocalBoundaryNodes  = 0;
-          GO numGlobalBoundaryNodes = 0;
-
-          for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
-            if (amalgBoundaryNodes(i))
-              numLocalBoundaryNodes++;
-
-          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
-          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
-          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " agglomerated Dirichlet nodes"
-                                  << " using threshold " << dirichletThreshold << std::endl;
-        }
-
-        Set(currentLevel, "Graph", graph);
-        Set(currentLevel, "DofsPerNode", blkSize);
-      }
-    }
-
-    if ((GetVerbLevel() & Statistics1) && !(A->GetFixedBlockSize() > 1 && threshold != STS::zero())) {
-      RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
-      GO numGlobalTotal, numGlobalDropped;
-      MueLu_sumAll(comm, numTotal, numGlobalTotal);
-      MueLu_sumAll(comm, numDropped, numGlobalDropped);
-      GetOStream(Statistics1) << "Number of dropped entries in " << graphType << " matrix graph: " << numGlobalDropped << "/" << numGlobalTotal;
-      if (numGlobalTotal != 0)
-        GetOStream(Statistics1) << " (" << 100 * Teuchos::as<double>(numGlobalDropped) / Teuchos::as<double>(numGlobalTotal) << "%)";
-      GetOStream(Statistics1) << std::endl;
-    }
-
-  } else {
-    // what Tobias has implemented
-
-    SC threshold = as<SC>(pL.get<double>("aggregation: drop tol"));
-    // GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
-    GetOStream(Runtime0) << "algorithm = \""
-                         << "failsafe"
-                         << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
-    Set<bool>(currentLevel, "Filtering", (threshold != STS::zero()));
-
-    RCP<const Map> rowMap = A->getRowMap();
-    RCP<const Map> colMap = A->getColMap();
-
-    LO blockdim  = 1;                       // block dim for fixed size blocks
-    GO indexBase = rowMap->getIndexBase();  // index base of maps
-    GO offset    = 0;
-
-    // 1) check for blocking/striding information
-    if (A->IsView("stridedMaps") &&
-        Teuchos::rcp_dynamic_cast<const StridedMap>(A->getRowMap("stridedMaps")) != Teuchos::null) {
-      Xpetra::viewLabel_t oldView  = A->SwitchToView("stridedMaps");  // note: "stridedMaps are always non-overlapping (correspond to range and domain maps!)
-      RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(A->getRowMap());
-      TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null, Exceptions::BadCast, "MueLu::CoalesceFactory::Build: cast to strided row map failed.");
-      blockdim = strMap->getFixedBlockSize();
-      offset   = strMap->getOffset();
-      oldView  = A->SwitchToView(oldView);
-      GetOStream(Statistics1) << "CoalesceDropFactory::Build():"
-                              << " found blockdim=" << blockdim << " from strided maps. offset=" << offset << std::endl;
-    } else
-      GetOStream(Statistics1) << "CoalesceDropFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl;
-
-    // 2) get row map for amalgamated matrix (graph of A)
-    //    with same distribution over all procs as row map of A
-    RCP<const Map> nodeMap = amalInfo->getNodeRowMap();
-    GetOStream(Statistics1) << "CoalesceDropFactory: nodeMap " << nodeMap->getLocalNumElements() << "/" << nodeMap->getGlobalNumElements() << " elements" << std::endl;
-
-    // 3) create graph of amalgamated matrix
-    RCP<CrsGraph> crsGraph = CrsGraphFactory::Build(nodeMap, A->getLocalMaxNumRowEntries() * blockdim);
-
-    LO numRows  = A->getRowMap()->getLocalNumElements();
-    LO numNodes = nodeMap->getLocalNumElements();
-    typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numNodes);
-    Kokkos::deep_copy(amalgBoundaryNodes, false);
-    const ArrayRCP<int> numberDirichletRowsPerNode(numNodes, 0);  // helper array counting the number of Dirichlet nodes associated with node
-    bool bIsDiagonalEntry = false;                                // boolean flag stating that grid==gcid
-
-    // 4) do amalgamation. generate graph of amalgamated matrix
-    //    Note, this code is much more inefficient than the leightwight implementation
-    //    Most of the work has already been done in the AmalgamationFactory
-    for (LO row = 0; row < numRows; row++) {
-      // get global DOF id
-      GO grid = rowMap->getGlobalElement(row);
-
-      // reinitialize boolean helper variable
-      bIsDiagonalEntry = false;
-
-      // translate grid to nodeid
-      GO nodeId = AmalgamationFactory::DOFGid2NodeId(grid, blockdim, offset, indexBase);
-
-      size_t nnz = A->getNumEntriesInLocalRow(row);
-      Teuchos::ArrayView<const LO> indices;
-      Teuchos::ArrayView<const SC> vals;
-      A->getLocalRowView(row, indices, vals);
-
-      RCP<std::vector<GO>> cnodeIds = Teuchos::rcp(new std::vector<GO>);  // global column block ids
-      LO realnnz                    = 0;
-      for (LO col = 0; col < Teuchos::as<LO>(nnz); col++) {
-        GO gcid = colMap->getGlobalElement(indices[col]);  // global column id
-
-        if (vals[col] != STS::zero()) {
-          GO cnodeId = AmalgamationFactory::DOFGid2NodeId(gcid, blockdim, offset, indexBase);
-          cnodeIds->push_back(cnodeId);
-          realnnz++;  // increment number of nnz in matrix row
-          if (grid == gcid) bIsDiagonalEntry = true;
-        }
-      }
-
-      if (realnnz == 1 && bIsDiagonalEntry == true) {
-        LO lNodeId = nodeMap->getLocalElement(nodeId);
-        numberDirichletRowsPerNode[lNodeId] += 1;             // increment Dirichlet row counter associated with lNodeId
-        if (numberDirichletRowsPerNode[lNodeId] == blockdim)  // mark full Dirichlet nodes
-          amalgBoundaryNodes[lNodeId] = true;
-      }
-
-      Teuchos::ArrayRCP<GO> arr_cnodeIds = Teuchos::arcp(cnodeIds);
-
-      if (arr_cnodeIds.size() > 0)
-        crsGraph->insertGlobalIndices(nodeId, arr_cnodeIds());
-    }
-    // fill matrix graph
-    crsGraph->fillComplete(nodeMap, nodeMap);
-
-    // 5) create MueLu Graph object
-    RCP<LWGraph> graph = rcp(new LWGraph(crsGraph, "amalgamated graph of A"));
-
-    // Detect and record rows that correspond to Dirichlet boundary conditions
-    graph->SetBoundaryNodeMap(amalgBoundaryNodes);
-
-    if (GetVerbLevel() & Statistics1) {
-      GO numLocalBoundaryNodes  = 0;
-      GO numGlobalBoundaryNodes = 0;
-      for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
-        if (amalgBoundaryNodes(i))
-          numLocalBoundaryNodes++;
-      RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
-      MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
-      GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
-    }
-
-    // 6) store results in Level
-    // graph->SetBoundaryNodeMap(gBoundaryNodeMap);
-    Set(currentLevel, "DofsPerNode", blockdim);
-    Set(currentLevel, "Graph", graph);
-
-  }  // if (doExperimentalWrap) ... else ...
-
-}  // Build
-
-template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildKokkos(Level& currentLevel) const {
-  FactoryMonitor m(*this, "BuildKokkos", currentLevel);
-
-  typedef Teuchos::ScalarTraits<SC> STS;
-  typedef typename STS::magnitudeType real_type;
-  typedef Xpetra::MultiVector<real_type, LO, GO, NO> RealValuedMultiVector;
-  typedef Xpetra::MultiVectorFactory<real_type, LO, GO, NO> RealValuedMultiVectorFactory;
-
-  if (predrop_ != Teuchos::null)
-    GetOStream(Parameters0) << predrop_->description();
-
-  RCP<Matrix> realA              = Get<RCP<Matrix>>(currentLevel, "A");
-  RCP<AmalgamationInfo> amalInfo = Get<RCP<AmalgamationInfo>>(currentLevel, "UnAmalgamationInfo");
-  const ParameterList& pL        = GetParameterList();
-  bool doExperimentalWrap        = pL.get<bool>("lightweight wrap");
-
-  GetOStream(Parameters0) << "lightweight wrap = " << doExperimentalWrap << std::endl;
-  std::string algo                         = pL.get<std::string>("aggregation: drop scheme");
-  const bool aggregationMayCreateDirichlet = pL.get<bool>("aggregation: dropping may create Dirichlet");
-
-  RCP<RealValuedMultiVector> Coords;
-  RCP<Matrix> A;
-
-  bool use_block_algorithm   = false;
-  LO interleaved_blocksize   = as<LO>(pL.get<int>("aggregation: block diagonal: interleaved blocksize"));
-  bool useSignedClassicalRS  = false;
-  bool useSignedClassicalSA  = false;
-  bool generateColoringGraph = false;
-
-  // NOTE:  If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it
-  // in the block diagonalization). So we'll clobber the rowSumTol with -1.0 in this case
-  typename STS::magnitudeType rowSumTol = as<typename STS::magnitudeType>(pL.get<double>("aggregation: row sum drop tol"));
-
-  RCP<LocalOrdinalVector> ghostedBlockNumber;
-  ArrayRCP<const LO> g_block_id;
-
-  if (algo == "distance laplacian") {
-    // Grab the coordinates for distance laplacian
-    Coords = Get<RCP<RealValuedMultiVector>>(currentLevel, "Coordinates");
-    A      = realA;
-  } else if (algo == "signed classical sa") {
-    useSignedClassicalSA = true;
-    algo                 = "classical";
-    A                    = realA;
-  } else if (algo == "signed classical" || algo == "block diagonal colored signed classical" || algo == "block diagonal signed classical") {
-    useSignedClassicalRS = true;
-    //      if(realA->GetFixedBlockSize() > 1) {
-    RCP<LocalOrdinalVector> BlockNumber = Get<RCP<LocalOrdinalVector>>(currentLevel, "BlockNumber");
-    // Ghost the column block numbers if we need to
-    RCP<const Import> importer = realA->getCrsGraph()->getImporter();
-    if (!importer.is_null()) {
-      SubFactoryMonitor m1(*this, "Block Number import", currentLevel);
-      ghostedBlockNumber = Xpetra::VectorFactory<LO, LO, GO, NO>::Build(importer->getTargetMap());
-      ghostedBlockNumber->doImport(*BlockNumber, *importer, Xpetra::INSERT);
-    } else {
-      ghostedBlockNumber = BlockNumber;
-    }
-    g_block_id = ghostedBlockNumber->getData(0);
-    //      }
-    if (algo == "block diagonal colored signed classical")
-      generateColoringGraph = true;
-    algo = "classical";
-    A    = realA;
-
-  } else if (algo == "block diagonal") {
-    // Handle the "block diagonal" filtering and then leave
-    BlockDiagonalize(currentLevel, realA, false);
-    return;
-  } else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian") {
-    // Handle the "block diagonal" filtering, and then continue onward
-    use_block_algorithm        = true;
-    RCP<Matrix> filteredMatrix = BlockDiagonalize(currentLevel, realA, true);
-    if (algo == "block diagonal distance laplacian") {
-      // We now need to expand the coordinates by the interleaved blocksize
-      RCP<RealValuedMultiVector> OldCoords = Get<RCP<RealValuedMultiVector>>(currentLevel, "Coordinates");
-      if (OldCoords->getLocalLength() != realA->getLocalNumRows()) {
-        LO dim = (LO)OldCoords->getNumVectors();
-        Coords = RealValuedMultiVectorFactory::Build(realA->getRowMap(), dim);
-        for (LO k = 0; k < dim; k++) {
-          ArrayRCP<const real_type> old_vec = OldCoords->getData(k);
-          ArrayRCP<real_type> new_vec       = Coords->getDataNonConst(k);
-          for (LO i = 0; i < (LO)OldCoords->getLocalLength(); i++) {
-            LO new_base = i * dim;
-            for (LO j = 0; j < interleaved_blocksize; j++)
-              new_vec[new_base + j] = old_vec[i];
-          }
-        }
-      } else {
-        Coords = OldCoords;
-      }
-      algo = "distance laplacian";
-    } else if (algo == "block diagonal classical") {
-      algo = "classical";
-    }
-    // All cases
-    A         = filteredMatrix;
-    rowSumTol = -1.0;
-  } else {
-    A = realA;
-  }
-
-  // Distance Laplacian weights
-  Array<double> dlap_weights = pL.get<Array<double>>("aggregation: distance laplacian directional weights");
-  enum { NO_WEIGHTS = 0,
-         SINGLE_WEIGHTS,
-         BLOCK_WEIGHTS };
-  int use_dlap_weights = NO_WEIGHTS;
-  if (algo == "distance laplacian") {
-    LO dim = (LO)Coords->getNumVectors();
-    // If anything isn't 1.0 we need to turn on the weighting
-    bool non_unity = false;
-    for (LO i = 0; !non_unity && i < (LO)dlap_weights.size(); i++) {
-      if (dlap_weights[i] != 1.0) {
-        non_unity = true;
-      }
-    }
-    if (non_unity) {
-      LO blocksize = use_block_algorithm ? as<LO>(pL.get<int>("aggregation: block diagonal: interleaved blocksize")) : 1;
-      if ((LO)dlap_weights.size() == dim)
-        use_dlap_weights = SINGLE_WEIGHTS;
-      else if ((LO)dlap_weights.size() == blocksize * dim)
-        use_dlap_weights = BLOCK_WEIGHTS;
-      else {
-        TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError,
-                                   "length of 'aggregation: distance laplacian directional weights' must equal the coordinate dimension OR the coordinate dimension times the blocksize");
-      }
-      if (GetVerbLevel() & Statistics1)
-        GetOStream(Statistics1) << "Using distance laplacian weights: " << dlap_weights << std::endl;
-    }
-  }
-
-  // decide wether to use the fast-track code path for standard maps or the somewhat slower
-  // code path for non-standard maps
-  /*bool bNonStandardMaps = false;
-  if (A->IsView("stridedMaps") == true) {
-    Teuchos::RCP<const Map> myMap = A->getRowMap("stridedMaps");
-    Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
-    TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
-    if (strMap->getStridedBlockId() != -1 || strMap->getOffset() > 0)
-      bNonStandardMaps = true;
-  }*/
-
-  if (doExperimentalWrap) {
-    TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm");
-    TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)");
-
-    SC threshold;
-    // If we're doing the ML-style halving of the drop tol at each level, we do that here.
-    if (pL.get<bool>("aggregation: use ml scaling of drop tol"))
-      threshold = pL.get<double>("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID());
-    else
-      threshold = as<SC>(pL.get<double>("aggregation: drop tol"));
-
-    std::string distanceLaplacianAlgoStr = pL.get<std::string>("aggregation: distance laplacian algo");
-    std::string classicalAlgoStr         = pL.get<std::string>("aggregation: classical algo");
-    real_type realThreshold              = STS::magnitude(threshold);  // CMS: Rename this to "magnitude threshold" sometime
-
-    ////////////////////////////////////////////////////
-    // Remove this bit once we are confident that cut-based dropping works.
-#ifdef HAVE_MUELU_DEBUG
-    int distanceLaplacianCutVerbose = 0;
-#endif
-#ifdef DJS_READ_ENV_VARIABLES
-    if (getenv("MUELU_DROP_TOLERANCE_MODE")) {
-      distanceLaplacianAlgoStr = std::string(getenv("MUELU_DROP_TOLERANCE_MODE"));
-    }
-
-    if (getenv("MUELU_DROP_TOLERANCE_THRESHOLD")) {
-      auto tmp      = atoi(getenv("MUELU_DROP_TOLERANCE_THRESHOLD"));
-      realThreshold = 1e-4 * tmp;
-    }
-
-#ifdef HAVE_MUELU_DEBUG
-    if (getenv("MUELU_DROP_TOLERANCE_VERBOSE")) {
-      distanceLaplacianCutVerbose = atoi(getenv("MUELU_DROP_TOLERANCE_VERBOSE"));
-    }
-#endif
-#endif
-    ////////////////////////////////////////////////////
-
-    enum decisionAlgoType { defaultAlgo,
-                            unscaled_cut,
-                            scaled_cut,
-                            scaled_cut_symmetric };
-
-    decisionAlgoType distanceLaplacianAlgo = defaultAlgo;
-    decisionAlgoType classicalAlgo         = defaultAlgo;
-    if (algo == "distance laplacian") {
-      if (distanceLaplacianAlgoStr == "default")
-        distanceLaplacianAlgo = defaultAlgo;
-      else if (distanceLaplacianAlgoStr == "unscaled cut")
-        distanceLaplacianAlgo = unscaled_cut;
-      else if (distanceLaplacianAlgoStr == "scaled cut")
-        distanceLaplacianAlgo = scaled_cut;
-      else if (distanceLaplacianAlgoStr == "scaled cut symmetric")
-        distanceLaplacianAlgo = scaled_cut_symmetric;
-      else
-        TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" << distanceLaplacianAlgoStr << "\"");
-      GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
-    } else if (algo == "classical") {
-      if (classicalAlgoStr == "default")
-        classicalAlgo = defaultAlgo;
-      else if (classicalAlgoStr == "unscaled cut")
-        classicalAlgo = unscaled_cut;
-      else if (classicalAlgoStr == "scaled cut")
-        classicalAlgo = scaled_cut;
-      else
-        TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: classical algo\" must be one of (default|unscaled cut|scaled cut), not \"" << classicalAlgoStr << "\"");
-      GetOStream(Runtime0) << "algorithm = \"" << algo << "\" classical algorithm = \"" << classicalAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
-
-    } else
-      GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
-    Set<bool>(currentLevel, "Filtering", (threshold != STS::zero()));
-
-    const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
-
-    // NOTE: We don't support signed classical RS or SA with cut drop at present
-    TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalRS && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical aggregation");
-    TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalSA && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical sa aggregation");
-
-    GO numDropped = 0, numTotal = 0;
-    std::string graphType = "unamalgamated";  // for description purposes only
-
-    /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
-     BlockSize is the number of storage blocks that must kept together during the amalgamation process.
-
-     Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
-
-     numPDEs = BlockSize * storageblocksize.
-
-     If numPDEs==1
-       Matrix is point storage (classical CRS storage).  storageblocksize=1 and BlockSize=1
-       No other values makes sense.
-
-     If numPDEs>1
-       If matrix uses point storage, then storageblocksize=1  and BlockSize=numPDEs.
-       If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n.
-       Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested.
-    */
-    TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
-    const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
-
-    /************************** RS or SA-style Classical Dropping (and variants) **************************/
-    if (algo == "classical") {
-      if (predrop_ == null) {
-        // ap: this is a hack: had to declare predrop_ as mutable
-        predrop_ = rcp(new PreDropFunctionConstVal(threshold));
-      }
-
-      if (predrop_ != null) {
-        RCP<PreDropFunctionConstVal> predropConstVal = rcp_dynamic_cast<PreDropFunctionConstVal>(predrop_);
-        TEUCHOS_TEST_FOR_EXCEPTION(predropConstVal == Teuchos::null, Exceptions::BadCast,
-                                   "MueLu::CoalesceFactory::Build: cast to PreDropFunctionConstVal failed.");
-        // If a user provided a predrop function, it overwrites the XML threshold parameter
-        SC newt = predropConstVal->GetThreshold();
-        if (newt != threshold) {
-          GetOStream(Warnings0) << "switching threshold parameter from " << threshold << " (list) to " << newt << " (user function" << std::endl;
-          threshold = newt;
-        }
-      }
-      // At this points we either have
-      //     (predrop_ != null)
-      // Therefore, it is sufficient to check only threshold
-      if (BlockSize == 1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) {
-        // Case 1:  scalar problem, no dropping => just use matrix graph
-        RCP<LWGraph> graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A"));
-        // Detect and record rows that correspond to Dirichlet boundary conditions
-        auto boundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
-        if (rowSumTol > 0.)
-          Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes);
-
-        graph->SetBoundaryNodeMap(boundaryNodes);
-        numTotal = A->getLocalNumEntries();
-
-        if (GetVerbLevel() & Statistics1) {
-          GO numLocalBoundaryNodes  = 0;
-          GO numGlobalBoundaryNodes = 0;
-          for (size_t i = 0; i < boundaryNodes.size(); ++i)
-            if (boundaryNodes[i])
-              numLocalBoundaryNodes++;
-          RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
-          MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
-          GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
-        }
-
-        Set(currentLevel, "DofsPerNode", 1);
-        Set(currentLevel, "Graph", graph);
-
-      } else if ((BlockSize == 1 && threshold != STS::zero()) ||
-                 (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) ||
-                 (BlockSize == 1 && useSignedClassicalRS) ||
-                 (BlockSize == 1 && useSignedClassicalSA)) {
-        // Case 2:  scalar problem with dropping => record the column indices of undropped entries, but still use original
-        //                                          graph's map information, e.g., whether index is local
-        // OR a matrix without a CrsGraph
-
-        // allocate space for the local graph
-        typename LWGraph::row_type::non_const_type rows("rows", A->getLocalNumRows() + 1);
-        typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
-
-        using MT = typename STS::magnitudeType;
-        RCP<Vector> ghostedDiag;
-        ArrayRCP<const SC> ghostedDiagVals;
-        ArrayRCP<const MT> negMaxOffDiagonal;
-        // RS style needs the max negative off-diagonal, SA style needs the diagonal
-        if (useSignedClassicalRS) {
-          if (ghostedBlockNumber.is_null()) {
-            negMaxOffDiagonal = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixMaxMinusOffDiagonal(*A);
-            if (GetVerbLevel() & Statistics1)
-              GetOStream(Statistics1) << "Calculated max point off-diagonal" << std::endl;
-          } else {
-            negMaxOffDiagonal = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixMaxMinusOffDiagonal(*A, *ghostedBlockNumber);
-            if (GetVerbLevel() & Statistics1)
-              GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl;
-          }
-        } else {
-          ghostedDiag     = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixOverlappedDiagonal(*A);
-          ghostedDiagVals = ghostedDiag->getData(0);
-        }
-        auto boundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
-        if (rowSumTol > 0.) {
-          if (ghostedBlockNumber.is_null()) {
-            if (GetVerbLevel() & Statistics1)
-              GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl;
-            Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes);
-          } else {
-            if (GetVerbLevel() & Statistics1)
-              GetOStream(Statistics1) << "Applying block row sum criterion." << std::endl;
-            Utilities::ApplyRowSumCriterionHost(*A, *ghostedBlockNumber, rowSumTol, boundaryNodes);
-          }
-        }
-
-        LO realnnz = 0;
-        rows(0)    = 0;
-	for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
-          size_t nnz          = A->getNumEntriesInLocalRow(row);
-          bool rowIsDirichlet = boundaryNodes[row];
-          ArrayView<const LO> indices;
-          ArrayView<const SC> vals;
-          A->getLocalRowView(row, indices, vals);
-
-          if (classicalAlgo == defaultAlgo) {
-            // FIXME the current predrop function uses the following
-            // FIXME    if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid )
-            // FIXME but the threshold doesn't take into account the rows' diagonal entries
-            // FIXME For now, hardwiring the dropping in here
-
-            LO rownnz = 0;
-            if (useSignedClassicalRS) {
-              // Signed classical RS style
-              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
-                LO col         = indices[colID];
-                MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]);
-                MT neg_aij     = -STS::real(vals[colID]);
-                /*                  if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID],
-                                     g_block_id.is_null() ? -1 :  g_block_id[row],
-                                     g_block_id.is_null() ? -1 :  g_block_id[col],
-                                     neg_aij, max_neg_aik);*/
-                if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) {
-                  columns[realnnz++] = col;
-                  rownnz++;
-                } else
-                  numDropped++;
-              }
-              rows(row + 1) = realnnz;
-            } else if (useSignedClassicalSA) {
-              // Signed classical SA style
-              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
-                LO col = indices[colID];
-
-                bool is_nonpositive = STS::real(vals[colID]) <= 0;
-                MT aiiajj           = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);                        // eps^2*|a_ii|*|a_jj|
-                MT aij              = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID]));  // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0
-                /*
-                if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID],
-                                     vals[colID],aij, aiiajj);
-                */
-
-                if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
-                  columns(realnnz++) = col;
-                  rownnz++;
-                } else
-                  numDropped++;
-              }
-              rows[row + 1] = realnnz;
-            } else {
-              // Standard abs classical
-              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
-                LO col    = indices[colID];
-                MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
-                MT aij    = STS::magnitude(vals[colID] * vals[colID]);                                            // |a_ij|^2
-
-                if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
-                  columns(realnnz++) = col;
-                  rownnz++;
-                } else
-                  numDropped++;
-              }
-              rows(row + 1) = realnnz;
-            }
-          } else {
-	    /* Cut Algorithm */
-            // CMS
-            using DropTol = Details::DropTol<real_type, LO>;
-            std::vector<DropTol> drop_vec;
-	    drop_vec.reserve(nnz);
-            const real_type zero = Teuchos::ScalarTraits<real_type>::zero();
-            const real_type one  = Teuchos::ScalarTraits<real_type>::one();
-            LO rownnz            = 0;
-            // NOTE: This probably needs to be fixed for rowsum
-
-            // find magnitudes
-            for (LO colID = 0; colID < (LO)nnz; colID++) {
-              LO col = indices[colID];
-              if (row == col) {
-                drop_vec.emplace_back(zero, one, colID, false);
-                continue;
-              }
-
-              // Don't aggregate boundaries
-              if (boundaryNodes[colID]) continue;
-              typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
-              typename STS::magnitudeType aij    = STS::magnitude(vals[colID] * vals[colID]);                                            // |a_ij|^2
-              drop_vec.emplace_back(aij, aiiajj, colID, false);
-            }
-
-            const size_t n = drop_vec.size();
-
-            if (classicalAlgo == unscaled_cut) {
-              std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
-                return a.val > b.val;
-              });
-
-              bool drop = false;
-              for (size_t i = 1; i < n; ++i) {
-                if (!drop) {
-                  auto const& x = drop_vec[i - 1];
-                  auto const& y = drop_vec[i];
-                  auto a        = x.val;
-                  auto b        = y.val;
-                  if (a > realThreshold * b) {
-                    drop = true;
-#ifdef HAVE_MUELU_DEBUG
-                    if (distanceLaplacianCutVerbose) {
-                      std::cout << "DJS: KEEP, N, ROW:  " << i + 1 << ", " << n << ", " << row << std::endl;
-                    }
-#endif
-                  }
-                }
-                drop_vec[i].drop = drop;
-              }
-            } else if (classicalAlgo == scaled_cut) {
-              std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
-                return a.val / a.diag > b.val / b.diag;
-              });
-              bool drop = false;
-              //                  printf("[%d] Scaled Cut: ",(int)row);
-              //                  printf("%3d(%4s) ",indices[drop_vec[0].col],"keep");
-              for (size_t i = 1; i < n; ++i) {
-                if (!drop) {
-                  auto const& x = drop_vec[i - 1];
-                  auto const& y = drop_vec[i];
-                  auto a        = x.val / x.diag;
-                  auto b        = y.val / y.diag;
-                  if (a > realThreshold * b) {
-                    drop = true;
-
-#ifdef HAVE_MUELU_DEBUG
-                    if (distanceLaplacianCutVerbose) {
-                      std::cout << "DJS: KEEP, N, ROW:  " << i + 1 << ", " << n << ", " << row << std::endl;
-                    }
-#endif
-                  }
-                  //                      printf("%3d(%4s) ",indices[drop_vec[i].col],drop?"drop":"keep");
-                }
-                drop_vec[i].drop = drop;
-              }
-              //                  printf("\n");
-            }
-            std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
-              return a.col < b.col;
-            });
-
-            for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) {
-              LO col = indices[drop_vec[idxID].col];
-              // don't drop diagonal
-              if (row == col) {
-                columns[realnnz++] = col;
-                rownnz++;
-                continue;
-              }
-
-              if (!drop_vec[idxID].drop) {
-                columns[realnnz++] = col;
-                rownnz++;
-              } else {
-                numDropped++;
-              }
-            }
-            // CMS
-            rows[row + 1] = realnnz;
-          }
-        }  // end for row
+	}
 
         numTotal = A->getLocalNumEntries();
 
@@ -3285,7 +1655,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildKokkos
 
   }  // if (doExperimentalWrap) ... else ...
 
-}  // BuildKokkos
+} // Build
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::MergeRows(const Matrix& A, const LO row, Array<LO>& cols, const Array<LO>& translation) const {

From 7c78025bf66884fd8b788bac2ae352e871241e7d Mon Sep 17 00:00:00 2001
From: Ian Halim <ihalim@ascicgpu031.sandia.gov>
Date: Wed, 24 Jul 2024 17:51:23 -0600
Subject: [PATCH 04/25] MueLu: std::complex Replaced With Kokkos::complex

Signed-off-by: Ian Halim <ihalim@ascicgpu031.sandia.gov>
---
 .../MueLu_CoalesceDropFactory_def.hpp         | 475 +++++++++---------
 1 file changed, 241 insertions(+), 234 deletions(-)

diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
index da606ab20ff6..ad5895e2e41b 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
@@ -475,10 +475,10 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
         typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
 
         using MT = typename STS::magnitudeType;
-	RCP<Vector> ghostedDiag;
+        RCP<Vector> ghostedDiag;
         ArrayRCP<const SC> ghostedDiagVals;
         ArrayRCP<const MT> negMaxOffDiagonal;
-	// RS style needs the max negative off-diagonal, SA style needs the diagonal
+        // RS style needs the max negative off-diagonal, SA style needs the diagonal
         if (useSignedClassicalRS) {
           if (ghostedBlockNumber.is_null()) {
             negMaxOffDiagonal = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixMaxMinusOffDiagonal(*A);
@@ -491,10 +491,12 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
           }
         } else {
           ghostedDiag     = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixOverlappedDiagonal(*A);
-          ghostedDiagVals = ghostedDiag->getData(0);
-	}
+          if(classicalAlgo == defaultAlgo) {
+            ghostedDiagVals = ghostedDiag->getData(0);
+          }
+        }
         auto boundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
-	if (rowSumTol > 0.) {
+        if (rowSumTol > 0.) {
           if (ghostedBlockNumber.is_null()) {
             if (GetVerbLevel() & Statistics1)
               GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl;
@@ -508,234 +510,239 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 
         LO realnnz = 0;
         rows(0)    = 0;
-	if(classicalAlgo == defaultAlgo) {
-            	SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel);
-		for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
-        		size_t nnz          = A->getNumEntriesInLocalRow(row);
-        		bool rowIsDirichlet = boundaryNodes[row];
-        		ArrayView<const LO> indices;
-        		ArrayView<const SC> vals;
-        		A->getLocalRowView(row, indices, vals);
-
-        		// FIXME the current predrop function uses the following
-        		// FIXME    if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid )
-        		// FIXME but the threshold doesn't take into account the rows' diagonal entries
-        		// FIXME For now, hardwiring the dropping in here
-
-        		LO rownnz = 0;
-        		if (useSignedClassicalRS) {
-        			// Signed classical RS style
-            			for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
-                			LO col         = indices[colID];
-                			MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]);
-                			MT neg_aij     = -STS::real(vals[colID]);
-                			/*                  if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID],
-                               		 		    g_block_id.is_null() ? -1 :  g_block_id[row],
-                              		 		    g_block_id.is_null() ? -1 :  g_block_id[col],
-                               			     	    neg_aij, max_neg_aik);*/
-                			if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) {
-                  				columns[realnnz++] = col;
-                  				rownnz++;
-                			} else
-                  			numDropped++;
-              			}
-              			rows(row + 1) = realnnz;
-            		} else if (useSignedClassicalSA) {
-              			// Signed classical SA style
-              			for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
-                			LO col = indices[colID];
-
-			                bool is_nonpositive = STS::real(vals[colID]) <= 0;
-                			MT aiiajj           = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);                        // eps^2*|a_ii|*|a_jj|
-                			MT aij              = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID]));  // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0
-                			/*
-                			if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID],
-                                			     vals[colID],aij, aiiajj);
-                			*/
-
-                			if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
-                  				columns(realnnz++) = col;
-                  				rownnz++;
-                			} else
-                			  numDropped++;
-              			}
-              			rows[row + 1] = realnnz;
-            		} else {
-              			// Standard abs classical
-              			for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
-                			LO col    = indices[colID];
-                			MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
-                			MT aij    = STS::magnitude(vals[colID] * vals[colID]);                                            // |a_ij|^2
-
-                			if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
-                  				columns(realnnz++) = col;
-                  				rownnz++;
-                			} else
-                  		 	  numDropped++;
-              			}
-              			rows(row + 1) = realnnz;
-            		}
-        	}  // end for row
-	}
-	else {
-            	SubFactoryMonitor m1(*this, "Cut Drop", currentLevel);
-		using ExecSpace = typename Node::execution_space;
-		using TeamPol = Kokkos::TeamPolicy<ExecSpace>;
-		using TeamMem = typename TeamPol::member_type;
-		
-		//move from host to device
-		ArrayView<const SC>  ghostedDiagValsArrayView = ghostedDiagVals.view(ghostedDiagVals.lowerOffset(), ghostedDiagVals.size());
-		Kokkos::View<const SC*, ExecSpace> ghostedDiagValsView = Kokkos::Compat::getKokkosViewDeepCopy<ExecSpace>(ghostedDiagValsArrayView);
-		auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes);
-		
-		auto At = Utilities::Op2TpetraCrs(A);
-		auto A_device = At->getLocalMatrixDevice();
-		
-		int algorithm = classicalAlgo;
-		Kokkos::View<LO*, ExecSpace>rownnzView("rownnzView", A_device.numRows());
-		auto drop_views = Kokkos::View<bool*, ExecSpace>("drop_views", A_device.nnz());
-		auto index_views = Kokkos::View<size_t*, ExecSpace>("index_views", A_device.nnz());
-
-		Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) {
-			LO row = teamMember.league_rank();
-			auto rowView = A_device.row(row);
-			size_t nnz = rowView.length;
-
-			size_t n = 0;
-			auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
-			auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
-
-			//find magnitudes
-			Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) {
-				index_view(colID) = colID;
-				LO col = rowView.colidx(colID);
-				//ignore diagonals for now, they are checked again later
-				if(row == col) {
-					drop_view(colID) = true;
-					count++;
-				}
-				//Don't aggregate boundaries
-				else if(boundaryNodesDevice(colID)) {
-					drop_view(colID) = true;
-				}
-				else {
-					drop_view(colID) = false;
-					count++;
-				}
-			}, n);
-
-			size_t dropStart = n;
-			if (algorithm == unscaled_cut) {
-				//push diagonals and boundaries to the right, sort everything else by aij on the left
-				Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) {
-					if(drop_view(x) || drop_view(y)) {
-						return drop_view(x) < drop_view(y);
-					}
-					else {
-						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x) * rowView.value(x))));
-						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y) * rowView.value(y))));
-						return x_aij > y_aij;
-					}
-				});
-
-				//find index where dropping starts
-				Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) {
-					auto const& x = index_view(i - 1);
-					auto const& y = index_view(i);
-					typename STS::magnitudeType x_aij = 0;
-					typename STS::magnitudeType y_aij = 0;
-					if(!drop_view(x)) {
-						x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x) * rowView.value(x))));
-					}
-					if(!drop_view(y)) {
-						y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y) * rowView.value(y))));
-					}
-
-					if(x_aij > realThreshold * y_aij) {
-						if(i < min) {
-							min = i;
-						}
-					}
-				}, Kokkos::Min<size_t>(dropStart));
-          	 	} else if (algorithm == scaled_cut) {
-				//push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left
-				Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) {
-					if(drop_view(x) || drop_view(y)) {
-						return drop_view(x) < drop_view(y);
-					}
-					else {
-						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x) * rowView.value(x))));
-						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y) * rowView.value(y))));
-						typename STS::magnitudeType x_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row))));
-						typename STS::magnitudeType y_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row))));
-						return x_aij / x_aiiajj > y_aij / y_aiiajj;
-					}
-				});
-
-				//find index where dropping starts
-				Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) {
-					auto const& x = index_view(i - 1);
-					auto const& y = index_view(i);
-					typename STS::magnitudeType x_val = 0;
-					typename STS::magnitudeType y_val = 0;
-					if(!drop_view(x)) {
-						typename STS::magnitudeType x_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(x) * rowView.value(x))));
-						typename STS::magnitudeType x_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row))));
-						x_val = x_aij / x_aiiajj;
-					}
-					if(!drop_view(y)) {
-						typename STS::magnitudeType y_aij    = static_cast<SC>(std::fabs(static_cast<double>(rowView.value(y) * rowView.value(y))));
-						typename STS::magnitudeType y_aiiajj = static_cast<SC>(std::fabs(static_cast<double>(threshold * threshold * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row))));
-						y_val = y_aij / y_aiiajj;
-					}
-
-					if(x_val > realThreshold * y_val) {
-						if(i < min) {
-							min = i;
-						}
-					}
-				}, Kokkos::Min<size_t>(dropStart));
-	  	 	}
-
-			//drop everything to the right of where values stop passing threshold 
-			if(dropStart < n) {
-				Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) {
-					drop_view(index_view(i)) = true;
-				});
-			}
-
-		  	LO rownnz = 0;
-		  	GO rowDropped = 0;
-		  	Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) {
-				LO col = rowView.colidx(idxID);
-				//don't drop diagonal
-				if(row == col || !drop_view(idxID)) {
-					keep++;
-				}
-				else {
-					rowView.colidx(idxID) = -1;
-					drop++;
-				}
-	  	 	}, rownnz, rowDropped);
-
-		  	globalnnz += rownnz;
-		  	totalDropped += rowDropped;
-			rownnzView(row) = rownnz;
-		}, realnnz, numDropped);
-	
-		//update column indices so that kept indices are aligned to the left for subview that happens later on
-		auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns);
-		Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1);
-		Kokkos::deep_copy(columns, columnsDevice);
-		
-		//update row indices by adding up new # of nnz in each row
-		auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows);
-		Kokkos::parallel_scan(Kokkos::RangePolicy<ExecSpace>(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) {
-			partial_sum += rownnzView(i);
-			if(is_final) rowsDevice(i+1) = partial_sum;
-		});
-		Kokkos::deep_copy(rows, rowsDevice);
-	}
+        if(classicalAlgo == defaultAlgo) {
+          SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel);
+          for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
+            size_t nnz          = A->getNumEntriesInLocalRow(row);
+            bool rowIsDirichlet = boundaryNodes[row];
+            ArrayView<const LO> indices;
+            ArrayView<const SC> vals;
+            A->getLocalRowView(row, indices, vals);
+
+            // FIXME the current predrop function uses the following
+            // FIXME    if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid )
+            // FIXME but the threshold doesn't take into account the rows' diagonal entries
+            // FIXME For now, hardwiring the dropping in here
+
+            LO rownnz = 0;
+            if (useSignedClassicalRS) {
+              // Signed classical RS style
+              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                LO col         = indices[colID];
+                MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]);
+                MT neg_aij     = -STS::real(vals[colID]);
+                /*                  if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID],
+                                     g_block_id.is_null() ? -1 :  g_block_id[row],
+                                     g_block_id.is_null() ? -1 :  g_block_id[col],
+                                     neg_aij, max_neg_aik);*/
+                if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) {
+                  columns[realnnz++] = col;
+                  rownnz++;
+                } else
+                  numDropped++;
+              }
+              rows(row + 1) = realnnz;
+            } else if (useSignedClassicalSA) {
+              // Signed classical SA style
+              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                LO col = indices[colID];
+
+                bool is_nonpositive = STS::real(vals[colID]) <= 0;
+                MT aiiajj           = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);                        // eps^2*|a_ii|*|a_jj|
+                MT aij              = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID]));  // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0
+                /*
+                if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID],
+                                     vals[colID],aij, aiiajj);
+                */
+
+                if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
+                  columns(realnnz++) = col;
+                  rownnz++;
+                } else
+                  numDropped++;
+              }
+              rows[row + 1] = realnnz;
+            } else {
+              // Standard abs classical
+              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                LO col    = indices[colID];
+                MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
+                MT aij    = STS::magnitude(vals[colID] * vals[colID]);                                            // |a_ij|^2
+
+                if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
+                  columns(realnnz++) = col;
+                  rownnz++;
+                } else
+                  numDropped++;
+              }
+              rows(row + 1) = realnnz;
+            }
+          }  // end for row
+        }
+        else {
+          /* Cut Algorithm */
+          SubFactoryMonitor m1(*this, "Cut Drop", currentLevel);
+          using ExecSpace = typename Node::execution_space;
+          using TeamPol = Kokkos::TeamPolicy<ExecSpace>;
+          using TeamMem = typename TeamPol::member_type;
+          using ATS = Kokkos::ArithTraits<Scalar>;
+          using impl_scalar_type = typename ATS::val_type;
+          using implATS = Kokkos::ArithTraits<impl_scalar_type>;
+
+          //move from host to device
+          auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0);
+          auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes);
+          auto thresholdKokkos = static_cast<impl_scalar_type>(threshold);
+          auto realThresholdKokkos = implATS::magnitude(thresholdKokkos);
+
+          auto At = Utilities::Op2TpetraCrs(A);
+          auto A_device = At->getLocalMatrixDevice();
+
+          int algorithm = classicalAlgo;
+          Kokkos::View<LO*, ExecSpace>rownnzView("rownnzView", A_device.numRows());
+          auto drop_views = Kokkos::View<bool*, ExecSpace>("drop_views", A_device.nnz());
+          auto index_views = Kokkos::View<size_t*, ExecSpace>("index_views", A_device.nnz());
+
+          Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) {
+            LO row = teamMember.league_rank();
+            auto rowView = A_device.row(row);
+            size_t nnz = rowView.length;
+
+            size_t n = 0;
+            auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
+            auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
+
+            //find magnitudes
+            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) {
+              index_view(colID) = colID;
+              LO col = rowView.colidx(colID);
+              //ignore diagonals for now, they are checked again later
+              if(row == col) {
+                drop_view(colID) = true;
+                count++;
+              }
+              //Don't aggregate boundaries
+              else if(boundaryNodesDevice(colID)) {
+                drop_view(colID) = true;
+              }
+              else {
+                drop_view(colID) = false;
+                count++;
+              }
+            }, n);
+
+            size_t dropStart = n;
+            if (algorithm == unscaled_cut) {
+              //push diagonals and boundaries to the right, sort everything else by aij on the left
+              Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
+                if(drop_view(x) || drop_view(y)) {
+                  return drop_view(x) < drop_view(y);
+                }
+                else {
+                  auto x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
+                  auto y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
+                  return x_aij > y_aij;
+                }
+              });
+
+              //find index where dropping starts
+              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) {
+                auto const& x = index_view(i - 1);
+                auto const& y = index_view(i);
+                typename implATS::magnitudeType x_aij = 0;
+                typename implATS::magnitudeType y_aij = 0;
+                if(!drop_view(x)) {
+                  x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
+                }
+                if(!drop_view(y)) {
+                  y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
+                }
+
+                if(x_aij > realThresholdKokkos * y_aij) {
+                  if(i < min) {
+                    min = i;
+                  }
+                }
+              }, Kokkos::Min<size_t>(dropStart));
+            } else if (algorithm == scaled_cut) {
+              //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left
+              Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
+                if(drop_view(x) || drop_view(y)) {
+                  return drop_view(x) < drop_view(y);
+                }
+                else {
+                  auto x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
+                  auto y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
+                  auto x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
+                  auto y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
+                  return (x_aij / x_aiiajj) > (y_aij / y_aiiajj);
+                }
+              });
+
+              //find index where dropping starts
+              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) {
+                auto const& x = index_view(i - 1);
+                auto const& y = index_view(i);
+                typename implATS::magnitudeType x_val = 0;
+                typename implATS::magnitudeType y_val = 0;
+                if(!drop_view(x)) {
+                  typename implATS::magnitudeType x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
+                  typename implATS::magnitudeType x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
+                  x_val = x_aij / x_aiiajj;
+                }
+                if(!drop_view(y)) {
+                  typename implATS::magnitudeType y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
+                  typename implATS::magnitudeType y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
+                  y_val = y_aij / y_aiiajj;
+                }
+
+                if(x_val > realThresholdKokkos * y_val) {
+                  if(i < min) {
+                    min = i;
+                  }
+                }
+              }, Kokkos::Min<size_t>(dropStart));
+            }
+
+            //drop everything to the right of where values stop passing threshold
+            if(dropStart < n) {
+              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) {
+                drop_view(index_view(i)) = true;
+              });
+            }
+
+            LO rownnz = 0;
+            GO rowDropped = 0;
+            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) {
+              LO col = rowView.colidx(idxID);
+              //don't drop diagonal
+              if(row == col || !drop_view(idxID)) {
+                keep++;
+              }
+              else {
+                rowView.colidx(idxID) = -1;
+                drop++;
+              }
+            }, rownnz, rowDropped);
+
+            globalnnz += rownnz;
+            totalDropped += rowDropped;
+            rownnzView(row) = rownnz;
+          }, realnnz, numDropped);
+
+          //update column indices so that kept indices are aligned to the left for subview that happens later on
+          auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns);
+          Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1);
+          Kokkos::deep_copy(columns, columnsDevice);
+
+          //update row indices by adding up new # of nnz in each row
+          auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows);
+          Kokkos::parallel_scan(Kokkos::RangePolicy<ExecSpace>(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) {
+            partial_sum += rownnzView(i);
+            if(is_final) rowsDevice(i+1) = partial_sum;
+          });
+          Kokkos::deep_copy(rows, rowsDevice);
+        }
 
         numTotal = A->getLocalNumEntries();
 
@@ -1655,7 +1662,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 
   }  // if (doExperimentalWrap) ... else ...
 
-} // Build
+}  // Build
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::MergeRows(const Matrix& A, const LO row, Array<LO>& cols, const Array<LO>& translation) const {

From 4dff65a96640d6473d88c9dc0282bb0441f77c77 Mon Sep 17 00:00:00 2001
From: Ian Halim <ihalim@ascicgpu031.sandia.gov>
Date: Thu, 15 Aug 2024 12:40:22 -0600
Subject: [PATCH 05/25] MueLu: Code Review Fixes

Signed-off-by: Ian Halim <ihalim@ascicgpu031.sandia.gov>
---
 .../MueLu_CoalesceDropFactory_def.hpp         | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
index ad5895e2e41b..0431bf011541 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
@@ -117,6 +117,11 @@ struct DropTol {
 };
 }  // namespace Details
 
+enum decisionAlgoType { defaultAlgo,
+                        unscaled_cut,
+                        scaled_cut,
+                        scaled_cut_symmetric };
+
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 RCP<const ParameterList> CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::GetValidParameterList() const {
   RCP<ParameterList> validParamList = rcp(new ParameterList());
@@ -354,11 +359,6 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 #endif
     ////////////////////////////////////////////////////
 
-    enum decisionAlgoType { defaultAlgo,
-                            unscaled_cut,
-                            scaled_cut,
-                            scaled_cut_symmetric };
-
     decisionAlgoType distanceLaplacianAlgo = defaultAlgo;
     decisionAlgoType classicalAlgo         = defaultAlgo;
     if (algo == "distance laplacian") {
@@ -591,24 +591,24 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 
           //move from host to device
           auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0);
-          auto boundaryNodesDevice = Kokkos::create_mirror_view(ExecSpace(), boundaryNodes);
+          auto boundaryNodesDevice = Kokkos::create_mirror_view_and_copy(ExecSpace(), boundaryNodes);
           auto thresholdKokkos = static_cast<impl_scalar_type>(threshold);
           auto realThresholdKokkos = implATS::magnitude(thresholdKokkos);
+          auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns);
 
           auto At = Utilities::Op2TpetraCrs(A);
           auto A_device = At->getLocalMatrixDevice();
 
-          int algorithm = classicalAlgo;
           Kokkos::View<LO*, ExecSpace>rownnzView("rownnzView", A_device.numRows());
           auto drop_views = Kokkos::View<bool*, ExecSpace>("drop_views", A_device.nnz());
           auto index_views = Kokkos::View<size_t*, ExecSpace>("index_views", A_device.nnz());
 
           Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) {
             LO row = teamMember.league_rank();
-            auto rowView = A_device.row(row);
+            auto rowView = A_device.rowConst(row);
             size_t nnz = rowView.length;
 
-            size_t n = 0;
+            size_t dropSize = 0;
             auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
             auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
 
@@ -629,10 +629,10 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
                 drop_view(colID) = false;
                 count++;
               }
-            }, n);
+            }, dropSize);
 
-            size_t dropStart = n;
-            if (algorithm == unscaled_cut) {
+            size_t dropStart = dropSize;
+            if (classicalAlgo == unscaled_cut) {
               //push diagonals and boundaries to the right, sort everything else by aij on the left
               Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
                 if(drop_view(x) || drop_view(y)) {
@@ -646,7 +646,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
               });
 
               //find index where dropping starts
-              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) {
+              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) {
                 auto const& x = index_view(i - 1);
                 auto const& y = index_view(i);
                 typename implATS::magnitudeType x_aij = 0;
@@ -664,7 +664,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
                   }
                 }
               }, Kokkos::Min<size_t>(dropStart));
-            } else if (algorithm == scaled_cut) {
+            } else if (classicalAlgo == scaled_cut) {
               //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left
               Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
                 if(drop_view(x) || drop_view(y)) {
@@ -680,7 +680,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
               });
 
               //find index where dropping starts
-              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, n), [=](size_t i, size_t& min) {
+              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) {
                 auto const& x = index_view(i - 1);
                 auto const& y = index_view(i);
                 typename implATS::magnitudeType x_val = 0;
@@ -705,22 +705,23 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
             }
 
             //drop everything to the right of where values stop passing threshold
-            if(dropStart < n) {
-              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, n), [=](size_t i) {
+            if(dropStart < dropSize) {
+              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, dropSize), [=](size_t i) {
                 drop_view(index_view(i)) = true;
               });
             }
 
             LO rownnz = 0;
             GO rowDropped = 0;
-            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, n), [=](const size_t idxID, LO& keep, GO& drop) {
+            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, dropSize), [=](const size_t idxID, LO& keep, GO& drop) {
               LO col = rowView.colidx(idxID);
               //don't drop diagonal
               if(row == col || !drop_view(idxID)) {
+                columnsDevice(A_device.graph.row_map(row) + idxID) = col;
                 keep++;
               }
               else {
-                rowView.colidx(idxID) = -1;
+                columnsDevice(A_device.graph.row_map(row) + idxID) = -1;
                 drop++;
               }
             }, rownnz, rowDropped);
@@ -731,7 +732,6 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
           }, realnnz, numDropped);
 
           //update column indices so that kept indices are aligned to the left for subview that happens later on
-          auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns);
           Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1);
           Kokkos::deep_copy(columns, columnsDevice);
 

From 18097083fa60054ee56ada1f3afdf71f80c6fcc0 Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Mon, 28 Oct 2024 15:56:58 -0600
Subject: [PATCH 06/25] Add AT2 runner, usage of GenConfig,
 get-changed-packages.sh

Squashing all the terrible commits I made while using the GitHub web
interface. The interface does not seem to have support for signing with
DCO.

Includes changes that modify the event triggers to comply with the new
AT2 specifications, assign an AT2 runner for the job to run on, use
GenConfig to load the environment, and add a prototype call to the
get-changed-trilinos-packages.sh script.

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 65 +++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 7b51bbec8c75..e0478400bf5a 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -12,10 +12,11 @@
 name: "CodeQL: Linear Solvers"
 
 on:
-  #push:
-  #  branches: [ "muelu-sync-workflow" ]
   pull_request:
     branches: [ "develop" ]
+    types:
+      - opened
+      - synchronize
   schedule:
     - cron: '41 23 * * 2'
 
@@ -25,17 +26,12 @@ permissions:
 jobs:
   analyze:
     name: Analyze (${{ matrix.language }})
-    # Runner size impacts CodeQL analysis time. To learn more, please see:
-    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
-    #   - https://gh.io/supported-runners-and-hardware-resources
-    #   - https://gh.io/using-larger-runners (GitHub.com only)
-    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
-    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
-    timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
+    runs-on: [self-hosted, gcc-10.3.0_openmpi-4.1.6]
+    if: ${{ github.event.action == 'synchronize' || github.event.action == 'opened' }}
+
     permissions:
       # required for all workflows
       security-events: write
-
       # only required for workflows in private repositories
       actions: read
       contents: read
@@ -46,16 +42,7 @@ jobs:
         include:
         - language: c-cpp
           build-mode: manual
-        #- language: python
-        #  build-mode: none
-        # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
-        # Use `c-cpp` to analyze code written in C, C++ or both
-        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
-        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
-        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
-        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
-        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
-        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
+
     steps:
     - name: Checkout repository
       uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
@@ -70,20 +57,38 @@ jobs:
           query-filters:
           - exclude:
              tags: cpp/integer-multiplication-cast-to-long
-           
+    - name: env
+      env:
+        GITHUB_CONTEXT: ${{ toJson(github) }}
+      run: |
+        env
+    - name: module list
+      shell: bash -l {0}
+      run: |
+        module list
+        printenv PATH
     - if: matrix.build-mode == 'manual'
-      name: Configure Trilinos            
+      name: Get dependencies
       run: |
-          mkdir -p trilinos_build
-          cd trilinos_build 
-          cmake -G 'Unix Makefiles' -DTrilinos_ENABLE_TESTS=OFF -DTrilinos_ENABLE_Epetra=OFF -DTrilinos_ENABLE_AztecOO=OFF -DTrilinos_ENABLE_Ifpack=OFF -DTrilinos_ENABLE_ML=OFF -D Trilinos_ENABLE_Triutils=OFF -DTrilinos_ENABLE_Tpetra=ON -DTrilinos_ENABLE_MueLu=ON -DTrilinos_ENABLE_Krino=OFF ..
-      
+          bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container"
+    - if: matrix.build-mode == 'manual'
+      name: Generate CMake fragment for changed packages
+      run: |
+          git fetch origin ${GITHUB_BASE_REF}
+          git branch
+          bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh ${GITHUB_BASE_REF} ${GITHUB_HEAD_REF} package_enables.cmake package_subprojects.cmake"
     - if: matrix.build-mode == 'manual'
-      name: Build Trilinos
+      name: Configure and Build Trilinos
+      shell: bash -lc {0}
       run: |
-          cd trilinos_build 
-          make -j 2
-      
+          mkdir -p trilinos_build
+          mv package_enables.cmake trilinos_build
+          cd trilinos_build
+
+          source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables
+          cmake -C genconfig_fragment.cmake -C package_enables.cmake ..
+          ninja -j 16
+            
     - name: Perform CodeQL Analysis
       uses: github/codeql-action/analyze@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8
       with:

From 5742da5d3c8b3599e1348e016a85484ec0826dcf Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Mon, 28 Oct 2024 16:03:26 -0600
Subject: [PATCH 07/25] Fix arguments of get-changed-trilinos-packages.sh for
 CodeQL

Fix calling of get-changed-trilinos-packages.sh to correctly reference
the origin remote.

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index e0478400bf5a..0851448adb05 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -76,7 +76,7 @@ jobs:
       run: |
           git fetch origin ${GITHUB_BASE_REF}
           git branch
-          bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh ${GITHUB_BASE_REF} ${GITHUB_HEAD_REF} package_enables.cmake package_subprojects.cmake"
+          bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake"
     - if: matrix.build-mode == 'manual'
       name: Configure and Build Trilinos
       shell: bash -lc {0}

From 661569af6dffdaaaeb11f548ea26ac71f207e356 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 28 Oct 2024 22:59:16 +0000
Subject: [PATCH 08/25] Bump actions/dependency-review-action from 4.3.4 to
 4.4.0

Bumps [actions/dependency-review-action](https://github.com/actions/dependency-review-action) from 4.3.4 to 4.4.0.
- [Release notes](https://github.com/actions/dependency-review-action/releases)
- [Commits](https://github.com/actions/dependency-review-action/compare/5a2ce3f5b92ee19cbb1541a4984c76d921601d7c...4081bf99e2866ebe428fc0477b69eb4fcda7220a)

---
updated-dependencies:
- dependency-name: actions/dependency-review-action
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/dependency-review.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
index 7b0990bcf5ca..bf29beac76d5 100644
--- a/.github/workflows/dependency-review.yml
+++ b/.github/workflows/dependency-review.yml
@@ -24,4 +24,4 @@ jobs:
       - name: 'Checkout Repository'
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
       - name: 'Dependency Review'
-        uses: actions/dependency-review-action@5a2ce3f5b92ee19cbb1541a4984c76d921601d7c # v4.3.4
+        uses: actions/dependency-review-action@4081bf99e2866ebe428fc0477b69eb4fcda7220a # v4.4.0

From 40e117299710784a953a03469b2b32752a4ea29b Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 28 Oct 2024 22:59:27 +0000
Subject: [PATCH 09/25] Bump github/codeql-action from 3.26.13 to 3.27.0

Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.26.13 to 3.27.0.
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/f779452ac5af1c261dce0346a8f964149f49322b...662472033e021d55d94146f66f6058822b0b39fd)

---
updated-dependencies:
- dependency-name: github/codeql-action
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/codeql.yml     | 4 ++--
 .github/workflows/scorecards.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 56bbf091adaf..4139508fa42b 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -62,7 +62,7 @@ jobs:
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13
+      uses: github/codeql-action/init@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0
       with:
         languages: ${{ matrix.language }}
         build-mode: ${{ matrix.build-mode }}
@@ -85,6 +85,6 @@ jobs:
           make -j 2
       
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13
+      uses: github/codeql-action/analyze@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0
       with:
         category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index c648a7e9b626..46a2c4571aff 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -66,6 +66,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@f779452ac5af1c261dce0346a8f964149f49322b # v3.26.13
+        uses: github/codeql-action/upload-sarif@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0
         with:
           sarif_file: results.sarif

From 1d278e8c57e5f0a936f8eb8f6184222a19e5f681 Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Mon, 28 Oct 2024 17:12:28 -0600
Subject: [PATCH 10/25] Move GenConfig step into the Generate CMake fragment
 step

The GenConfig step is just used to generate a cmake fragment for the
configuration. This would fit nicely with the step that generates the
other cmake fragment for package enables.

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 0851448adb05..aa9b8043c094 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -72,20 +72,20 @@ jobs:
       run: |
           bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container"
     - if: matrix.build-mode == 'manual'
-      name: Generate CMake fragment for changed packages
+      name: Generate CMake fragments
       run: |
           git fetch origin ${GITHUB_BASE_REF}
-          git branch
+
+          mkdir -p trilinos_build && cd trilinos_build
+
+          source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables
           bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake"
     - if: matrix.build-mode == 'manual'
       name: Configure and Build Trilinos
       shell: bash -lc {0}
       run: |
-          mkdir -p trilinos_build
-          mv package_enables.cmake trilinos_build
           cd trilinos_build
 
-          source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables
           cmake -C genconfig_fragment.cmake -C package_enables.cmake ..
           ninja -j 16
             

From 237a61123802fd9d586eac0a32193178bbeda52e Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Mon, 28 Oct 2024 17:14:14 -0600
Subject: [PATCH 11/25] Tidy up workflow file and add newlines

Tidy up workflow file with consistent naming and add newlines between
each named step for better readability.

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index aa9b8043c094..6fc58693f0b7 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -57,20 +57,24 @@ jobs:
           query-filters:
           - exclude:
              tags: cpp/integer-multiplication-cast-to-long
-    - name: env
+
+    - name: Print environment
       env:
         GITHUB_CONTEXT: ${{ toJson(github) }}
       run: |
         env
-    - name: module list
+
+    - name: Module list
       shell: bash -l {0}
       run: |
         module list
         printenv PATH
+
     - if: matrix.build-mode == 'manual'
       name: Get dependencies
       run: |
           bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container"
+
     - if: matrix.build-mode == 'manual'
       name: Generate CMake fragments
       run: |
@@ -80,15 +84,16 @@ jobs:
 
           source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables
           bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake"
+
     - if: matrix.build-mode == 'manual'
-      name: Configure and Build Trilinos
+      name: Configure and build Trilinos
       shell: bash -lc {0}
       run: |
           cd trilinos_build
 
           cmake -C genconfig_fragment.cmake -C package_enables.cmake ..
           ninja -j 16
-            
+
     - name: Perform CodeQL Analysis
       uses: github/codeql-action/analyze@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8
       with:

From 7727e25ee133d6fbdce85589cca9c6359cb4109e Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Tue, 29 Oct 2024 08:38:52 -0600
Subject: [PATCH 12/25] Fix bash login shell for generate CMake fragment

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 6fc58693f0b7..be4c96a2a393 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -77,6 +77,7 @@ jobs:
 
     - if: matrix.build-mode == 'manual'
       name: Generate CMake fragments
+      shell: bash -lc {0}
       run: |
           git fetch origin ${GITHUB_BASE_REF}
 

From 26eb6ff3cd6d003781d9fa51be2108cd77b94a3d Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Tue, 29 Oct 2024 09:00:32 -0600
Subject: [PATCH 13/25] Add TriBITS cache variables to reduce code built

Add TriBITS cache variables to reduce code built for packages that are
not needed.

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index be4c96a2a393..130ed194da46 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -92,7 +92,7 @@ jobs:
       run: |
           cd trilinos_build
 
-          cmake -C genconfig_fragment.cmake -C package_enables.cmake ..
+          cmake -C genconfig_fragment.cmake -C package_enables.cmake -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF  ..
           ninja -j 16
 
     - name: Perform CodeQL Analysis

From c367a234d9de789c533a4e7b853ed64a463e0a8d Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Tue, 29 Oct 2024 16:41:33 -0600
Subject: [PATCH 14/25] Use multi-line yml for cmake command

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 130ed194da46..3ada32ba19fe 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -92,7 +92,11 @@ jobs:
       run: |
           cd trilinos_build
 
-          cmake -C genconfig_fragment.cmake -C package_enables.cmake -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF  ..
+          cmake -C genconfig_fragment.cmake -C package_enables.cmake \
+            -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF \
+            -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF \
+            -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF  ..
+
           ninja -j 16
 
     - name: Perform CodeQL Analysis

From 54d711e40daac2252c661c52482c29a9179b6974 Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Tue, 29 Oct 2024 17:02:58 -0600
Subject: [PATCH 15/25] Manually disable each deprecated package in cmake
 command

Manually disable each deprecated package in the cmake command for CodeQL
configuration. Since these are defined on the command line, they should
take priority over any of the enables from the package_enables.cmake or
the genconfig_fragment.cmake fragments.

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 3ada32ba19fe..b739518db9ef 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -95,7 +95,22 @@ jobs:
           cmake -C genconfig_fragment.cmake -C package_enables.cmake \
             -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF \
             -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF \
-            -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF  ..
+            -DTrilinos_ENABLE_SECONDARY_TESTED_CODE=OFF \
+            -DTrilinos_ENABLE_Amesos=OFF \
+            -DTrilinos_ENABLE_AztecOO=OFF \
+            -DTrilinos_ENABLE_Epetra=OFF \
+            -DTrilinos_ENABLE_EpetraExt=OFF \
+            -DTrilinos_ENABLE_Ifpack=OFF \
+            -DTrilinos_ENABLE_Intrepid=OFF \
+            -DTrilinos_ENABLE_Isorropia=OFF \
+            -DTrilinos_ENABLE_ML=OFF \
+            -DTrilinos_ENABLE_NewPackage=OFF \
+            -DTrilinos_ENABLE_Pliris=OFF \
+            -DTrilinos_ENABLE_PyTrilinos=OFF \
+            -DTrilinos_ENABLE_ShyLU_DDCore=OFF \
+            -DTrilinos_ENABLE_ThyraEpetraAdapters=OFF \
+            -DTrilinos_ENABLE_ThyraEpetraExtAdapters=OFF \
+            -DTrilinos_ENABLE_Triutils=OFF ..
 
           ninja -j 16
 

From d88fa994bdb626eb4dac0026e6be601f9a62f03c Mon Sep 17 00:00:00 2001
From: Ian Halim <ihalim@ascicgpu031.sandia.gov>
Date: Fri, 23 Aug 2024 19:21:52 -0600
Subject: [PATCH 16/25] MueLu: Fixing Issue #13377 and #13378

Issues listed above have been addressed.
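The cut-drop threshold has been redefined as an inverse: the cut test is
now threshold^2 * a > b instead of a > threshold * b, so an old tolerance
t corresponds to a new tolerance of 1/sqrt(t) (e.g. 8.0 -> sqrt(0.125)).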
Unit tests have been modified to be more thorough.

Signed-off-by: Ian Halim <ihalim@ascicgpu031.sandia.gov>
---
 .../MueLu_CoalesceDropFactory_def.hpp         |  60 +++---
 .../test/unit_tests/CoalesceDropFactory.cpp   | 178 +++++++++++++++---
 2 files changed, 187 insertions(+), 51 deletions(-)

diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
index 0431bf011541..1f9961289cb0 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
@@ -591,13 +591,27 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 
           //move from host to device
           auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0);
-          auto boundaryNodesDevice = Kokkos::create_mirror_view_and_copy(ExecSpace(), boundaryNodes);
           auto thresholdKokkos = static_cast<impl_scalar_type>(threshold);
           auto realThresholdKokkos = implATS::magnitude(thresholdKokkos);
           auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns);
 
-          auto At = Utilities::Op2TpetraCrs(A);
-          auto A_device = At->getLocalMatrixDevice();
+          auto A_device = A->getLocalMatrixDevice();
+          RCP<LWGraph> graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A"));
+          RCP<const Import> importer = A->getCrsGraph()->getImporter();
+          RCP<LocalOrdinalVector> boundaryNodesVector = Xpetra::VectorFactory<LO, LO, GO, NO>::Build(graph->GetDomainMap());
+          RCP<LocalOrdinalVector> boundaryColumnVector;
+          for(size_t i = 0; i < graph->GetNodeNumVertices(); i++) {
+            boundaryNodesVector->getDataNonConst(0)[i] = boundaryNodes[i];
+          }
+          if(!importer.is_null()) {
+            boundaryColumnVector = Xpetra::VectorFactory<LO, LO, GO, NO>::Build(graph->GetImportMap());
+            boundaryColumnVector->doImport(*boundaryNodesVector, *importer, Xpetra::INSERT);
+          }
+          else {
+            boundaryColumnVector = boundaryNodesVector;
+          }
+          auto boundaryColumn = boundaryColumnVector->getDeviceLocalView(Xpetra::Access::ReadOnly);
+          auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0);
 
           Kokkos::View<LO*, ExecSpace>rownnzView("rownnzView", A_device.numRows());
           auto drop_views = Kokkos::View<bool*, ExecSpace>("drop_views", A_device.nnz());
@@ -608,30 +622,24 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
             auto rowView = A_device.rowConst(row);
             size_t nnz = rowView.length;
 
-            size_t dropSize = 0;
             auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
             auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
 
             //find magnitudes
-            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID, size_t &count) {
+            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) {
               index_view(colID) = colID;
               LO col = rowView.colidx(colID);
               //ignore diagonals for now, they are checked again later
-              if(row == col) {
-                drop_view(colID) = true;
-                count++;
-              }
               //Don't aggregate boundaries
-              else if(boundaryNodesDevice(colID)) {
+              if(row == col || boundary(col)) {
                 drop_view(colID) = true;
               }
               else {
                 drop_view(colID) = false;
-                count++;
               }
-            }, dropSize);
+            });
 
-            size_t dropStart = dropSize;
+            size_t dropStart = nnz;
             if (classicalAlgo == unscaled_cut) {
               //push diagonals and boundaries to the right, sort everything else by aij on the left
               Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
@@ -646,7 +654,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
               });
 
               //find index where dropping starts
-              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) {
+              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) {
                 auto const& x = index_view(i - 1);
                 auto const& y = index_view(i);
                 typename implATS::magnitudeType x_aij = 0;
@@ -658,7 +666,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
                   y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
                 }
 
-                if(x_aij > realThresholdKokkos * y_aij) {
+                if(realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) {
                   if(i < min) {
                     min = i;
                   }
@@ -673,30 +681,30 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
                 else {
                   auto x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
                   auto y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
-                  auto x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
-                  auto y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
+                  auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
+                  auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
                   return (x_aij / x_aiiajj) > (y_aij / y_aiiajj);
                 }
               });
 
               //find index where dropping starts
-              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, dropSize), [=](size_t i, size_t& min) {
+              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) {
                 auto const& x = index_view(i - 1);
                 auto const& y = index_view(i);
                 typename implATS::magnitudeType x_val = 0;
                 typename implATS::magnitudeType y_val = 0;
                 if(!drop_view(x)) {
                   typename implATS::magnitudeType x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
-                  typename implATS::magnitudeType x_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
+                  typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
                   x_val = x_aij / x_aiiajj;
                 }
                 if(!drop_view(y)) {
                   typename implATS::magnitudeType y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
-                  typename implATS::magnitudeType y_aiiajj = implATS::magnitude(thresholdKokkos * thresholdKokkos * ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
+                  typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
                   y_val = y_aij / y_aiiajj;
                 }
 
-                if(x_val > realThresholdKokkos * y_val) {
+                if(realThresholdKokkos * realThresholdKokkos * x_val > y_val) {
                   if(i < min) {
                     min = i;
                   }
@@ -705,15 +713,15 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
             }
 
             //drop everything to the right of where values stop passing threshold
-            if(dropStart < dropSize) {
-              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, dropSize), [=](size_t i) {
+            if(dropStart < nnz) {
+              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) {
                 drop_view(index_view(i)) = true;
               });
             }
 
             LO rownnz = 0;
             GO rowDropped = 0;
-            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, dropSize), [=](const size_t idxID, LO& keep, GO& drop) {
+            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) {
               LO col = rowView.colidx(idxID);
               //don't drop diagonal
               if(row == col || !drop_view(idxID)) {
@@ -1381,7 +1389,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
                       auto const& y = drop_vec[i];
                       auto a        = x.val;
                       auto b        = y.val;
-                      if (a > realThreshold * b) {
+                      if (realThreshold * realThreshold * a > b) {
                         drop = true;
 #ifdef HAVE_MUELU_DEBUG
                         if (distanceLaplacianCutVerbose) {
@@ -1404,7 +1412,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
                       auto const& y = drop_vec[i];
                       auto a        = x.val / x.diag;
                       auto b        = y.val / y.diag;
-                      if (a > realThreshold * b) {
+                      if (realThreshold * realThreshold * a > b) {
                         drop = true;
 #ifdef HAVE_MUELU_DEBUG
                         if (distanceLaplacianCutVerbose) {
diff --git a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp
index e8902b178708..0073ca7e9bfb 100644
--- a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp
+++ b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp
@@ -1223,7 +1223,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianScaledCu
   // L_ij = -36
   // L_ii = 72
   // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj|
-  coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0));
+  coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125)));
   coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian")));
   coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("scaled cut")));
   fineLevel.Request("Graph", &coalesceDropFact);
@@ -1289,7 +1289,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianUnscaled
   // L_ij = -36
   // L_ii = 72
   // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj|
-  coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0));
+  coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125)));
   coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian")));
   coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("unscaled cut")));
   fineLevel.Request("Graph", &coalesceDropFact);
@@ -1355,7 +1355,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, DistanceLaplacianCutSym,
   // L_ij = -36
   // L_ii = 72
   // criterion for dropping is |L_ij|^2 <= tol^2 * |L_ii*L_jj|
-  coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0));
+  coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125)));
   coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("distance laplacian")));
   coalesceDropFact.SetParameter("aggregation: distance laplacian algo", Teuchos::ParameterEntry(std::string("scaled cut symmetric")));
   fineLevel.Request("Graph", &coalesceDropFact);
@@ -1389,6 +1389,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala
   typedef Teuchos::ScalarTraits<SC> STS;
   typedef typename STS::magnitudeType real_type;
   typedef Xpetra::MultiVector<real_type, LO, GO, NO> RealValuedMultiVector;
+  typedef Tpetra::Map<LO, GO, NO> map_type;
+  typedef Tpetra::CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
 
   MUELU_TESTING_SET_OSTREAM;
   MUELU_TESTING_LIMIT_SCOPE(Scalar, GlobalOrdinal, Node);
@@ -1399,11 +1401,41 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala
   Level fineLevel;
   TestHelpers::TestFactory<SC, LO, GO, NO>::createSingleLevelHierarchy(fineLevel);
 
-  RCP<Matrix> A = TestHelpers::TestFactory<SC, LO, GO, NO>::Build1DPoisson(36);
+  const global_size_t globalIndices = 12;
+  const GO indexBase = 0;
+  RCP<const map_type> map = rcp(new map_type(globalIndices, indexBase, comm));
+  RCP<crs_matrix_type> A_t(new crs_matrix_type(map, 5));
+  const SC two = static_cast<SC>(2.0);
+  const SC one = static_cast<SC>(1.0);
+  const SC negOne = static_cast<SC>(-1.0);
+  for(LO lclRow = 0; lclRow < static_cast<LO> (map->getLocalNumElements()); lclRow++) {
+    const GO gblRow = map->getGlobalElement(lclRow);
+    if(gblRow == 0) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow, gblRow + 1), Teuchos::tuple<SC>(two, negOne));
+    }
+    else if(static_cast<Tpetra::global_size_t>(gblRow) == globalIndices - 1) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 1, gblRow), Teuchos::tuple<SC>(negOne, two));
+    }
+    else if(gblRow == 2 || gblRow == 9) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow), Teuchos::tuple<SC>(one));
+    }
+    else if(gblRow == 5) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple<SC>(negOne, negOne, two, negOne, negOne));
+    }
+    else if(gblRow == 6) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple<SC>(negOne, two, two, two, negOne));
+    }
+    else {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple<SC>(negOne, two, negOne));
+    }
+  }
+  A_t->fillComplete();
+  RCP<CrsMatrix> A_x = rcp(new TpetraCrsMatrix(A_t));
+  RCP<Matrix> A = rcp(new CrsMatrixWrap(A_x));
   fineLevel.Set("A", A);
 
   Teuchos::ParameterList galeriList;
-  galeriList.set("nx", Teuchos::as<GlobalOrdinal>(36));
+  galeriList.set("nx", Teuchos::as<GlobalOrdinal>(globalIndices));
   RCP<RealValuedMultiVector> coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates<SC, LO, GO, Map, RealValuedMultiVector>("1D", A->getRowMap(), galeriList);
   fineLevel.Set("Coordinates", coordinates);
 
@@ -1429,25 +1461,59 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala
   const RCP<const Map> myImportMap = graph->GetImportMap();  // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping!
   const RCP<const Map> myDomainMap = graph->GetDomainMap();
 
-  TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), 35);
+  TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1);
   TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0);
   TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0);
-  TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as<size_t>(36 + (comm->getSize() - 1) * 2));
+  TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as<size_t>(globalIndices + (comm->getSize() - 1) * 2));
 
-  TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), 35);
+  TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1);
   TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0);
   TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0);
-  TEST_EQUALITY(myDomainMap->getGlobalNumElements(), 36);
-
-  TEST_EQUALITY(graph->GetGlobalNumEdges(), 72);
-
-}  // SignaledClassical
+  TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices);
+
+  TEST_EQUALITY(graph->GetGlobalNumEdges(), 28);
+
+  int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28};
+  int columns[28] = {0, 1,
+                     0, 1,
+                     2,
+                     3, 4,
+                     3, 4, 5,
+                     3, 4, 5, 6, 7,
+                     5, 6, 7,
+                     6, 7, 8,
+                     7, 8,
+                     9,
+                     10, 11,
+                     10, 11};
+  auto rowPtrs = graph->getRowPtrs();
+  auto entries = graph->getEntries();
+  size_t rowID = 0;
+  TEST_EQUALITY(rowPtrs(0), rowID);
+  for(size_t i = 0; i < rowPtrs.size()-1; i++) {
+    auto gblID = myDomainMap->getGlobalElement(i);
+    int rownnz = rows[gblID+1]-rows[gblID];
+    rowID += rownnz;
+    TEST_EQUALITY(rowPtrs(i+1), rowID);
+
+    std::vector<int> colID;
+    for(int j = 0; j < rownnz; j++) {
+      colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j)));
+    }
+    std::sort(std::begin(colID), std::end(colID));
+    for(int j = 0; j < rownnz; j++) {
+      TEST_EQUALITY(colID[j], columns[rows[gblID]+j]);
+    }
+  }
+}  // ClassicalScaledCut
 
 TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Scalar, LocalOrdinal, GlobalOrdinal, Node) {
 #include <MueLu_UseShortNames.hpp>
   typedef Teuchos::ScalarTraits<SC> STS;
   typedef typename STS::magnitudeType real_type;
   typedef Xpetra::MultiVector<real_type, LO, GO, NO> RealValuedMultiVector;
+  typedef Tpetra::Map<LO, GO, NO> map_type;
+  typedef Tpetra::CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
 
   MUELU_TESTING_SET_OSTREAM;
   MUELU_TESTING_LIMIT_SCOPE(Scalar, GlobalOrdinal, Node);
@@ -1458,11 +1524,41 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca
   Level fineLevel;
   TestHelpers::TestFactory<SC, LO, GO, NO>::createSingleLevelHierarchy(fineLevel);
 
-  RCP<Matrix> A = TestHelpers::TestFactory<SC, LO, GO, NO>::Build1DPoisson(36);
+  const global_size_t globalIndices = 12;
+  const GO indexBase = 0;
+  RCP<const map_type> map = rcp(new map_type(globalIndices, indexBase, comm));
+  RCP<crs_matrix_type> A_t(new crs_matrix_type(map, 5));
+  const SC two = static_cast<SC>(2.0);
+  const SC one = static_cast<SC>(1.0);
+  const SC negOne = static_cast<SC>(-1.0);
+  for(LO lclRow = 0; lclRow < static_cast<LO> (map->getLocalNumElements()); lclRow++) {
+    const GO gblRow = map->getGlobalElement(lclRow);
+    if(gblRow == 0) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow, gblRow + 1), Teuchos::tuple<SC>(two, negOne));
+    }
+    else if(static_cast<Tpetra::global_size_t>(gblRow) == globalIndices - 1) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 1, gblRow), Teuchos::tuple<SC>(negOne, two));
+    }
+    else if(gblRow == 2 || gblRow == 9) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow), Teuchos::tuple<SC>(one));
+    }
+    else if(gblRow == 5) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple<SC>(negOne, negOne, two, negOne, negOne));
+    }
+    else if(gblRow == 6) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple<SC>(negOne, two, two, two, negOne));
+    }
+    else {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple<SC>(negOne, two, negOne));
+    }
+  }
+  A_t->fillComplete();
+  RCP<CrsMatrix> A_x = rcp(new TpetraCrsMatrix(A_t));
+  RCP<Matrix> A = rcp(new CrsMatrixWrap(A_x));
   fineLevel.Set("A", A);
 
   Teuchos::ParameterList galeriList;
-  galeriList.set("nx", Teuchos::as<GlobalOrdinal>(36));
+  galeriList.set("nx", Teuchos::as<GlobalOrdinal>(globalIndices));
   RCP<RealValuedMultiVector> coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates<SC, LO, GO, Map, RealValuedMultiVector>("1D", A->getRowMap(), galeriList);
   fineLevel.Set("Coordinates", coordinates);
 
@@ -1488,19 +1584,51 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca
   const RCP<const Map> myImportMap = graph->GetImportMap();  // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping!
   const RCP<const Map> myDomainMap = graph->GetDomainMap();
 
-  TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), 35);
+  TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1);
   TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0);
   TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0);
-  TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as<size_t>(36 + (comm->getSize() - 1) * 2));
+  TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as<size_t>(globalIndices + (comm->getSize() - 1) * 2));
 
-  TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), 35);
+  TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1);
   TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0);
   TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0);
-  TEST_EQUALITY(myDomainMap->getGlobalNumElements(), 36);
-
-  TEST_EQUALITY(graph->GetGlobalNumEdges(), 72);
-
-}  // SignaledClassical
+  TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices);
+
+  TEST_EQUALITY(graph->GetGlobalNumEdges(), 28);
+
+  int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28};
+  int columns[28] = {0, 1,
+                     0, 1,
+                     2,
+                     3, 4,
+                     3, 4, 5,
+                     3, 4, 5, 6, 7,
+                     5, 6, 7,
+                     6, 7, 8,
+                     7, 8,
+                     9,
+                     10, 11,
+                     10, 11};
+  auto rowPtrs = graph->getRowPtrs();
+  auto entries = graph->getEntries();
+  size_t rowID = 0;
+  TEST_EQUALITY(rowPtrs(0), rowID);
+  for(size_t i = 0; i < rowPtrs.size()-1; i++) {
+    auto gblID = myDomainMap->getGlobalElement(i);
+    int rownnz = rows[gblID+1]-rows[gblID];
+    rowID += rownnz;
+    TEST_EQUALITY(rowPtrs(i+1), rowID);
+
+    std::vector<int> colID;
+    for(int j = 0; j < rownnz; j++) {
+      colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j)));
+    }
+    std::sort(std::begin(colID), std::end(colID));
+    for(int j = 0; j < rownnz; j++) {
+      TEST_EQUALITY(colID[j], columns[rows[gblID]+j]);
+    }
+  }
+}  // ClassicalUnScaledCut
 
 TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, SignaledClassical, Scalar, LocalOrdinal, GlobalOrdinal, Node) {
 #include <MueLu_UseShortNames.hpp>
@@ -1902,7 +2030,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, BlockDiagonal, Scalar, Lo
   coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme);
   coalesceDropFact.SetFactory("UnAmalgamationInfo", amalgFact);
   coalesceDropFact.SetFactory("BlockNumber", ibFact);
-  coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0));
+  coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125)));
   coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("block diagonal")));
   coalesceDropFact.SetParameter("aggregation: block diagonal: interleaved blocksize", Teuchos::ParameterEntry(3));
   coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme);
@@ -1949,7 +2077,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, BlockDiagonalClassical, S
   coalesceDropFact.SetDefaultVerbLevel(MueLu::Extreme);
   coalesceDropFact.SetFactory("UnAmalgamationInfo", amalgFact);
   coalesceDropFact.SetFactory("BlockNumber", ibFact);
-  coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(8.0));
+  coalesceDropFact.SetParameter("aggregation: drop tol", Teuchos::ParameterEntry(sqrt(0.125)));
   coalesceDropFact.SetParameter("aggregation: drop scheme", Teuchos::ParameterEntry(std::string("block diagonal classical")));
   coalesceDropFact.SetParameter("aggregation: block diagonal: interleaved blocksize", Teuchos::ParameterEntry(3));
   fineLevel.Request("Graph", &coalesceDropFact);

From 26b3eac40a0dd818b7cb5950744c5fba53087005 Mon Sep 17 00:00:00 2001
From: Christian Glusa <caglusa@sandia.gov>
Date: Mon, 4 Nov 2024 09:11:53 -0700
Subject: [PATCH 17/25] MueLu: Fix clang-format

Signed-off-by: Christian Glusa <caglusa@sandia.gov>
---
 .../MueLu_CoalesceDropFactory_def.hpp         | 285 +++++++++---------
 .../test/unit_tests/CoalesceDropFactory.cpp   | 118 ++++----
 2 files changed, 198 insertions(+), 205 deletions(-)

diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
index 1f9961289cb0..e2bae01ffa21 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
@@ -490,8 +490,8 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
               GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl;
           }
         } else {
-          ghostedDiag     = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixOverlappedDiagonal(*A);
-          if(classicalAlgo == defaultAlgo) {
+          ghostedDiag = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixOverlappedDiagonal(*A);
+          if (classicalAlgo == defaultAlgo) {
             ghostedDiagVals = ghostedDiag->getData(0);
           }
         }
@@ -510,7 +510,7 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
 
         LO realnnz = 0;
         rows(0)    = 0;
-        if(classicalAlgo == defaultAlgo) {
+        if (classicalAlgo == defaultAlgo) {
           SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel);
           for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
             size_t nnz          = A->getNumEntriesInLocalRow(row);
@@ -578,177 +578,180 @@ void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level
               rows(row + 1) = realnnz;
             }
           }  // end for row
-        }
-        else {
+        } else {
           /* Cut Algorithm */
           SubFactoryMonitor m1(*this, "Cut Drop", currentLevel);
-          using ExecSpace = typename Node::execution_space;
-          using TeamPol = Kokkos::TeamPolicy<ExecSpace>;
-          using TeamMem = typename TeamPol::member_type;
-          using ATS = Kokkos::ArithTraits<Scalar>;
+          using ExecSpace        = typename Node::execution_space;
+          using TeamPol          = Kokkos::TeamPolicy<ExecSpace>;
+          using TeamMem          = typename TeamPol::member_type;
+          using ATS              = Kokkos::ArithTraits<Scalar>;
           using impl_scalar_type = typename ATS::val_type;
-          using implATS = Kokkos::ArithTraits<impl_scalar_type>;
+          using implATS          = Kokkos::ArithTraits<impl_scalar_type>;
 
-          //move from host to device
+          // move from host to device
           auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0);
-          auto thresholdKokkos = static_cast<impl_scalar_type>(threshold);
+          auto thresholdKokkos     = static_cast<impl_scalar_type>(threshold);
           auto realThresholdKokkos = implATS::magnitude(thresholdKokkos);
-          auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns);
+          auto columnsDevice       = Kokkos::create_mirror_view(ExecSpace(), columns);
 
-          auto A_device = A->getLocalMatrixDevice();
-          RCP<LWGraph> graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A"));
-          RCP<const Import> importer = A->getCrsGraph()->getImporter();
+          auto A_device                               = A->getLocalMatrixDevice();
+          RCP<LWGraph> graph                          = rcp(new LWGraph(A->getCrsGraph(), "graph of A"));
+          RCP<const Import> importer                  = A->getCrsGraph()->getImporter();
           RCP<LocalOrdinalVector> boundaryNodesVector = Xpetra::VectorFactory<LO, LO, GO, NO>::Build(graph->GetDomainMap());
           RCP<LocalOrdinalVector> boundaryColumnVector;
-          for(size_t i = 0; i < graph->GetNodeNumVertices(); i++) {
+          for (size_t i = 0; i < graph->GetNodeNumVertices(); i++) {
             boundaryNodesVector->getDataNonConst(0)[i] = boundaryNodes[i];
           }
-          if(!importer.is_null()) {
+          if (!importer.is_null()) {
             boundaryColumnVector = Xpetra::VectorFactory<LO, LO, GO, NO>::Build(graph->GetImportMap());
             boundaryColumnVector->doImport(*boundaryNodesVector, *importer, Xpetra::INSERT);
-          }
-          else {
+          } else {
             boundaryColumnVector = boundaryNodesVector;
           }
           auto boundaryColumn = boundaryColumnVector->getDeviceLocalView(Xpetra::Access::ReadOnly);
-          auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0);
+          auto boundary       = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0);
 
-          Kokkos::View<LO*, ExecSpace>rownnzView("rownnzView", A_device.numRows());
-          auto drop_views = Kokkos::View<bool*, ExecSpace>("drop_views", A_device.nnz());
+          Kokkos::View<LO*, ExecSpace> rownnzView("rownnzView", A_device.numRows());
+          auto drop_views  = Kokkos::View<bool*, ExecSpace>("drop_views", A_device.nnz());
           auto index_views = Kokkos::View<size_t*, ExecSpace>("index_views", A_device.nnz());
 
-          Kokkos::parallel_reduce("classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) {
-            LO row = teamMember.league_rank();
-            auto rowView = A_device.rowConst(row);
-            size_t nnz = rowView.length;
-
-            auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
-            auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row+1)));
-
-            //find magnitudes
-            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) {
-              index_view(colID) = colID;
-              LO col = rowView.colidx(colID);
-              //ignore diagonals for now, they are checked again later
-              //Don't aggregate boundaries
-              if(row == col || boundary(col)) {
-                drop_view(colID) = true;
-              }
-              else {
-                drop_view(colID) = false;
-              }
-            });
-
-            size_t dropStart = nnz;
-            if (classicalAlgo == unscaled_cut) {
-              //push diagonals and boundaries to the right, sort everything else by aij on the left
-              Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
-                if(drop_view(x) || drop_view(y)) {
-                  return drop_view(x) < drop_view(y);
-                }
-                else {
-                  auto x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
-                  auto y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
-                  return x_aij > y_aij;
-                }
-              });
+          Kokkos::parallel_reduce(
+              "classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) {
+                LO row       = teamMember.league_rank();
+                auto rowView = A_device.rowConst(row);
+                size_t nnz   = rowView.length;
 
-              //find index where dropping starts
-              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) {
-                auto const& x = index_view(i - 1);
-                auto const& y = index_view(i);
-                typename implATS::magnitudeType x_aij = 0;
-                typename implATS::magnitudeType y_aij = 0;
-                if(!drop_view(x)) {
-                  x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
-                }
-                if(!drop_view(y)) {
-                  y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
-                }
+                auto drop_view  = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row + 1)));
+                auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row + 1)));
 
-                if(realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) {
-                  if(i < min) {
-                    min = i;
+                // find magnitudes
+                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) {
+                  index_view(colID) = colID;
+                  LO col            = rowView.colidx(colID);
+                  // ignore diagonals for now, they are checked again later
+                  // Don't aggregate boundaries
+                  if (row == col || boundary(col)) {
+                    drop_view(colID) = true;
+                  } else {
+                    drop_view(colID) = false;
                   }
-                }
-              }, Kokkos::Min<size_t>(dropStart));
-            } else if (classicalAlgo == scaled_cut) {
-              //push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left
-              Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
-                if(drop_view(x) || drop_view(y)) {
-                  return drop_view(x) < drop_view(y);
-                }
-                else {
-                  auto x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
-                  auto y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
-                  auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
-                  auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
-                  return (x_aij / x_aiiajj) > (y_aij / y_aiiajj);
-                }
-              });
+                });
 
-              //find index where dropping starts
-              Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) {
-                auto const& x = index_view(i - 1);
-                auto const& y = index_view(i);
-                typename implATS::magnitudeType x_val = 0;
-                typename implATS::magnitudeType y_val = 0;
-                if(!drop_view(x)) {
-                  typename implATS::magnitudeType x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
-                  typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
-                  x_val = x_aij / x_aiiajj;
-                }
-                if(!drop_view(y)) {
-                  typename implATS::magnitudeType y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
-                  typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
-                  y_val = y_aij / y_aiiajj;
-                }
+                size_t dropStart = nnz;
+                if (classicalAlgo == unscaled_cut) {
+                  // push diagonals and boundaries to the right, sort everything else by aij on the left
+                  Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
+                    if (drop_view(x) || drop_view(y)) {
+                      return drop_view(x) < drop_view(y);
+                    } else {
+                      auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x));
+                      auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y));
+                      return x_aij > y_aij;
+                    }
+                  });
 
-                if(realThresholdKokkos * realThresholdKokkos * x_val > y_val) {
-                  if(i < min) {
-                    min = i;
-                  }
+                  // find index where dropping starts
+                  Kokkos::parallel_reduce(
+                      Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) {
+                        auto const& x                         = index_view(i - 1);
+                        auto const& y                         = index_view(i);
+                        typename implATS::magnitudeType x_aij = 0;
+                        typename implATS::magnitudeType y_aij = 0;
+                        if (!drop_view(x)) {
+                          x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x));
+                        }
+                        if (!drop_view(y)) {
+                          y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y));
+                        }
+
+                        if (realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) {
+                          if (i < min) {
+                            min = i;
+                          }
+                        }
+                      },
+                      Kokkos::Min<size_t>(dropStart));
+                } else if (classicalAlgo == scaled_cut) {
+                  // push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left
+                  Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
+                    if (drop_view(x) || drop_view(y)) {
+                      return drop_view(x) < drop_view(y);
+                    } else {
+                      auto x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
+                      auto y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
+                      auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
+                      auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
+                      return (x_aij / x_aiiajj) > (y_aij / y_aiiajj);
+                    }
+                  });
+
+                  // find index where dropping starts
+                  Kokkos::parallel_reduce(
+                      Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) {
+                        auto const& x                         = index_view(i - 1);
+                        auto const& y                         = index_view(i);
+                        typename implATS::magnitudeType x_val = 0;
+                        typename implATS::magnitudeType y_val = 0;
+                        if (!drop_view(x)) {
+                          typename implATS::magnitudeType x_aij    = implATS::magnitude(rowView.value(x) * rowView.value(x));
+                          typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
+                          x_val                                    = x_aij / x_aiiajj;
+                        }
+                        if (!drop_view(y)) {
+                          typename implATS::magnitudeType y_aij    = implATS::magnitude(rowView.value(y) * rowView.value(y));
+                          typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
+                          y_val                                    = y_aij / y_aiiajj;
+                        }
+
+                        if (realThresholdKokkos * realThresholdKokkos * x_val > y_val) {
+                          if (i < min) {
+                            min = i;
+                          }
+                        }
+                      },
+                      Kokkos::Min<size_t>(dropStart));
                 }
-              }, Kokkos::Min<size_t>(dropStart));
-            }
 
-            //drop everything to the right of where values stop passing threshold
-            if(dropStart < nnz) {
-              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) {
-                drop_view(index_view(i)) = true;
-              });
-            }
+                // drop everything to the right of where values stop passing threshold
+                if (dropStart < nnz) {
+                  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) {
+                    drop_view(index_view(i)) = true;
+                  });
+                }
 
-            LO rownnz = 0;
-            GO rowDropped = 0;
-            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) {
-              LO col = rowView.colidx(idxID);
-              //don't drop diagonal
-              if(row == col || !drop_view(idxID)) {
-                columnsDevice(A_device.graph.row_map(row) + idxID) = col;
-                keep++;
-              }
-              else {
-                columnsDevice(A_device.graph.row_map(row) + idxID) = -1;
-                drop++;
-              }
-            }, rownnz, rowDropped);
+                LO rownnz     = 0;
+                GO rowDropped = 0;
+                Kokkos::parallel_reduce(
+                    Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) {
+                      LO col = rowView.colidx(idxID);
+                      // don't drop diagonal
+                      if (row == col || !drop_view(idxID)) {
+                        columnsDevice(A_device.graph.row_map(row) + idxID) = col;
+                        keep++;
+                      } else {
+                        columnsDevice(A_device.graph.row_map(row) + idxID) = -1;
+                        drop++;
+                      }
+                    },
+                    rownnz, rowDropped);
 
-            globalnnz += rownnz;
-            totalDropped += rowDropped;
-            rownnzView(row) = rownnz;
-          }, realnnz, numDropped);
+                globalnnz += rownnz;
+                totalDropped += rowDropped;
+                rownnzView(row) = rownnz;
+              },
+              realnnz, numDropped);
 
-          //update column indices so that kept indices are aligned to the left for subview that happens later on
+          // update column indices so that kept indices are aligned to the left for subview that happens later on
           Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1);
           Kokkos::deep_copy(columns, columnsDevice);
 
-          //update row indices by adding up new # of nnz in each row
+          // update row indices by adding up new # of nnz in each row
           auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows);
-          Kokkos::parallel_scan(Kokkos::RangePolicy<ExecSpace>(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) {
-            partial_sum += rownnzView(i);
-            if(is_final) rowsDevice(i+1) = partial_sum;
-          });
+          Kokkos::parallel_scan(
+              Kokkos::RangePolicy<ExecSpace>(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) {
+                partial_sum += rownnzView(i);
+                if (is_final) rowsDevice(i + 1) = partial_sum;
+              });
           Kokkos::deep_copy(rows, rowsDevice);
         }
 
diff --git a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp
index 0073ca7e9bfb..7ec8dbe27a3a 100644
--- a/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp
+++ b/packages/muelu/test/unit_tests/CoalesceDropFactory.cpp
@@ -1402,36 +1402,31 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala
   TestHelpers::TestFactory<SC, LO, GO, NO>::createSingleLevelHierarchy(fineLevel);
 
   const global_size_t globalIndices = 12;
-  const GO indexBase = 0;
-  RCP<const map_type> map = rcp(new map_type(globalIndices, indexBase, comm));
+  const GO indexBase                = 0;
+  RCP<const map_type> map           = rcp(new map_type(globalIndices, indexBase, comm));
   RCP<crs_matrix_type> A_t(new crs_matrix_type(map, 5));
-  const SC two = static_cast<SC>(2.0);
-  const SC one = static_cast<SC>(1.0);
+  const SC two    = static_cast<SC>(2.0);
+  const SC one    = static_cast<SC>(1.0);
   const SC negOne = static_cast<SC>(-1.0);
-  for(LO lclRow = 0; lclRow < static_cast<LO> (map->getLocalNumElements()); lclRow++) {
+  for (LO lclRow = 0; lclRow < static_cast<LO>(map->getLocalNumElements()); lclRow++) {
     const GO gblRow = map->getGlobalElement(lclRow);
-    if(gblRow == 0) {
+    if (gblRow == 0) {
       A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow, gblRow + 1), Teuchos::tuple<SC>(two, negOne));
-    }
-    else if(static_cast<Tpetra::global_size_t>(gblRow) == globalIndices - 1) {
+    } else if (static_cast<Tpetra::global_size_t>(gblRow) == globalIndices - 1) {
       A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 1, gblRow), Teuchos::tuple<SC>(negOne, two));
-    }
-    else if(gblRow == 2 || gblRow == 9) {
+    } else if (gblRow == 2 || gblRow == 9) {
       A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow), Teuchos::tuple<SC>(one));
-    }
-    else if(gblRow == 5) {
-      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple<SC>(negOne, negOne, two, negOne, negOne));
-    }
-    else if(gblRow == 6) {
-      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple<SC>(negOne, two, two, two, negOne));
-    }
-    else {
+    } else if (gblRow == 5) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple<SC>(negOne, negOne, two, negOne, negOne));
+    } else if (gblRow == 6) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple<SC>(negOne, two, two, two, negOne));
+    } else {
       A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple<SC>(negOne, two, negOne));
     }
   }
   A_t->fillComplete();
   RCP<CrsMatrix> A_x = rcp(new TpetraCrsMatrix(A_t));
-  RCP<Matrix> A = rcp(new CrsMatrixWrap(A_x));
+  RCP<Matrix> A      = rcp(new CrsMatrixWrap(A_x));
   fineLevel.Set("A", A);
 
   Teuchos::ParameterList galeriList;
@@ -1461,19 +1456,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala
   const RCP<const Map> myImportMap = graph->GetImportMap();  // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping!
   const RCP<const Map> myDomainMap = graph->GetDomainMap();
 
-  TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1);
+  TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices - 1);
   TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0);
   TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0);
   TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as<size_t>(globalIndices + (comm->getSize() - 1) * 2));
 
-  TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1);
+  TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices - 1);
   TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0);
   TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0);
   TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices);
 
   TEST_EQUALITY(graph->GetGlobalNumEdges(), 28);
 
-  int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28};
+  int rows[13]    = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28};
   int columns[28] = {0, 1,
                      0, 1,
                      2,
@@ -1486,23 +1481,23 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalScaledCut, Scala
                      9,
                      10, 11,
                      10, 11};
-  auto rowPtrs = graph->getRowPtrs();
-  auto entries = graph->getEntries();
-  size_t rowID = 0;
+  auto rowPtrs    = graph->getRowPtrs();
+  auto entries    = graph->getEntries();
+  size_t rowID    = 0;
   TEST_EQUALITY(rowPtrs(0), rowID);
-  for(size_t i = 0; i < rowPtrs.size()-1; i++) {
+  for (size_t i = 0; i < rowPtrs.size() - 1; i++) {
     auto gblID = myDomainMap->getGlobalElement(i);
-    int rownnz = rows[gblID+1]-rows[gblID];
+    int rownnz = rows[gblID + 1] - rows[gblID];
     rowID += rownnz;
-    TEST_EQUALITY(rowPtrs(i+1), rowID);
+    TEST_EQUALITY(rowPtrs(i + 1), rowID);
 
     std::vector<int> colID;
-    for(int j = 0; j < rownnz; j++) {
-      colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j)));
+    for (int j = 0; j < rownnz; j++) {
+      colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i) + j)));
     }
     std::sort(std::begin(colID), std::end(colID));
-    for(int j = 0; j < rownnz; j++) {
-      TEST_EQUALITY(colID[j], columns[rows[gblID]+j]);
+    for (int j = 0; j < rownnz; j++) {
+      TEST_EQUALITY(colID[j], columns[rows[gblID] + j]);
     }
   }
 }  // ClassicalScaledCut
@@ -1525,36 +1520,31 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca
   TestHelpers::TestFactory<SC, LO, GO, NO>::createSingleLevelHierarchy(fineLevel);
 
   const global_size_t globalIndices = 12;
-  const GO indexBase = 0;
-  RCP<const map_type> map = rcp(new map_type(globalIndices, indexBase, comm));
+  const GO indexBase                = 0;
+  RCP<const map_type> map           = rcp(new map_type(globalIndices, indexBase, comm));
   RCP<crs_matrix_type> A_t(new crs_matrix_type(map, 5));
-  const SC two = static_cast<SC>(2.0);
-  const SC one = static_cast<SC>(1.0);
+  const SC two    = static_cast<SC>(2.0);
+  const SC one    = static_cast<SC>(1.0);
   const SC negOne = static_cast<SC>(-1.0);
-  for(LO lclRow = 0; lclRow < static_cast<LO> (map->getLocalNumElements()); lclRow++) {
+  for (LO lclRow = 0; lclRow < static_cast<LO>(map->getLocalNumElements()); lclRow++) {
     const GO gblRow = map->getGlobalElement(lclRow);
-    if(gblRow == 0) {
+    if (gblRow == 0) {
       A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow, gblRow + 1), Teuchos::tuple<SC>(two, negOne));
-    }
-    else if(static_cast<Tpetra::global_size_t>(gblRow) == globalIndices - 1) {
+    } else if (static_cast<Tpetra::global_size_t>(gblRow) == globalIndices - 1) {
       A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 1, gblRow), Teuchos::tuple<SC>(negOne, two));
-    }
-    else if(gblRow == 2 || gblRow == 9) {
+    } else if (gblRow == 2 || gblRow == 9) {
       A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow), Teuchos::tuple<SC>(one));
-    }
-    else if(gblRow == 5) {
-      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple<SC>(negOne, negOne, two, negOne, negOne));
-    }
-    else if(gblRow == 6) {
-      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow-2, gblRow-1, gblRow, gblRow+1, gblRow+2), Teuchos::tuple<SC>(negOne, two, two, two, negOne));
-    }
-    else {
+    } else if (gblRow == 5) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple<SC>(negOne, negOne, two, negOne, negOne));
+    } else if (gblRow == 6) {
+      A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 2, gblRow - 1, gblRow, gblRow + 1, gblRow + 2), Teuchos::tuple<SC>(negOne, two, two, two, negOne));
+    } else {
       A_t->insertGlobalValues(gblRow, Teuchos::tuple<GO>(gblRow - 1, gblRow, gblRow + 1), Teuchos::tuple<SC>(negOne, two, negOne));
     }
   }
   A_t->fillComplete();
   RCP<CrsMatrix> A_x = rcp(new TpetraCrsMatrix(A_t));
-  RCP<Matrix> A = rcp(new CrsMatrixWrap(A_x));
+  RCP<Matrix> A      = rcp(new CrsMatrixWrap(A_x));
   fineLevel.Set("A", A);
 
   Teuchos::ParameterList galeriList;
@@ -1584,19 +1574,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca
   const RCP<const Map> myImportMap = graph->GetImportMap();  // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping!
   const RCP<const Map> myDomainMap = graph->GetDomainMap();
 
-  TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices-1);
+  TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), globalIndices - 1);
   TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0);
   TEST_EQUALITY(myImportMap->getMinLocalIndex(), 0);
   TEST_EQUALITY(myImportMap->getGlobalNumElements(), Teuchos::as<size_t>(globalIndices + (comm->getSize() - 1) * 2));
 
-  TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices-1);
+  TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), globalIndices - 1);
   TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0);
   TEST_EQUALITY(myDomainMap->getMinLocalIndex(), 0);
   TEST_EQUALITY(myDomainMap->getGlobalNumElements(), globalIndices);
 
   TEST_EQUALITY(graph->GetGlobalNumEdges(), 28);
 
-  int rows[13] = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28};
+  int rows[13]    = {0, 2, 4, 5, 7, 10, 15, 18, 21, 23, 24, 26, 28};
   int columns[28] = {0, 1,
                      0, 1,
                      2,
@@ -1609,23 +1599,23 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CoalesceDropFactory, ClassicalUnScaledCut, Sca
                      9,
                      10, 11,
                      10, 11};
-  auto rowPtrs = graph->getRowPtrs();
-  auto entries = graph->getEntries();
-  size_t rowID = 0;
+  auto rowPtrs    = graph->getRowPtrs();
+  auto entries    = graph->getEntries();
+  size_t rowID    = 0;
   TEST_EQUALITY(rowPtrs(0), rowID);
-  for(size_t i = 0; i < rowPtrs.size()-1; i++) {
+  for (size_t i = 0; i < rowPtrs.size() - 1; i++) {
     auto gblID = myDomainMap->getGlobalElement(i);
-    int rownnz = rows[gblID+1]-rows[gblID];
+    int rownnz = rows[gblID + 1] - rows[gblID];
     rowID += rownnz;
-    TEST_EQUALITY(rowPtrs(i+1), rowID);
+    TEST_EQUALITY(rowPtrs(i + 1), rowID);
 
     std::vector<int> colID;
-    for(int j = 0; j < rownnz; j++) {
-      colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i)+j)));
+    for (int j = 0; j < rownnz; j++) {
+      colID.push_back(myImportMap->getGlobalElement(entries(rowPtrs(i) + j)));
     }
     std::sort(std::begin(colID), std::end(colID));
-    for(int j = 0; j < rownnz; j++) {
-      TEST_EQUALITY(colID[j], columns[rows[gblID]+j]);
+    for (int j = 0; j < rownnz; j++) {
+      TEST_EQUALITY(colID[j], columns[rows[gblID] + j]);
     }
   }
 }  // ClassicalUnScaledCut

From 9b5fd842f76fdd9bb45d7a62aa8f0e10568f4e52 Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Mon, 4 Nov 2024 11:05:14 -0700
Subject: [PATCH 18/25] Add default shell setting to CodeQL job

Add a default shell setting to the CodeQL job so that its `run` steps
use a `bash -l` login shell, and remove the shell settings and
`bash -lc` wrappers that were previously specified on individual steps.

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index b739518db9ef..82c7df08f3e9 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -35,19 +35,20 @@ jobs:
       # only required for workflows in private repositories
       actions: read
       contents: read
-
     strategy:
       fail-fast: false
       matrix:
         include:
         - language: c-cpp
           build-mode: manual
+    defaults:
+      run:
+        shell: bash -l
 
     steps:
     - name: Checkout repository
       uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
 
-    # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
       uses: github/codeql-action/init@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8
       with:
@@ -65,33 +66,31 @@ jobs:
         env
 
     - name: Module list
-      shell: bash -l {0}
       run: |
         module list
         printenv PATH
 
     - if: matrix.build-mode == 'manual'
       name: Get dependencies
+      working-directory: ${GITHUB_WORKSPACE}/packages/framework
       run: |
-          bash -lc "${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container"
+          ./get_dependencies.sh --container
+          ${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container
 
     - if: matrix.build-mode == 'manual'
       name: Generate CMake fragments
-      shell: bash -lc {0}
       run: |
           git fetch origin ${GITHUB_BASE_REF}
 
           mkdir -p trilinos_build && cd trilinos_build
 
           source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh --force --cmake-fragment genconfig_fragment.cmake rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables
-          bash -lc "${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake"
+          ${GITHUB_WORKSPACE}/commonTools/framework/get-changed-trilinos-packages.sh origin/${GITHUB_BASE_REF} HEAD package_enables.cmake package_subprojects.cmake
 
     - if: matrix.build-mode == 'manual'
       name: Configure and build Trilinos
-      shell: bash -lc {0}
+      working-directory: ./trilinos_build
       run: |
-          cd trilinos_build
-
           cmake -C genconfig_fragment.cmake -C package_enables.cmake \
             -DTrilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES=OFF \
             -DTrilinos_ENABLE_ALL_OPTIONAL_PACKAGES=OFF \

From 6c999fcf300f274adb2e479671f7f3410e84c0b7 Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Mon, 4 Nov 2024 11:12:39 -0700
Subject: [PATCH 19/25] Fix bash argument syntax

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 82c7df08f3e9..4ddc84a920b7 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -43,7 +43,7 @@ jobs:
           build-mode: manual
     defaults:
       run:
-        shell: bash -l
+        shell: bash -l {0}
 
     steps:
     - name: Checkout repository

From c322f5a454f59bf0c3048df3cdd72ca4e7f1ba26 Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Mon, 4 Nov 2024 11:19:40 -0700
Subject: [PATCH 20/25] Fix working-directory workflow setting

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4ddc84a920b7..6daaeb02e030 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -72,10 +72,9 @@ jobs:
 
     - if: matrix.build-mode == 'manual'
       name: Get dependencies
-      working-directory: ${GITHUB_WORKSPACE}/packages/framework
+      working-directory: ./packages/framework
       run: |
           ./get_dependencies.sh --container
-          ${GITHUB_WORKSPACE}/packages/framework/get_dependencies.sh --container
 
     - if: matrix.build-mode == 'manual'
       name: Generate CMake fragments

From f7fdee0000c9d9d81c2ff4affb22aa740c20a2be Mon Sep 17 00:00:00 2001
From: Anderson Chauphan <achauph@sandia.gov>
Date: Mon, 4 Nov 2024 12:49:18 -0700
Subject: [PATCH 21/25] Add workflow concurrency cancel and rename workflow

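Add a workflow concurrency setting that cancels any in-progress runs
of this workflow associated with the same PR.

Also rename the workflow to "CodeQL Security Scan" and remove the
boilerplate header comment from the top of the file.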

Signed-off-by: Anderson Chauphan <achauph@sandia.gov>
---
 .github/workflows/codeql.yml | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 6daaeb02e030..fff932c18dc7 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -1,25 +1,20 @@
-# For most projects, this workflow file will not need changing; you simply need
-# to commit it to your repository.
-#
-# You may wish to alter this file to override the set of languages analyzed,
-# or to provide custom queries or build logic.
-#
-# ******** NOTE ********
-# We have attempted to detect the languages in your repository. Please check
-# the `language` matrix defined below to confirm you have the correct set of
-# supported CodeQL languages.
-#
-name: "CodeQL: Linear Solvers"
+name: "CodeQL Security Scan"
 
 on:
   pull_request:
-    branches: [ "develop" ]
+    branches:
+      - develop
     types:
       - opened
       - synchronize
   schedule:
     - cron: '41 23 * * 2'
 
+# Cancels any in progress workflows associated with this PR
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 permissions:
   contents: read
 

From 127a471342446f90dcbd614596f6a8da8adc5738 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 5 Nov 2024 07:51:38 +0000
Subject: [PATCH 22/25] Bump actions/checkout from 4.2.1 to 4.2.2

Bumps [actions/checkout](https://github.com/actions/checkout) from 4.2.1 to 4.2.2.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871...11bd71901bbe5b1630ceea73d27597364c9af683)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/AT2.yml                   | 8 ++++----
 .github/workflows/clang_format.yml          | 2 +-
 .github/workflows/codeql.yml                | 2 +-
 .github/workflows/dependency-review.yml     | 2 +-
 .github/workflows/detect-git-lfs.yml        | 2 +-
 .github/workflows/detect-mpi-comm-world.yml | 2 +-
 .github/workflows/per-commit.yml            | 2 +-
 .github/workflows/scorecards.yml            | 2 +-
 .github/workflows/spack.yml                 | 2 +-
 9 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/AT2.yml b/.github/workflows/AT2.yml
index b232051eddf2..c085620db33a 100644
--- a/.github/workflows/AT2.yml
+++ b/.github/workflows/AT2.yml
@@ -60,7 +60,7 @@ jobs:
           mkdir -p /home/Trilinos/src/Trilinos
           mkdir -p /home/Trilinos/build
       - name: Clone trilinos
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
       - name: Repo status
@@ -151,7 +151,7 @@ jobs:
           mkdir -p /home/Trilinos/src/Trilinos
           mkdir -p /home/Trilinos/build
       - name: Clone trilinos
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
       - name: Repo status
@@ -242,7 +242,7 @@ jobs:
           mkdir -p /home/Trilinos/src/Trilinos
           mkdir -p /home/Trilinos/build
       - name: Clone trilinos
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
       - name: Repo status
@@ -334,7 +334,7 @@ jobs:
           mkdir -p /home/Trilinos/src/Trilinos
           mkdir -p /home/Trilinos/build
       - name: Clone trilinos
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
       - name: Repo status
diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml
index a3fd0968ad75..d0b7392226a0 100644
--- a/.github/workflows/clang_format.yml
+++ b/.github/workflows/clang_format.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
     - uses: DoozyX/clang-format-lint-action@c71d0bf4e21876ebec3e5647491186f8797fde31 # v0.18.2
       with:
         source: './packages/muelu ./packages/tempus ./packages/teko ./packages/xpetra'
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4139508fa42b..3ee521f94e90 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -58,7 +58,7 @@ jobs:
         # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
     steps:
     - name: Checkout repository
-      uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
index bf29beac76d5..955b3b3fb2d0 100644
--- a/.github/workflows/dependency-review.yml
+++ b/.github/workflows/dependency-review.yml
@@ -22,6 +22,6 @@ jobs:
           egress-policy: audit
 
       - name: 'Checkout Repository'
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: 'Dependency Review'
         uses: actions/dependency-review-action@4081bf99e2866ebe428fc0477b69eb4fcda7220a # v4.4.0
diff --git a/.github/workflows/detect-git-lfs.yml b/.github/workflows/detect-git-lfs.yml
index ebe778088863..68595577ec7c 100644
--- a/.github/workflows/detect-git-lfs.yml
+++ b/.github/workflows/detect-git-lfs.yml
@@ -12,7 +12,7 @@ jobs:
 
     steps:
     - name: Check out code
-      uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       with:
         fetch-depth: 0
 
diff --git a/.github/workflows/detect-mpi-comm-world.yml b/.github/workflows/detect-mpi-comm-world.yml
index 1fd6790c8c86..e85d71db2f6a 100644
--- a/.github/workflows/detect-mpi-comm-world.yml
+++ b/.github/workflows/detect-mpi-comm-world.yml
@@ -12,7 +12,7 @@ jobs:
 
     steps:
     - name: Check out code
-      uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       with:
         fetch-depth: 0
 
diff --git a/.github/workflows/per-commit.yml b/.github/workflows/per-commit.yml
index 3f619a7dbbc0..80dfc8b94008 100644
--- a/.github/workflows/per-commit.yml
+++ b/.github/workflows/per-commit.yml
@@ -12,7 +12,7 @@ jobs:
 
     steps:
     - name: Check out code
-      uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       with:
         fetch-depth: 0
 
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 46a2c4571aff..1ac917d3af8a 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -31,7 +31,7 @@ jobs:
 
     steps:
       - name: "Checkout code"
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           persist-credentials: false
 
diff --git a/.github/workflows/spack.yml b/.github/workflows/spack.yml
index 59976c1d9b3e..3c3c01b75849 100644
--- a/.github/workflows/spack.yml
+++ b/.github/workflows/spack.yml
@@ -24,7 +24,7 @@ jobs:
     runs-on: [self-hosted, gcc-10.3.0_openmpi-4.1.6]
     steps:
       - name: Clone Trilinos
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 1
       - name: Spack build

From 0b3f08df7492cb9c580b902b862bfcf07097a1f1 Mon Sep 17 00:00:00 2001
From: maxfirmbach <max@firmbach.com>
Date: Mon, 4 Nov 2024 10:01:09 -0700
Subject: [PATCH 23/25] Make AggregateQualityFactory a transfer factory

Signed-off-by: maxfirmbach <max@firmbach.com>
---
 .../MueLu_NotayAggregationFactory_def.hpp     |  7 ---
 .../MueLu_UncoupledAggregationFactory_def.hpp | 10 -----
 .../MueLu_ParameterListInterpreter_def.hpp    | 43 ++++++++++---------
 ...u_AggregateQualityEstimateFactory_decl.hpp | 13 +++---
 ...Lu_AggregateQualityEstimateFactory_def.hpp | 26 +++++------
 .../aggregatequalities.xml                    |  7 +++
 .../aggregatequalities.xml                    |  6 +--
 .../Output/aggregatequalities_epetra.gold     | 20 ++++-----
 .../Output/aggregatequalities_tpetra.gold     | 20 ++++-----
 .../AggregateQualityEstimateFactory.cpp       | 37 ++++++++++------
 10 files changed, 98 insertions(+), 91 deletions(-)
 rename packages/muelu/src/{Misc => Utils}/MueLu_AggregateQualityEstimateFactory_decl.hpp (88%)
 rename packages/muelu/src/{Misc => Utils}/MueLu_AggregateQualityEstimateFactory_def.hpp (96%)
 create mode 100644 packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml

diff --git a/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp b/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp
index b432ffb1d868..40f4635e0b3d 100644
--- a/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp
+++ b/packages/muelu/src/Graph/PairwiseAggregation/MueLu_NotayAggregationFactory_def.hpp
@@ -55,7 +55,6 @@ RCP<const ParameterList> NotayAggregationFactory<Scalar, LocalOrdinal, GlobalOrd
 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
   SET_VALID_ENTRY("aggregation: pairwise: size");
   SET_VALID_ENTRY("aggregation: pairwise: tie threshold");
-  SET_VALID_ENTRY("aggregation: compute aggregate qualities");
   SET_VALID_ENTRY("aggregation: Dirichlet threshold");
   SET_VALID_ENTRY("aggregation: ordering");
 #undef SET_VALID_ENTRY
@@ -64,21 +63,15 @@ RCP<const ParameterList> NotayAggregationFactory<Scalar, LocalOrdinal, GlobalOrd
   validParamList->set<RCP<const FactoryBase>>("A", null, "Generating factory of the matrix");
   validParamList->set<RCP<const FactoryBase>>("Graph", null, "Generating factory of the graph");
   validParamList->set<RCP<const FactoryBase>>("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'");
-  validParamList->set<RCP<const FactoryBase>>("AggregateQualities", null, "Generating factory for variable \'AggregateQualities\'");
 
   return validParamList;
 }
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 void NotayAggregationFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level& currentLevel) const {
-  const ParameterList& pL = GetParameterList();
-
   Input(currentLevel, "A");
   Input(currentLevel, "Graph");
   Input(currentLevel, "DofsPerNode");
-  if (pL.get<bool>("aggregation: compute aggregate qualities")) {
-    Input(currentLevel, "AggregateQualities");
-  }
 }
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
diff --git a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp
index fdbb1106294c..386451d1cfc3 100644
--- a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp
+++ b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp
@@ -75,14 +75,12 @@ RCP<const ParameterList> UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal
   SET_VALID_ENTRY("aggregation: use interface aggregation");
   SET_VALID_ENTRY("aggregation: error on nodes with no on-rank neighbors");
   SET_VALID_ENTRY("aggregation: phase3 avoid singletons");
-  SET_VALID_ENTRY("aggregation: compute aggregate qualities");
   SET_VALID_ENTRY("aggregation: phase 1 algorithm");
 #undef SET_VALID_ENTRY
 
   // general variables needed in AggregationFactory
   validParamList->set<RCP<const FactoryBase>>("Graph", null, "Generating factory of the graph");
   validParamList->set<RCP<const FactoryBase>>("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'");
-  validParamList->set<RCP<const FactoryBase>>("AggregateQualities", null, "Generating factory for variable \'AggregateQualities\'");
 
   // special variables necessary for OnePtAggregationAlgorithm
   validParamList->set<std::string>("OnePt aggregate map name", "", "Name of input map for single node aggregates. (default='')");
@@ -131,10 +129,6 @@ void UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node>::DeclareInpu
       Input(currentLevel, "nodeOnInterface");
     }
   }
-
-  if (pL.get<bool>("aggregation: compute aggregate qualities")) {
-    Input(currentLevel, "AggregateQualities");
-  }
 }
 
 template <class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -375,10 +369,6 @@ void UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node>::Build(Level
   aggregates->ComputeAggregateSizes(true /*forceRecompute*/);
 
   Set(currentLevel, "Aggregates", aggregates);
-
-  if (pL.get<bool>("aggregation: compute aggregate qualities")) {
-    RCP<Xpetra::MultiVector<DefaultScalar, LO, GO, Node>> aggQualities = Get<RCP<Xpetra::MultiVector<DefaultScalar, LO, GO, Node>>>(currentLevel, "AggregateQualities");
-  }
 }
 
 template <class LocalOrdinal, class GlobalOrdinal, class Node>
diff --git a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp
index e46d286abb90..207791bf5b5b 100644
--- a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp
+++ b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp
@@ -1098,7 +1098,6 @@ void ParameterListInterpreter<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
     MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: preserve Dirichlet points", bool, aggParams);
     MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: error on nodes with no on-rank neighbors", bool, aggParams);
     MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: phase3 avoid singletons", bool, aggParams);
-    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, aggParams);
     aggFactory->SetParameterList(aggParams);
     // make sure that the aggregation factory has all necessary data
     aggFactory->SetFactory("DofsPerNode", manager.GetFactory("Graph"));
@@ -1180,7 +1179,6 @@ void ParameterListInterpreter<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
     MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: pairwise: tie threshold", double, aggParams);
     MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: Dirichlet threshold", double, aggParams);
     MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: ordering", std::string, aggParams);
-    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, aggParams);
     aggFactory->SetParameterList(aggParams);
     aggFactory->SetFactory("DofsPerNode", manager.GetFactory("Graph"));
     aggFactory->SetFactory("Graph", manager.GetFactory("Graph"));
@@ -1200,25 +1198,6 @@ void ParameterListInterpreter<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   coarseMap->SetFactory("Aggregates", manager.GetFactory("Aggregates"));
   manager.SetFactory("CoarseMap", coarseMap);
 
-  // Aggregate qualities
-  if (MUELU_TEST_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, true)) {
-    RCP<Factory> aggQualityFact = rcp(new AggregateQualityEstimateFactory());
-    ParameterList aggQualityParams;
-    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: good aggregate threshold", double, aggQualityParams);
-    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file output", bool, aggQualityParams);
-    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file base", std::string, aggQualityParams);
-    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: check symmetry", bool, aggQualityParams);
-    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: algorithm", std::string, aggQualityParams);
-    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: zero threshold", double, aggQualityParams);
-    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: percentiles", Teuchos::Array<double>, aggQualityParams);
-    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: mode", std::string, aggQualityParams);
-    aggQualityFact->SetParameterList(aggQualityParams);
-    manager.SetFactory("AggregateQualities", aggQualityFact);
-
-    assert(aggType == "uncoupled");
-    aggFactory->SetFactory("AggregateQualities", aggQualityFact);
-  }
-
   // Tentative P
   MUELU_KOKKOS_FACTORY(Ptent, TentativePFactory, TentativePFactory_kokkos);
   ParameterList ptentParams;
@@ -1319,6 +1298,28 @@ void ParameterListInterpreter<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
       RAPs->SetFactory("R", manager.GetFactory("R"));
   }
 
+  // Aggregate qualities
+  if (MUELU_TEST_PARAM_2LIST(paramList, defaultList, "aggregation: compute aggregate qualities", bool, true)) {
+    RCP<Factory> aggQualityFact = rcp(new AggregateQualityEstimateFactory());
+    ParameterList aggQualityParams;
+    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: good aggregate threshold", double, aggQualityParams);
+    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file output", bool, aggQualityParams);
+    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: file base", std::string, aggQualityParams);
+    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: check symmetry", bool, aggQualityParams);
+    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: algorithm", std::string, aggQualityParams);
+    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: zero threshold", double, aggQualityParams);
+    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: percentiles", Teuchos::Array<double>, aggQualityParams);
+    MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregate qualities: mode", std::string, aggQualityParams);
+    aggQualityFact->SetParameterList(aggQualityParams);
+    aggQualityFact->SetFactory("Aggregates", manager.GetFactory("Aggregates"));
+    aggQualityFact->SetFactory("CoarseMap", manager.GetFactory("CoarseMap"));
+
+    if (!RAP.is_null())
+      RAP->AddTransferFactory(aggQualityFact);
+    else
+      RAPs->AddTransferFactory(aggQualityFact);
+  }
+
   if (MUELU_TEST_PARAM_2LIST(paramList, defaultList, "aggregation: export visualization data", bool, true)) {
     RCP<AggregationExportFactory> aggExport = rcp(new AggregationExportFactory());
     ParameterList aggExportParams;
diff --git a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_decl.hpp b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_decl.hpp
similarity index 88%
rename from packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_decl.hpp
rename to packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_decl.hpp
index be87ec960139..473ad53ce0bf 100644
--- a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_decl.hpp
+++ b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_decl.hpp
@@ -11,7 +11,7 @@
 #define MUELU_AGGREGATEQUALITYESTIMATEFACTORY_DECL_HPP
 
 #include "MueLu_ConfigDefs.hpp"
-#include "MueLu_SingleLevelFactoryBase.hpp"
+#include "MueLu_TwoLevelFactoryBase.hpp"
 #include "MueLu_AggregateQualityEstimateFactory_fwd.hpp"
 
 #include <Xpetra_Matrix_fwd.hpp>
@@ -41,8 +41,11 @@ namespace MueLu {
   computing, 34(2), A1079-A1109.
 */
 
-template <class Scalar = double, class LocalOrdinal = int, class GlobalOrdinal = LocalOrdinal, class Node = Tpetra::KokkosClassic::DefaultNode::DefaultNodeType>
-class AggregateQualityEstimateFactory : public SingleLevelFactoryBase {
+template <class Scalar        = DefaultScalar,
+          class LocalOrdinal  = DefaultLocalOrdinal,
+          class GlobalOrdinal = DefaultGlobalOrdinal,
+          class Node          = DefaultNode>
+class AggregateQualityEstimateFactory : public TwoLevelFactoryBase {
 #undef MUELU_AGGREGATEQUALITYESTIMATEFACTORY_SHORT
 #include "MueLu_UseShortNames.hpp"
 
@@ -70,7 +73,7 @@ class AggregateQualityEstimateFactory : public SingleLevelFactoryBase {
     If the Build method of this class requires some data, but the generating factory is not specified in DeclareInput, then this class
     will fall back to the settings in FactoryManager.
   */
-  void DeclareInput(Level& currentLevel) const;
+  void DeclareInput(Level& fineLevel, Level& coarseLevel) const;
 
   //@}
 
@@ -78,7 +81,7 @@ class AggregateQualityEstimateFactory : public SingleLevelFactoryBase {
   //@{
 
   //! Build aggregate quality esimates with this factory.
-  void Build(Level& currentLevel) const;
+  void Build(Level& fineLevel, Level& coarseLevel) const;
 
   //@}
 
diff --git a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_def.hpp b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_def.hpp
similarity index 96%
rename from packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_def.hpp
rename to packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_def.hpp
index c2c288192214..e7a2943d9969 100644
--- a/packages/muelu/src/Misc/MueLu_AggregateQualityEstimateFactory_def.hpp
+++ b/packages/muelu/src/Utils/MueLu_AggregateQualityEstimateFactory_def.hpp
@@ -34,10 +34,10 @@ template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 AggregateQualityEstimateFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::~AggregateQualityEstimateFactory() {}
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-void AggregateQualityEstimateFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level& currentLevel) const {
-  Input(currentLevel, "A");
-  Input(currentLevel, "Aggregates");
-  Input(currentLevel, "CoarseMap");
+void AggregateQualityEstimateFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level& fineLevel, Level& coarseLevel) const {
+  Input(fineLevel, "A");
+  Input(fineLevel, "Aggregates");
+  Input(fineLevel, "CoarseMap");
 }
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -64,13 +64,13 @@ RCP<const ParameterList> AggregateQualityEstimateFactory<Scalar, LocalOrdinal, G
 }
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-void AggregateQualityEstimateFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
-  FactoryMonitor m(*this, "Build", currentLevel);
+void AggregateQualityEstimateFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& fineLevel, Level& coarseLevel) const {
+  FactoryMonitor m(*this, "Build", fineLevel);
 
-  RCP<Matrix> A              = Get<RCP<Matrix>>(currentLevel, "A");
-  RCP<Aggregates> aggregates = Get<RCP<Aggregates>>(currentLevel, "Aggregates");
+  RCP<Matrix> A              = Get<RCP<Matrix>>(fineLevel, "A");
+  RCP<Aggregates> aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
 
-  RCP<const Map> map = Get<RCP<const Map>>(currentLevel, "CoarseMap");
+  RCP<const Map> map = Get<RCP<const Map>>(fineLevel, "CoarseMap");
 
   assert(!aggregates->AggregatesCrossProcessors());
   ParameterList pL = GetParameterList();
@@ -81,15 +81,15 @@ void AggregateQualityEstimateFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>:
   if (mode == "eigenvalue" || mode == "both") {
     aggregate_qualities = Xpetra::MultiVectorFactory<magnitudeType, LO, GO, NO>::Build(map, 1);
     ComputeAggregateQualities(A, aggregates, aggregate_qualities);
-    OutputAggQualities(currentLevel, aggregate_qualities);
+    OutputAggQualities(fineLevel, aggregate_qualities);
   }
   if (mode == "size" || mode == "both") {
     RCP<LocalOrdinalVector> aggregate_sizes = Xpetra::VectorFactory<LO, LO, GO, NO>::Build(map);
     ComputeAggregateSizes(A, aggregates, aggregate_sizes);
-    Set(currentLevel, "AggregateSizes", aggregate_sizes);
-    OutputAggSizes(currentLevel, aggregate_sizes);
+    Set(fineLevel, "AggregateSizes", aggregate_sizes);
+    OutputAggSizes(fineLevel, aggregate_sizes);
   }
-  Set(currentLevel, "AggregateQualities", aggregate_qualities);
+  Set(coarseLevel, "AggregateQualities", aggregate_qualities);
 }
 
 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
diff --git a/packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml b/packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml
new file mode 100644
index 000000000000..f732f2a3c9b5
--- /dev/null
+++ b/packages/muelu/test/interface/default/EasyParameterListInterpreter/aggregatequalities.xml
@@ -0,0 +1,7 @@
+<ParameterList name="MueLu">
+  <Parameter name="aggregation: drop scheme"                      type="string" value="distance laplacian"/>
+  <Parameter name="aggregation: drop tol"                         type="double" value="0.05"/>
+  <Parameter name="aggregation: compute aggregate qualities"      type="bool"   value="true"/>
+  <Parameter name="aggregate qualities: check symmetry"           type="bool"   value="false"/>
+  <Parameter name="aggregate qualities: good aggregate threshold" type="double" value="100.0"/>
+</ParameterList>
diff --git a/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml b/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml
index b36abd859cdd..56565e5f4de7 100644
--- a/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml
+++ b/packages/muelu/test/interface/default/FactoryParameterListInterpreter/aggregatequalities.xml
@@ -47,6 +47,7 @@
       <Parameter name="factory"                             type="string"   value="RAPFactory"/>
 
       <ParameterList name="TransferFactories">
+        <Parameter name="For AggregateQualities"            type="string"   value="myAggregateQualityFact"/>
         <Parameter name="For Coordinates"                   type="string"   value="myTransferCoordinatesFact"/>
       </ParameterList>
 
@@ -58,15 +59,14 @@
 
     <ParameterList name="All">
       <Parameter name="startLevel"                          type="int"      value="0"/>
-
       <Parameter name="A"                                   type="string"   value="myRAPFact"/>
-      <Parameter name="Coordinates"                         type="string"   value="myTransferCoordinatesFact"/>
       <Parameter name="DofsPerNode"                         type="string"   value="myCoalesceDropFact"/>
       <Parameter name="Graph"                               type="string"   value="myCoalesceDropFact"/>
       <Parameter name="P"                                   type="string"   value="myPFact"/>
       <Parameter name="Aggregates"                          type="string"   value="myAggregateFact"/>
-      <Parameter name="CoarseMap"                           type="string"   value="myCoarseMapFact"/>
       <Parameter name="AggregateQualities"                  type="string"   value="myAggregateQualityFact"/>
+      <Parameter name="CoarseMap"                           type="string"   value="myCoarseMapFact"/>
+      <Parameter name="Coordinates"                         type="string"   value="myTransferCoordinatesFact"/>
     </ParameterList>
 
   </ParameterList>
diff --git a/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold b/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold
index 3714e69e8895..5d4a2e452dab 100644
--- a/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold
+++ b/packages/muelu/test/interface/default/Output/aggregatequalities_epetra.gold
@@ -26,13 +26,9 @@ BuildAggregatesNonKokkos (Phase 1 (main))
 BuildAggregatesNonKokkos (Phase 2a (secondary))
 BuildAggregatesNonKokkos (Phase 2b (expansion))
 BuildAggregatesNonKokkos (Phase 3 (cleanup))
-Build (MueLu::AggregateQualityEstimateFactory)
-Build (MueLu::CoarseMapFactory)
 Nullspace factory (MueLu::NullspaceFactory)
 Fine level nullspace = Nullspace
-aggregate qualities: good aggregate threshold = 100   [unused]
-aggregate qualities: check symmetry = 0   [unused]
-aggregation: compute aggregate qualities = 1
+Build (MueLu::CoarseMapFactory)
 matrixmatrix: kernel params -> 
  [empty list]
 matrixmatrix: kernel params -> 
@@ -41,6 +37,10 @@ Transpose P (MueLu::TransPFactory)
 matrixmatrix: kernel params -> 
  [empty list]
 Computing Ac (MueLu::RAPFactory)
+RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory
+Build (MueLu::AggregateQualityEstimateFactory)
+aggregate qualities: good aggregate threshold = 100   [unused]
+aggregate qualities: check symmetry = 0   [unused]
 RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory
 Build (MueLu::CoordinatesTransferFactory)
 Transferring coordinates
@@ -71,13 +71,9 @@ BuildAggregatesNonKokkos (Phase 1 (main))
 BuildAggregatesNonKokkos (Phase 2a (secondary))
 BuildAggregatesNonKokkos (Phase 2b (expansion))
 BuildAggregatesNonKokkos (Phase 3 (cleanup))
-Build (MueLu::AggregateQualityEstimateFactory)
-Build (MueLu::CoarseMapFactory)
 Nullspace factory (MueLu::NullspaceFactory)
 Fine level nullspace = Nullspace
-aggregate qualities: good aggregate threshold = 100   [unused]
-aggregate qualities: check symmetry = 0   [unused]
-aggregation: compute aggregate qualities = 1
+Build (MueLu::CoarseMapFactory)
 matrixmatrix: kernel params -> 
  [empty list]
 matrixmatrix: kernel params -> 
@@ -86,6 +82,10 @@ Transpose P (MueLu::TransPFactory)
 matrixmatrix: kernel params -> 
  [empty list]
 Computing Ac (MueLu::RAPFactory)
+RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory
+Build (MueLu::AggregateQualityEstimateFactory)
+aggregate qualities: good aggregate threshold = 100   [unused]
+aggregate qualities: check symmetry = 0   [unused]
 RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory
 Build (MueLu::CoordinatesTransferFactory)
 Transferring coordinates
diff --git a/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold b/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold
index ef6897802897..4c9b7d57f952 100644
--- a/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold
+++ b/packages/muelu/test/interface/default/Output/aggregatequalities_tpetra.gold
@@ -27,13 +27,9 @@ BuildAggregatesNonKokkos (Phase 1 (main))
 BuildAggregatesNonKokkos (Phase 2a (secondary))
 BuildAggregatesNonKokkos (Phase 2b (expansion))
 BuildAggregatesNonKokkos (Phase 3 (cleanup))
-Build (MueLu::AggregateQualityEstimateFactory)
-Build (MueLu::CoarseMapFactory)
 Nullspace factory (MueLu::NullspaceFactory)
 Fine level nullspace = Nullspace
-aggregate qualities: good aggregate threshold = 100   [unused]
-aggregate qualities: check symmetry = 0   [unused]
-aggregation: compute aggregate qualities = 1
+Build (MueLu::CoarseMapFactory)
 matrixmatrix: kernel params -> 
  [empty list]
 matrixmatrix: kernel params -> 
@@ -42,6 +38,10 @@ Transpose P (MueLu::TransPFactory)
 matrixmatrix: kernel params -> 
  [empty list]
 Computing Ac (MueLu::RAPFactory)
+RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory
+Build (MueLu::AggregateQualityEstimateFactory)
+aggregate qualities: good aggregate threshold = 100   [unused]
+aggregate qualities: check symmetry = 0   [unused]
 RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory
 Build (MueLu::CoordinatesTransferFactory)
 Transferring coordinates
@@ -73,13 +73,9 @@ BuildAggregatesNonKokkos (Phase 1 (main))
 BuildAggregatesNonKokkos (Phase 2a (secondary))
 BuildAggregatesNonKokkos (Phase 2b (expansion))
 BuildAggregatesNonKokkos (Phase 3 (cleanup))
-Build (MueLu::AggregateQualityEstimateFactory)
-Build (MueLu::CoarseMapFactory)
 Nullspace factory (MueLu::NullspaceFactory)
 Fine level nullspace = Nullspace
-aggregate qualities: good aggregate threshold = 100   [unused]
-aggregate qualities: check symmetry = 0   [unused]
-aggregation: compute aggregate qualities = 1
+Build (MueLu::CoarseMapFactory)
 matrixmatrix: kernel params -> 
  [empty list]
 matrixmatrix: kernel params -> 
@@ -88,6 +84,10 @@ Transpose P (MueLu::TransPFactory)
 matrixmatrix: kernel params -> 
  [empty list]
 Computing Ac (MueLu::RAPFactory)
+RAPFactory: call transfer factory: MueLu::AggregateQualityEstimateFactory
+Build (MueLu::AggregateQualityEstimateFactory)
+aggregate qualities: good aggregate threshold = 100   [unused]
+aggregate qualities: check symmetry = 0   [unused]
 RAPFactory: call transfer factory: MueLu::CoordinatesTransferFactory
 Build (MueLu::CoordinatesTransferFactory)
 Transferring coordinates
diff --git a/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp b/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp
index 769b47c77c19..dd095e626038 100644
--- a/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp
+++ b/packages/muelu/test/unit_tests/AggregateQualityEstimateFactory.cpp
@@ -90,26 +90,40 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(AggregateQualityEstimateFactory, Poisson2D, Sc
 
   RCP<const Teuchos::Comm<int>> comm = Parameters::getDefaultComm();
 
-  Level level;
-  TestHelpers::TestFactory<Scalar, LO, GO, NO>::createSingleLevelHierarchy(level);
+  Level fineLevel, coarseLevel;
+  TestHelpers::TestFactory<Scalar, LO, GO, NO>::createTwoLevelHierarchy(fineLevel, coarseLevel);
 
   GO nx          = 20 * comm->getSize();
   GO ny          = nx;
   RCP<Matrix> Op = TestHelpers::TestFactory<Scalar, LO, GO, NO>::Build2DPoisson(nx, ny);
-  level.Set("A", Op);
+  fineLevel.Set("A", Op);
 
-  AggregateQualityEstimateFactory aggQualityEstimateFactory;
-  std::cout << *(aggQualityEstimateFactory.GetValidParameterList()) << std::endl;
-  aggQualityEstimateFactory.SetParameter("aggregate qualities: check symmetry", Teuchos::ParameterEntry(false));
-  aggQualityEstimateFactory.SetParameter("aggregate qualities: good aggregate threshold", Teuchos::ParameterEntry(100.0));
-  aggQualityEstimateFactory.SetParameter("aggregate qualities: file output", Teuchos::ParameterEntry(false));
+  RCP<AggregateQualityEstimateFactory> aggQualityEstimateFactory = rcp(new AggregateQualityEstimateFactory());
+  aggQualityEstimateFactory->SetParameter("aggregate qualities: check symmetry", Teuchos::ParameterEntry(false));
+  aggQualityEstimateFactory->SetParameter("aggregate qualities: good aggregate threshold", Teuchos::ParameterEntry(100.0));
+  aggQualityEstimateFactory->SetParameter("aggregate qualities: file output", Teuchos::ParameterEntry(false));
 
-  level.Request("AggregateQualities", &aggQualityEstimateFactory);
-  level.Request(aggQualityEstimateFactory);
+  RCP<AmalgamationFactory> amalgFact = rcp(new AmalgamationFactory());
+  RCP<CoalesceDropFactory> dropFact  = rcp(new CoalesceDropFactory());
+  dropFact->SetFactory("UnAmalgamationInfo", amalgFact);
+  RCP<UncoupledAggregationFactory> aggFact = rcp(new UncoupledAggregationFactory());
+  aggFact->SetFactory("Graph", dropFact);
+  RCP<CoarseMapFactory> coarsemapFact = Teuchos::rcp(new CoarseMapFactory());
+  coarsemapFact->SetFactory("Aggregates", aggFact);
+  aggQualityEstimateFactory->SetFactory("Aggregates", aggFact);
+  aggQualityEstimateFactory->SetFactory("CoarseMap", coarsemapFact);
+
+  coarseLevel.Request(*aggQualityEstimateFactory);
+  fineLevel.Request(*aggFact);
+  fineLevel.Request(*coarsemapFact);
+
+  aggQualityEstimateFactory->Build(fineLevel, coarseLevel);
+
+  coarseLevel.Request("AggregateQualities", aggQualityEstimateFactory.get());
 
   out << "Getting aggregate qualities...\n\n";
 
-  RCP<MultiVectorDouble> aggQualities = level.Get<RCP<MultiVectorDouble>>("AggregateQualities", &aggQualityEstimateFactory);
+  RCP<MultiVectorDouble> aggQualities = coarseLevel.Get<RCP<MultiVectorDouble>>("AggregateQualities", aggQualityEstimateFactory.get());
 
   out << "Testing aggregate qualities to make sure all aggregates are of good quality...\n\n";
 
@@ -536,7 +550,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(AggregateQualityEstimateFactory, ConvectionDif
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory, Constructor, Scalar, LO, GO, Node) \
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory, Poisson2D, Scalar, LO, GO, Node)
 //  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory,AnisotropicDiffusion2D,Scalar,LO,GO,Node)
-
 //  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(AggregateQualityEstimateFactory,ConvectionDiffusion2D,Scalar,LO,GO,Node)
 
 #include <MueLu_ETI_4arg.hpp>

From 4e6e2ca89df63eb0050d822a60b575e2c0ddcd9c Mon Sep 17 00:00:00 2001
From: reuterb <bwreute@sandia.gov>
Date: Wed, 6 Nov 2024 16:40:02 -0700
Subject: [PATCH 24/25] Panzer tangent unit tests (Blocked Gather) (#13576)

Refresh Gather_BlockedTpetra evaluator, put tangent capability on device, and update the unit test.

---------

Signed-off-by: Bryan Reuter <bwreute@sandia.gov>
---
 .../test/evaluator_tests/CMakeLists.txt       |   7 +
 .../tpetra_blocked_gather_solution.cpp        | 721 ++++++++++++++++++
 .../Panzer_GatherSolution_BlockedTpetra.hpp   |  21 +-
 ...nzer_GatherSolution_BlockedTpetra_impl.hpp | 178 +++--
 4 files changed, 853 insertions(+), 74 deletions(-)
 create mode 100644 packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp

diff --git a/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt b/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt
index 3b2202563a32..d871d0375cb0 100644
--- a/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt
+++ b/packages/panzer/adapters-stk/test/evaluator_tests/CMakeLists.txt
@@ -43,6 +43,13 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   NUM_MPI_PROCS 2
   )
 
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  tGatherSolution_BlockedTpetra
+  SOURCES tpetra_blocked_gather_solution.cpp ${UNIT_TEST_DRIVER}
+  COMM serial mpi
+  NUM_MPI_PROCS 2
+  )
+
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
   tScatterResidual_Tpetra
   SOURCES tpetra_scatter_residual.cpp ${UNIT_TEST_DRIVER}
diff --git a/packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp b/packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp
new file mode 100644
index 000000000000..279956f8d6eb
--- /dev/null
+++ b/packages/panzer/adapters-stk/test/evaluator_tests/tpetra_blocked_gather_solution.cpp
@@ -0,0 +1,721 @@
+// @HEADER
+// *****************************************************************************
+//           Panzer: A partial differential equation assembly
+//       engine for strongly coupled complex multiphysics systems
+//
+// Copyright 2011 NTESS and the Panzer contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//  Include Files
+//
+///////////////////////////////////////////////////////////////////////////////
+
+// C++
+#include <cstdio>
+#include <string>
+#include <vector>
+
+// Kokkos
+#include "Kokkos_View_Fad.hpp"
+
+// Panzer
+#include "PanzerAdaptersSTK_config.hpp"
+#include "Panzer_BasisIRLayout.hpp"
+#include "Panzer_BlockedTpetraLinearObjFactory.hpp"
+#include "Panzer_BlockedDOFManager.hpp"
+#include "Panzer_DOFManager.hpp"
+#include "Panzer_Evaluator_WithBaseImpl.hpp"
+#include "Panzer_FieldManagerBuilder.hpp"
+#include "Panzer_GatherOrientation.hpp"
+#include "Panzer_PureBasis.hpp"
+#include "Panzer_STKConnManager.hpp"
+#include "Panzer_STK_Interface.hpp"
+#include "Panzer_STK_SetupUtilities.hpp"
+#include "Panzer_STK_SquareQuadMeshFactory.hpp"
+#include "Panzer_STK_Version.hpp"
+#include "Panzer_Workset.hpp"
+#include "Panzer_LOCPair_GlobalEvaluationData.hpp"
+#include "Panzer_GlobalEvaluationDataContainer.hpp"
+
+// Teuchos
+#include "Teuchos_DefaultMpiComm.hpp"
+#include "Teuchos_GlobalMPISession.hpp"
+#include "Teuchos_OpaqueWrapper.hpp"
+#include "Teuchos_RCP.hpp"
+#include "Teuchos_TimeMonitor.hpp"
+#include "Teuchos_UnitTestHarness.hpp"
+
+// Thyra
+#include "Thyra_ProductVectorBase.hpp"
+#include "Thyra_VectorStdOps.hpp"
+
+// Tpetra
+#include "Tpetra_Vector.hpp"
+
+// user_app
+#include "user_app_EquationSetFactory.hpp"
+
+using ScalarT = double;
+using LocalOrdinalT = panzer::LocalOrdinal;
+using GlobalOrdinalT = panzer::GlobalOrdinal;
+
+using VectorType = Tpetra::Vector<ScalarT, LocalOrdinalT, GlobalOrdinalT>;
+using OperatorType = Tpetra::Operator<ScalarT, LocalOrdinalT, GlobalOrdinalT>;
+using CrsMatrixType = Tpetra::CrsMatrix<ScalarT, LocalOrdinalT, GlobalOrdinalT>;
+using CrsGraphType = Tpetra::CrsGraph<LocalOrdinalT, GlobalOrdinalT>;
+using MapType = Tpetra::Map<LocalOrdinalT, GlobalOrdinalT>;
+using ImportType = Tpetra::Import<LocalOrdinalT, GlobalOrdinalT>;
+using ExportType = Tpetra::Export<LocalOrdinalT, GlobalOrdinalT>;
+
+using ThyraLinearOp = Thyra::TpetraLinearOp<ScalarT, LocalOrdinalT, GlobalOrdinalT>;
+
+using BlockedTpetraLinObjFactoryType = panzer::BlockedTpetraLinearObjFactory<panzer::Traits, ScalarT, LocalOrdinalT, GlobalOrdinalT>;
+using TpetraLinObjFactoryType = panzer::TpetraLinearObjFactory<panzer::Traits, ScalarT, LocalOrdinalT, GlobalOrdinalT>;
+using BlockedTpetraLinObjContainerType = panzer::BlockedTpetraLinearObjContainer<ScalarT, LocalOrdinalT, GlobalOrdinalT>;
+using TpetraLinObjContainerType = panzer::TpetraLinearObjContainer<ScalarT, LocalOrdinalT, GlobalOrdinalT>;
+
+namespace panzer
+{
+
+  Teuchos::RCP<panzer::PureBasis> buildBasis(std::size_t worksetSize, const std::string &basisName);
+  void testInitialization(const Teuchos::RCP<Teuchos::ParameterList> &ipb);
+  Teuchos::RCP<panzer_stk::STK_Interface> buildMesh(int elemX, int elemY);
+  void testGatherScatter(const bool enable_tangents, Teuchos::FancyOStream &out, bool &success);
+
+  // Runs testGatherScatter with enable_tangents == false (no tangent fields in the gather evaluator)
+  TEUCHOS_UNIT_TEST(tpetra_assembly, gather_solution_no_tangents)
+  {
+    testGatherScatter(false, out, success);
+  }
+
+  // Runs testGatherScatter with enable_tangents == true (tangent fields added to the gather evaluator)
+  TEUCHOS_UNIT_TEST(tpetra_assembly, gather_solution_tangents)
+  {
+    testGatherScatter(true, out, success);
+  }
+
+  // enable_tangents determines whether tangent fields dx/dp are added to gather evaluator.
+  // These are used when computing df/dx*dx/dp with the tangent evaluation type
+  void testGatherScatter(const bool enable_tangents, Teuchos::FancyOStream &out, bool &success)
+  {
+#ifdef HAVE_MPI
+    Teuchos::RCP<const Teuchos::MpiComm<int>> tComm = Teuchos::rcp(new Teuchos::MpiComm<int>(MPI_COMM_WORLD));
+#else
+    Teuchos::RCP<const Teuchos::SerialComm<int>> tComm = Teuchos::rcp(new Teuchos::SerialComm<int>(MPI_COMM_WORLD));
+#endif
+
+    int myRank = tComm->getRank();
+    int numProcs = tComm->getSize();
+
+    const std::size_t workset_size = 4 / numProcs;
+    const std::string fieldName1_q1 = "U";
+    const std::string fieldName2_q1 = "V";
+    const std::string fieldName_qedge1 = "B";
+    const int num_tangent = enable_tangents ? 5 : 0;
+
+    Teuchos::RCP<panzer_stk::STK_Interface> mesh = buildMesh(2, 2);
+
+    // build input physics block
+    Teuchos::RCP<panzer::PureBasis> basis_q1 = buildBasis(workset_size, "Q1");
+    Teuchos::RCP<panzer::PureBasis> basis_qedge1 = buildBasis(workset_size, "QEdge1");
+
+    Teuchos::RCP<Teuchos::ParameterList> ipb = Teuchos::parameterList();
+    testInitialization(ipb);
+
+    const int default_int_order = 1;
+    std::string eBlockID = "eblock-0_0";
+    Teuchos::RCP<user_app::MyFactory> eqset_factory = Teuchos::rcp(new user_app::MyFactory);
+    panzer::CellData cellData(workset_size, mesh->getCellTopology("eblock-0_0"));
+    Teuchos::RCP<panzer::GlobalData> gd = panzer::createGlobalData();
+    Teuchos::RCP<panzer::PhysicsBlock> physicsBlock =
+        Teuchos::rcp(new PhysicsBlock(ipb, eBlockID, default_int_order, cellData, eqset_factory, gd, false));
+
+    Teuchos::RCP<std::vector<panzer::Workset>> work_sets = panzer_stk::buildWorksets(*mesh, physicsBlock->elementBlockID(),
+                                                                                     physicsBlock->getWorksetNeeds());
+    TEST_EQUALITY(work_sets->size(), 1);
+
+    // build connection manager and field manager
+    const Teuchos::RCP<panzer::ConnManager> conn_manager = Teuchos::rcp(new panzer_stk::STKConnManager(mesh));
+    Teuchos::RCP<panzer::BlockedDOFManager> blocked_dofManager = Teuchos::rcp(new panzer::BlockedDOFManager(conn_manager, MPI_COMM_WORLD));
+
+    blocked_dofManager->addField(fieldName1_q1, Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis())));
+    blocked_dofManager->addField(fieldName2_q1, Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis())));
+    blocked_dofManager->addField(fieldName_qedge1, Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_qedge1->getIntrepid2Basis())));
+
+    std::vector<std::vector<std::string> > fieldOrder(3);
+    fieldOrder[0].push_back(fieldName1_q1);
+    fieldOrder[1].push_back(fieldName_qedge1);
+    fieldOrder[2].push_back(fieldName2_q1);
+    blocked_dofManager->setFieldOrder(fieldOrder);
+
+    blocked_dofManager->buildGlobalUnknowns();
+
+    // setup linear object factory
+    /////////////////////////////////////////////////////////////
+
+    Teuchos::RCP<BlockedTpetraLinObjFactoryType> t_lof = Teuchos::rcp(new BlockedTpetraLinObjFactoryType(tComm.getConst(), blocked_dofManager));
+    Teuchos::RCP<LinearObjFactory<panzer::Traits>> lof = t_lof;
+    Teuchos::RCP<LinearObjContainer> loc = t_lof->buildGhostedLinearObjContainer();
+    t_lof->initializeGhostedContainer(LinearObjContainer::X, *loc);
+    loc->initialize();
+
+    Teuchos::RCP<BlockedTpetraLinObjContainerType> t_loc = Teuchos::rcp_dynamic_cast<BlockedTpetraLinObjContainerType>(loc);
+    Teuchos::RCP<Thyra::VectorBase<double>> x_vec = t_loc->get_x_th();
+    Thyra::assign(x_vec.ptr(), 123.0 + myRank);
+
+    // Need a place to evaluate the tangent fields, so we create an
+    // unblocked DOFManager and LOF and set them up if needed.
+    std::vector<Teuchos::RCP<GlobalEvaluationData>> tangentContainers;
+    Teuchos::RCP<panzer::DOFManager> dofManager = Teuchos::rcp(new panzer::DOFManager(conn_manager, MPI_COMM_WORLD));
+    Teuchos::RCP<TpetraLinObjFactoryType> tangent_lof = Teuchos::rcp(new TpetraLinObjFactoryType(tComm.getConst(), dofManager));
+    Teuchos::RCP<LinearObjFactory<panzer::Traits>> parent_tangent_lof = tangent_lof;
+ 
+    if (enable_tangents)
+    {
+      using Teuchos::RCP;
+      using Teuchos::rcp;
+      using Teuchos::rcp_dynamic_cast;
+      using Thyra::ProductVectorBase;
+      using LOCPair = panzer::LOCPair_GlobalEvaluationData;
+
+      std::vector<std::string> tangent_fieldOrder;
+      for (int i(0); i < num_tangent; ++i) {
+        std::stringstream ssedge;
+        ssedge << fieldName_qedge1 << " Tangent " << i;
+        std::stringstream ss1, ss2;
+        ss1 << fieldName1_q1 << " Tangent " << i;
+        ss2 << fieldName2_q1 << " Tangent " << i;
+ 
+        dofManager->addField(ss1.str(), Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis())));
+        dofManager->addField(ss2.str(), Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_q1->getIntrepid2Basis())));
+        dofManager->addField(ssedge.str(), Teuchos::rcp(new panzer::Intrepid2FieldPattern(basis_qedge1->getIntrepid2Basis())));
+        tangent_fieldOrder.push_back(ss1.str());
+        tangent_fieldOrder.push_back(ss2.str());
+        tangent_fieldOrder.push_back(ssedge.str());
+      }
+      dofManager->setFieldOrder(tangent_fieldOrder);
+      dofManager->buildGlobalUnknowns();
+
+      // generate and evaluate some fields
+      Teuchos::RCP<LinearObjContainer> tangent_loc = tangent_lof->buildGhostedLinearObjContainer();
+      tangent_lof->initializeGhostedContainer(LinearObjContainer::X, *tangent_loc);
+      tangent_loc->initialize();
+
+      for (int i(0); i < num_tangent; ++i)
+      {
+        auto locPair = Teuchos::rcp(new LOCPair(tangent_lof, panzer::LinearObjContainer::X));
+
+        auto global_t_loc = rcp_dynamic_cast<TpetraLinObjContainerType>(locPair->getGlobalLOC());
+        Teuchos::RCP<Thyra::VectorBase<double>> global_x_vec = global_t_loc->get_x_th();
+        Thyra::assign(global_x_vec.ptr(), 0.123 + myRank + i);
+
+        auto ghosted_t_loc = rcp_dynamic_cast<TpetraLinObjContainerType>(locPair->getGhostedLOC());
+        Teuchos::RCP<Thyra::VectorBase<double>> ghosted_x_vec = ghosted_t_loc->get_x_th();
+        Thyra::assign(ghosted_x_vec.ptr(), 0.123 + myRank + i);
+
+        tangentContainers.push_back(locPair);
+      } // end loop over the tangents
+    }   // end if (enable_tangents)
+
+    // setup field manager, add evaluator under test
+    /////////////////////////////////////////////////////////////
+
+    PHX::FieldManager<panzer::Traits> fm;
+
+    std::vector<PHX::index_size_type> derivative_dimensions;
+    derivative_dimensions.push_back(12);
+    fm.setKokkosExtendedDataTypeDimensions<panzer::Traits::Jacobian>(derivative_dimensions);
+
+    std::vector<PHX::index_size_type> tan_derivative_dimensions;
+    if (enable_tangents)
+      tan_derivative_dimensions.push_back(num_tangent);
+    else
+      tan_derivative_dimensions.push_back(0);
+    fm.setKokkosExtendedDataTypeDimensions<panzer::Traits::Tangent>(tan_derivative_dimensions);
+
+    Teuchos::RCP<PHX::FieldTag> evalField_q1, evalField_qedge1;
+    {
+      using Teuchos::RCP;
+      using Teuchos::rcp;
+      RCP<std::vector<std::string>> names = rcp(new std::vector<std::string>);
+      names->push_back(fieldName1_q1);
+      names->push_back(fieldName2_q1);
+
+      Teuchos::ParameterList pl;
+      pl.set("Basis", basis_q1);
+      pl.set("DOF Names", names);
+      pl.set("Indexer Names", names);
+
+      Teuchos::RCP<PHX::Evaluator<panzer::Traits>> evaluator = lof->buildGather<panzer::Traits::Residual>(pl);
+
+      TEST_EQUALITY(evaluator->evaluatedFields().size(), 2);
+
+      fm.registerEvaluator<panzer::Traits::Residual>(evaluator);
+      fm.requireField<panzer::Traits::Residual>(*evaluator->evaluatedFields()[0]);
+    }
+    {
+      using Teuchos::RCP;
+      using Teuchos::rcp;
+      RCP<std::vector<std::string>> names = rcp(new std::vector<std::string>);
+      names->push_back(fieldName_qedge1);
+
+      Teuchos::ParameterList pl;
+      pl.set("Basis", basis_qedge1);
+      pl.set("DOF Names", names);
+      pl.set("Indexer Names", names);
+
+      Teuchos::RCP<PHX::Evaluator<panzer::Traits>> evaluator = lof->buildGather<panzer::Traits::Residual>(pl);
+
+      TEST_EQUALITY(evaluator->evaluatedFields().size(), 1);
+
+      fm.registerEvaluator<panzer::Traits::Residual>(evaluator);
+      fm.requireField<panzer::Traits::Residual>(*evaluator->evaluatedFields()[0]);
+    }
+
+    {
+      using Teuchos::RCP;
+      using Teuchos::rcp;
+      RCP<std::vector<std::string>> names = rcp(new std::vector<std::string>);
+      names->push_back(fieldName1_q1);
+      names->push_back(fieldName2_q1);
+
+      Teuchos::ParameterList pl;
+      pl.set("Basis", basis_q1);
+      pl.set("DOF Names", names);
+      pl.set("Indexer Names", names);
+
+      Teuchos::RCP<PHX::Evaluator<panzer::Traits>> evaluator = lof->buildGather<panzer::Traits::Jacobian>(pl);
+
+      TEST_EQUALITY(evaluator->evaluatedFields().size(), 2);
+
+      fm.registerEvaluator<panzer::Traits::Jacobian>(evaluator);
+      fm.requireField<panzer::Traits::Jacobian>(*evaluator->evaluatedFields()[0]);
+    }
+    {
+      using Teuchos::RCP;
+      using Teuchos::rcp;
+      RCP<std::vector<std::string>> names = rcp(new std::vector<std::string>);
+      names->push_back(fieldName_qedge1);
+
+      Teuchos::ParameterList pl;
+      pl.set("Basis", basis_qedge1);
+      pl.set("DOF Names", names);
+      pl.set("Indexer Names", names);
+
+      Teuchos::RCP<PHX::Evaluator<panzer::Traits>> evaluator = lof->buildGather<panzer::Traits::Jacobian>(pl);
+
+      TEST_EQUALITY(evaluator->evaluatedFields().size(), 1);
+
+      fm.registerEvaluator<panzer::Traits::Jacobian>(evaluator);
+      fm.requireField<panzer::Traits::Jacobian>(*evaluator->evaluatedFields()[0]);
+    }
+
+    {
+      using Teuchos::RCP;
+      using Teuchos::rcp;
+      RCP<std::vector<std::string>> names = rcp(new std::vector<std::string>);
+      names->push_back(fieldName1_q1);
+      names->push_back(fieldName2_q1);
+
+      Teuchos::ParameterList pl;
+      pl.set("Basis", basis_q1);
+      pl.set("DOF Names", names);
+      pl.set("Indexer Names", names);
+
+      if (enable_tangents)
+      {
+        RCP<std::vector<std::vector<std::string>>> tangent_names =
+            rcp(new std::vector<std::vector<std::string>>(2));
+        for (int i = 0; i < num_tangent; ++i)
+        {
+          std::stringstream ss1, ss2;
+          ss1 << fieldName1_q1 << " Tangent " << i;
+          ss2 << fieldName2_q1 << " Tangent " << i;
+          (*tangent_names)[0].push_back(ss1.str());
+          (*tangent_names)[1].push_back(ss2.str());
+        }
+        pl.set("Tangent Names", tangent_names);
+      }
+
+      Teuchos::RCP<PHX::Evaluator<panzer::Traits>> evaluator = lof->buildGather<panzer::Traits::Tangent>(pl);
+
+      TEST_EQUALITY(evaluator->evaluatedFields().size(), 2);
+
+      fm.registerEvaluator<panzer::Traits::Tangent>(evaluator);
+      fm.requireField<panzer::Traits::Tangent>(*evaluator->evaluatedFields()[0]);
+    }
+    {
+      using Teuchos::RCP;
+      using Teuchos::rcp;
+      RCP<std::vector<std::string>> names = rcp(new std::vector<std::string>);
+      names->push_back(fieldName_qedge1);
+
+      Teuchos::ParameterList pl;
+      pl.set("Basis", basis_qedge1);
+      pl.set("DOF Names", names);
+      pl.set("Indexer Names", names);
+
+      if (enable_tangents)
+      {
+        RCP<std::vector<std::vector<std::string>>> tangent_names =
+            rcp(new std::vector<std::vector<std::string>>(1));
+        for (int i = 0; i < num_tangent; ++i)
+        {
+          std::stringstream ss;
+          ss << fieldName_qedge1 << " Tangent " << i;
+          (*tangent_names)[0].push_back(ss.str());
+        }
+        pl.set("Tangent Names", tangent_names);
+      }
+
+      Teuchos::RCP<PHX::Evaluator<panzer::Traits>> evaluator = lof->buildGather<panzer::Traits::Tangent>(pl);
+
+      TEST_EQUALITY(evaluator->evaluatedFields().size(), 1);
+
+      fm.registerEvaluator<panzer::Traits::Tangent>(evaluator);
+      fm.requireField<panzer::Traits::Tangent>(*evaluator->evaluatedFields()[0]);
+    }
+
+    if (enable_tangents)
+    {
+      for (int i = 0; i < num_tangent; ++i)
+      {
+        using Teuchos::RCP;
+        using Teuchos::rcp;
+        RCP<std::vector<std::string>> names = rcp(new std::vector<std::string>);
+        RCP<std::vector<std::string>> tangent_names = rcp(new std::vector<std::string>);
+        names->push_back(fieldName1_q1);
+        names->push_back(fieldName2_q1);
+        {
+          std::stringstream ss1, ss2;
+          ss1 << fieldName1_q1 << " Tangent " << i;
+          ss2 << fieldName2_q1 << " Tangent " << i;
+          tangent_names->push_back(ss1.str());
+          tangent_names->push_back(ss2.str());
+        }
+
+        Teuchos::ParameterList pl;
+        pl.set("Basis", basis_q1);
+        pl.set("DOF Names", tangent_names);
+        pl.set("Indexer Names", tangent_names);
+
+        {
+          std::stringstream ss;
+          ss << "Tangent Container " << i;
+          pl.set("Global Data Key", ss.str());
+        }
+
+        Teuchos::RCP<PHX::Evaluator<panzer::Traits>> evaluator =
+            parent_tangent_lof->buildGatherTangent<panzer::Traits::Tangent>(pl);
+
+        TEST_EQUALITY(evaluator->evaluatedFields().size(), 2);
+
+        fm.registerEvaluator<panzer::Traits::Tangent>(evaluator);
+      }
+      for (int i = 0; i < num_tangent; ++i)
+      {
+        using Teuchos::RCP;
+        using Teuchos::rcp;
+        RCP<std::vector<std::string>> names = rcp(new std::vector<std::string>);
+        RCP<std::vector<std::string>> tangent_names = rcp(new std::vector<std::string>);
+        names->push_back(fieldName_qedge1);
+        {
+          std::stringstream ss;
+          ss << fieldName_qedge1 << " Tangent " << i;
+          tangent_names->push_back(ss.str());
+        }
+
+        Teuchos::ParameterList pl;
+        pl.set("Basis", basis_qedge1);
+        pl.set("DOF Names", tangent_names);
+        pl.set("Indexer Names", tangent_names);
+
+        {
+          std::stringstream ss;
+          ss << "Tangent Container " << i;
+          pl.set("Global Data Key", ss.str());
+        }
+
+        Teuchos::RCP<PHX::Evaluator<panzer::Traits>> evaluator =
+            parent_tangent_lof->buildGatherTangent<panzer::Traits::Tangent>(pl);
+
+        TEST_EQUALITY(evaluator->evaluatedFields().size(), 1);
+
+        fm.registerEvaluator<panzer::Traits::Tangent>(evaluator);
+      }
+    }
+
+    panzer::Traits::SD sd;
+
+    panzer::Workset &workset = (*work_sets)[0];
+    workset.alpha = 0.0;
+    workset.beta = 2.0; // derivatives multiplied by 2
+    workset.time = 0.0;
+    workset.evaluate_transient_terms = false;
+
+    sd.worksets_ = work_sets;
+
+    fm.postRegistrationSetup(sd);
+
+    panzer::Traits::PED ped;
+    ped.gedc->addDataObject("Solution Gather Container", loc);
+    if (enable_tangents)
+    {
+      for (int i(0); i < num_tangent; ++i)
+      {
+        std::stringstream ss;
+        ss << "Tangent Container " << i;
+        ped.gedc->addDataObject(ss.str(), tangentContainers[i]);
+      }
+    }
+
+    fm.preEvaluate<panzer::Traits::Residual>(ped);
+    fm.evaluateFields<panzer::Traits::Residual>(workset);
+    fm.postEvaluate<panzer::Traits::Residual>(0);
+
+    fm.preEvaluate<panzer::Traits::Jacobian>(ped);
+    fm.evaluateFields<panzer::Traits::Jacobian>(workset);
+    fm.postEvaluate<panzer::Traits::Jacobian>(0);
+
+    fm.preEvaluate<panzer::Traits::Tangent>(ped);
+    fm.evaluateFields<panzer::Traits::Tangent>(workset);
+    fm.postEvaluate<panzer::Traits::Tangent>(0);
+
+    // test Residual fields
+    {
+      PHX::MDField<panzer::Traits::Residual::ScalarT, panzer::Cell, panzer::BASIS>
+          fieldData1_q1(fieldName1_q1, basis_q1->functional);
+      PHX::MDField<panzer::Traits::Residual::ScalarT, panzer::Cell, panzer::BASIS>
+          fieldData2_q1(fieldName2_q1, basis_qedge1->functional);
+
+      fm.getFieldData<panzer::Traits::Residual>(fieldData1_q1);
+      fm.getFieldData<panzer::Traits::Residual>(fieldData2_q1);
+
+      TEST_EQUALITY(fieldData1_q1.extent(0), Teuchos::as<unsigned int>(4 / numProcs));
+      TEST_EQUALITY(fieldData1_q1.extent(1), 4);
+      TEST_EQUALITY(fieldData2_q1.extent(0), Teuchos::as<unsigned int>(4 / numProcs));
+      TEST_EQUALITY(fieldData2_q1.extent(1), 4);
+      TEST_EQUALITY(fieldData1_q1.size(), Teuchos::as<unsigned int>(4 * 4 / numProcs));
+      TEST_EQUALITY(fieldData2_q1.size(), Teuchos::as<unsigned int>(4 * 4 / numProcs));
+
+      auto fieldData1_q1_h = Kokkos::create_mirror_view(fieldData1_q1.get_static_view());
+      auto fieldData2_q1_h = Kokkos::create_mirror_view(fieldData2_q1.get_static_view());
+      Kokkos::deep_copy(fieldData1_q1_h, fieldData1_q1.get_static_view());
+      Kokkos::deep_copy(fieldData2_q1_h, fieldData2_q1.get_static_view());
+
+      for (unsigned int i = 0; i < fieldData1_q1.extent(0); i++)
+        for (unsigned int j = 0; j < fieldData1_q1.extent(1); j++)
+          TEST_EQUALITY(fieldData1_q1_h(i, j), 123.0 + myRank);
+
+      for (unsigned int i = 0; i < fieldData2_q1.extent(0); i++)
+        for (unsigned int j = 0; j < fieldData2_q1.extent(1); j++)
+          TEST_EQUALITY(fieldData2_q1_h(i, j), 123.0 + myRank);
+    }
+    {
+      PHX::MDField<panzer::Traits::Residual::ScalarT, panzer::Cell, panzer::BASIS>
+          fieldData_qedge1(fieldName_qedge1, basis_qedge1->functional);
+
+      fm.getFieldData<panzer::Traits::Residual>(fieldData_qedge1);
+
+      auto fieldData_qedge1_h = Kokkos::create_mirror_view(fieldData_qedge1.get_static_view());
+      Kokkos::deep_copy(fieldData_qedge1_h, fieldData_qedge1.get_static_view());
+
+      TEST_EQUALITY(fieldData_qedge1.extent(0), Teuchos::as<unsigned int>(4 / numProcs));
+      TEST_EQUALITY(fieldData_qedge1.extent(1), 4);
+      TEST_EQUALITY(fieldData_qedge1.size(), Teuchos::as<unsigned int>(4 * 4 / numProcs));
+
+      for (unsigned int cell = 0; cell < fieldData_qedge1.extent(0); ++cell)
+        for (unsigned int pt = 0; pt < fieldData_qedge1.extent(1); pt++)
+          TEST_EQUALITY(fieldData_qedge1_h(cell, pt), 123.0 + myRank);
+    }
+
+    // test Jacobian fields
+    {
+      PHX::MDField<panzer::Traits::Jacobian::ScalarT, panzer::Cell, panzer::BASIS>
+          fieldData1_q1(fieldName1_q1, basis_q1->functional);
+      PHX::MDField<panzer::Traits::Jacobian::ScalarT, panzer::Cell, panzer::BASIS>
+          fieldData2_q1(fieldName2_q1, basis_qedge1->functional);
+
+      fm.getFieldData<panzer::Traits::Jacobian>(fieldData1_q1);
+      fm.getFieldData<panzer::Traits::Jacobian>(fieldData2_q1);
+
+      auto fieldData1_q1_h = Kokkos::create_mirror_view(fieldData1_q1.get_static_view());
+      auto fieldData2_q1_h = Kokkos::create_mirror_view(fieldData2_q1.get_static_view());
+      Kokkos::deep_copy(fieldData1_q1_h, fieldData1_q1.get_static_view());
+      Kokkos::deep_copy(fieldData2_q1_h, fieldData2_q1.get_static_view());
+
+      for (unsigned int cell = 0; cell < fieldData1_q1.extent(0); ++cell)
+      {
+        for (unsigned int pt = 0; pt < fieldData1_q1.extent(1); pt++)
+        {
+          TEST_EQUALITY(fieldData1_q1_h(cell, pt), 123.0 + myRank);
+          TEST_EQUALITY(fieldData1_q1_h(cell, pt).availableSize(), 12);
+        }
+      }
+      for (unsigned int cell = 0; cell < fieldData2_q1.extent(0); ++cell)
+      {
+        for (unsigned int pt = 0; pt < fieldData2_q1.extent(1); pt++)
+        {
+          TEST_EQUALITY(fieldData2_q1_h(cell, pt), 123.0 + myRank);
+          TEST_EQUALITY(fieldData2_q1_h(cell, pt).availableSize(), 12);
+        }
+      }
+    }
+    {
+      PHX::MDField<panzer::Traits::Jacobian::ScalarT, panzer::Cell, panzer::BASIS>
+          fieldData_qedge1(fieldName_qedge1, basis_qedge1->functional);
+
+      fm.getFieldData<panzer::Traits::Jacobian>(fieldData_qedge1);
+
+      auto fieldData_qedge1_h = Kokkos::create_mirror_view(fieldData_qedge1.get_static_view());
+      Kokkos::deep_copy(fieldData_qedge1_h, fieldData_qedge1.get_static_view());
+
+      for (unsigned int cell = 0; cell < fieldData_qedge1.extent(0); ++cell)
+      {
+        for (unsigned int pt = 0; pt < fieldData_qedge1.extent(1); ++pt)
+        {
+          TEST_EQUALITY(fieldData_qedge1_h(cell, pt), 123.0 + myRank);
+          TEST_EQUALITY(fieldData_qedge1_h(cell, pt).availableSize(), 12);
+        }
+      }
+    }
+
+    // test Tangent fields
+    {
+      PHX::MDField<panzer::Traits::Tangent::ScalarT, panzer::Cell, panzer::BASIS>
+          fieldData1_q1(fieldName1_q1, basis_q1->functional);
+      PHX::MDField<panzer::Traits::Tangent::ScalarT, panzer::Cell, panzer::BASIS>
+          fieldData2_q1(fieldName2_q1, basis_qedge1->functional);
+
+      fm.getFieldData<panzer::Traits::Tangent>(fieldData1_q1);
+      fm.getFieldData<panzer::Traits::Tangent>(fieldData2_q1);
+
+      auto fieldData1_q1_h = Kokkos::create_mirror_view(fieldData1_q1.get_static_view());
+      auto fieldData2_q1_h = Kokkos::create_mirror_view(fieldData2_q1.get_static_view());
+      Kokkos::deep_copy(fieldData1_q1_h, fieldData1_q1.get_static_view());
+      Kokkos::deep_copy(fieldData2_q1_h, fieldData2_q1.get_static_view());
+
+      for (unsigned int cell = 0; cell < fieldData1_q1.extent(0); ++cell)
+      {
+        for (unsigned int pt = 0; pt < fieldData1_q1.extent(1); pt++)
+        {
+          if (enable_tangents)
+          {
+            TEST_EQUALITY(fieldData1_q1_h(cell, pt).val(), 123.0 + myRank);
+            TEST_EQUALITY(fieldData1_q1_h(cell, pt).availableSize(), num_tangent);
+            for (int i = 0; i < num_tangent; ++i)
+              TEST_EQUALITY(fieldData1_q1_h(cell, pt).dx(i), 0.123 + myRank + i);
+          }
+          else
+          {
+            TEST_EQUALITY(fieldData1_q1_h(cell, pt), 123.0 + myRank);
+            TEST_EQUALITY(fieldData1_q1_h(cell, pt).availableSize(), 0);
+          }
+        }
+      }
+      for (unsigned int cell = 0; cell < fieldData2_q1.extent(0); ++cell)
+      {
+        for (unsigned int pt = 0; pt < fieldData2_q1.extent(1); pt++)
+        {
+          if (enable_tangents)
+          {
+            TEST_EQUALITY(fieldData2_q1_h(cell, pt).val(), 123.0 + myRank);
+            TEST_EQUALITY(fieldData2_q1_h(cell, pt).availableSize(), num_tangent);
+            for (int i = 0; i < num_tangent; ++i)
+            {
+              TEST_EQUALITY(fieldData2_q1_h(cell, pt).dx(i), 0.123 + myRank + i);
+              TEST_EQUALITY(fieldData2_q1_h(cell, pt).dx(i), 0.123 + myRank + i);
+            }
+          }
+          else
+          {
+            TEST_EQUALITY(fieldData2_q1_h(cell, pt), 123.0 + myRank);
+            TEST_EQUALITY(fieldData2_q1_h(cell, pt).availableSize(), 0);
+          }
+        }
+      }
+    }
+    {
+      PHX::MDField<panzer::Traits::Tangent::ScalarT, panzer::Cell, panzer::BASIS>
+          fieldData_qedge1(fieldName_qedge1, basis_qedge1->functional);
+
+      fm.getFieldData<panzer::Traits::Tangent>(fieldData_qedge1);
+
+      auto fieldData_qedge1_h = Kokkos::create_mirror_view(fieldData_qedge1.get_static_view());
+      Kokkos::deep_copy(fieldData_qedge1_h, fieldData_qedge1.get_static_view());
+
+      for (unsigned int cell = 0; cell < fieldData_qedge1.extent(0); ++cell)
+      {
+        for (unsigned int pt = 0; pt < fieldData_qedge1.extent(1); ++pt)
+        {
+          if (enable_tangents)
+          {
+            TEST_EQUALITY(fieldData_qedge1_h(cell, pt).val(), 123.0 + myRank);
+            TEST_EQUALITY(fieldData_qedge1_h(cell, pt).availableSize(), num_tangent);
+            for (int i = 0; i < num_tangent; ++i)
+              TEST_EQUALITY(fieldData_qedge1_h(cell, pt).dx(i), 0.123 + myRank + i);
+          }
+          else
+          {
+            TEST_EQUALITY(fieldData_qedge1_h(cell, pt), 123.0 + myRank);
+            TEST_EQUALITY(fieldData_qedge1_h(cell, pt).availableSize(), 0);
+          }
+        }
+      }
+    }
+  }
+
+  // Returns a PureBasis named basisName over a CellData built from worksetSize and a Quadrilateral<4> topology.
+  Teuchos::RCP<panzer::PureBasis> buildBasis(std::size_t worksetSize, const std::string &basisName)
+  {
+    const auto *quadData = shards::getCellTopologyData<shards::Quadrilateral<4>>();
+    Teuchos::RCP<shards::CellTopology> quadTopology = Teuchos::rcp(new shards::CellTopology(quadData));
+    panzer::CellData basisCellData(worksetSize, quadTopology);
+    return Teuchos::rcp(new panzer::PureBasis(basisName, 1, basisCellData));
+  }
+
+  // Builds a SquareQuadMeshFactory mesh on MPI_COMM_WORLD: 1 x 1 blocks, elemX x elemY elements.
+  Teuchos::RCP<panzer_stk::STK_Interface> buildMesh(int elemX, int elemY)
+  {
+    Teuchos::RCP<Teuchos::ParameterList> meshParams = Teuchos::rcp(new Teuchos::ParameterList);
+    meshParams->set("X Blocks", 1);
+    meshParams->set("Y Blocks", 1);
+    meshParams->set("X Elements", elemX);
+    meshParams->set("Y Elements", elemY);
+    panzer_stk::SquareQuadMeshFactory meshFactory;
+    meshFactory.setParameterList(meshParams);
+    // two-step construction: create the uncommitted mesh, then let the factory complete it
+    auto quadMesh = meshFactory.buildUncommitedMesh(MPI_COMM_WORLD);
+    meshFactory.completeMeshConstruction(*quadMesh, MPI_COMM_WORLD);
+    return quadMesh;
+  }
+
+  // Names ipb "test physics" and fills sublists "a" (HGrad, empty prefix) and "b" (HCurl, prefix "ION_").
+  void testInitialization(const Teuchos::RCP<Teuchos::ParameterList> &ipb)
+  {
+    // Writes one sublist; both share type, model ID and orders, and differ in prefix and basis type.
+    const auto fillEquationSet = [&ipb](const std::string &sublistName,
+                                        const std::string &prefix,
+                                        const std::string &basisType)
+    {
+      Teuchos::ParameterList &eqSet = ipb->sublist(sublistName);
+      eqSet.set("Type", "Energy");
+      eqSet.set("Prefix", prefix);
+      eqSet.set("Model ID", "solid");
+      eqSet.set("Basis Type", basisType);
+      eqSet.set("Basis Order", 1);
+      eqSet.set("Integration Order", 1);
+    };
+
+    // Physics block
+    ipb->setName("test physics");
+
+    fillEquationSet("a", "", "HGrad");
+    fillEquationSet("b", "ION_", "HCurl");
+  }
+
+}
diff --git a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp
index aec43b41dfbc..6d9bde9d1a3b 100644
--- a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp
+++ b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra.hpp
@@ -163,7 +163,7 @@ class GatherSolution_BlockedTpetra<panzer::Traits::Tangent,TRAITS,S,LO,GO,NodeT>
 public:
 
    GatherSolution_BlockedTpetra(const Teuchos::RCP<const BlockedDOFManager> & indexer)
-     : gidIndexer_(indexer) {}
+     : globalIndexer_(indexer) {}
 
    GatherSolution_BlockedTpetra(const Teuchos::RCP<const BlockedDOFManager> & indexer,
                                 const Teuchos::ParameterList& p);
@@ -176,13 +176,13 @@ class GatherSolution_BlockedTpetra<panzer::Traits::Tangent,TRAITS,S,LO,GO,NodeT>
   void evaluateFields(typename TRAITS::EvalData d);
 
   virtual Teuchos::RCP<CloneableEvaluator> clone(const Teuchos::ParameterList & pl) const
-  { return Teuchos::rcp(new GatherSolution_BlockedTpetra<panzer::Traits::Tangent,TRAITS,S,LO,GO>(gidIndexer_,pl)); }
+  { return Teuchos::rcp(new GatherSolution_BlockedTpetra<panzer::Traits::Tangent,TRAITS,S,LO,GO>(globalIndexer_,pl)); }
 
 
 private:
   typedef typename panzer::Traits::Tangent EvalT;
   typedef typename panzer::Traits::Tangent::ScalarT ScalarT;
-  //typedef typename panzer::Traits::RealType RealT;
+  typedef typename panzer::Traits::RealType RealT;
 
   typedef BlockedTpetraLinearObjContainer<S,LO,GO,NodeT> ContainerType;
   typedef Tpetra::Vector<S,LO,GO,NodeT> VectorType;
@@ -194,10 +194,14 @@ class GatherSolution_BlockedTpetra<panzer::Traits::Tangent,TRAITS,S,LO,GO,NodeT>
 
   // maps the local (field,element,basis) triplet to a global ID
   // for scattering
-  Teuchos::RCP<const BlockedDOFManager> gidIndexer_;
+  Teuchos::RCP<const BlockedDOFManager> globalIndexer_;
 
   std::vector<int> fieldIds_; // field IDs needing mapping
 
+  //! Index into the Thyra ProductVector sub-block for each field. Size
+  //! of number of fields to gather
+  std::vector<int> productVectorBlockIndex_;
+
   std::vector< PHX::MDField<ScalarT,Cell,NODE> > gatherFields_;
 
   std::vector<std::string> indexerNames_;
@@ -206,9 +210,16 @@ class GatherSolution_BlockedTpetra<panzer::Traits::Tangent,TRAITS,S,LO,GO,NodeT>
 
   Teuchos::RCP<const BlockedTpetraLinearObjContainer<S,LO,GO,NodeT> > blockedContainer_;
 
+  //! Local indices for unknowns
+  PHX::View<LO**> worksetLIDs_;
+
+  //! Offset into the cell lids for each field. Size of number of fields to gather.
+  std::vector<PHX::View<int*>> fieldOffsets_;
+
   // Fields for storing tangent components dx/dp of solution vector x
   bool has_tangent_fields_;
-  std::vector< std::vector< PHX::MDField<const ScalarT,Cell,NODE> > > tangentFields_;
+  std::vector< std::vector< PHX::MDField<const RealT,Cell,NODE> > > tangentFields_;
+  PHX::ViewOfViews<2,PHX::View<const RealT**>> tangentFieldsVoV_;
 
   GatherSolution_BlockedTpetra();
 };
diff --git a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp
index b0ef54fdd70b..52488585d37e 100644
--- a/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp
+++ b/packages/panzer/disc-fe/src/evaluators/Panzer_GatherSolution_BlockedTpetra_impl.hpp
@@ -8,8 +8,8 @@
 // *****************************************************************************
 // @HEADER
 
-#ifndef PANZER_GATHER_SOLUTION_BLOCKED_EPETRA_IMPL_HPP
-#define PANZER_GATHER_SOLUTION_BLOCKED_EPETRA_IMPL_HPP
+#ifndef PANZER_GATHER_SOLUTION_BLOCKED_TPETRA_IMPL_HPP
+#define PANZER_GATHER_SOLUTION_BLOCKED_TPETRA_IMPL_HPP
 
 #include "Teuchos_Assert.hpp"
 #include "Phalanx_DataLayout.hpp"
@@ -216,7 +216,7 @@ panzer::GatherSolution_BlockedTpetra<panzer::Traits::Tangent, TRAITS,S,LO,GO,Nod
 GatherSolution_BlockedTpetra(
   const Teuchos::RCP<const BlockedDOFManager> & indexer,
   const Teuchos::ParameterList& p)
-  : gidIndexer_(indexer)
+  : globalIndexer_(indexer)
   , has_tangent_fields_(false)
 {
   typedef std::vector< std::vector<std::string> > vvstring;
@@ -250,7 +250,7 @@ GatherSolution_BlockedTpetra(
       tangentFields_[fd].resize(tangent_field_names[fd].size());
       for (std::size_t i=0; i<tangent_field_names[fd].size(); ++i) {
         tangentFields_[fd][i] =
-          PHX::MDField<const ScalarT,Cell,NODE>(tangent_field_names[fd][i],basis->functional);
+          PHX::MDField<const RealT,Cell,NODE>(tangent_field_names[fd][i],basis->functional);
         this->addDependentField(tangentFields_[fd][i]);
       }
     }
@@ -268,17 +268,60 @@ GatherSolution_BlockedTpetra(
 // **********************************************************************
 template <typename TRAITS,typename S,typename LO,typename GO,typename NodeT>
 void panzer::GatherSolution_BlockedTpetra<panzer::Traits::Tangent, TRAITS,S,LO,GO,NodeT>::
-postRegistrationSetup(typename TRAITS::SetupData /* d */,
+postRegistrationSetup(typename TRAITS::SetupData d,
                       PHX::FieldManager<TRAITS>& /* fm */)
 {
   TEUCHOS_ASSERT(gatherFields_.size() == indexerNames_.size());
 
-  fieldIds_.resize(gatherFields_.size());
+  const Workset & workset_0 = (*d.worksets_)[0];
+  const std::string blockId = this->wda(workset_0).block_id;
 
+  fieldIds_.resize(gatherFields_.size());
+  fieldOffsets_.resize(gatherFields_.size());
+  productVectorBlockIndex_.resize(gatherFields_.size());
+  int maxElementBlockGIDCount = -1;
   for (std::size_t fd = 0; fd < gatherFields_.size(); ++fd) {
-    // get field ID from DOF manager
-    const std::string& fieldName = indexerNames_[fd];
-    fieldIds_[fd] = gidIndexer_->getFieldNum(fieldName);
+
+    const std::string fieldName = indexerNames_[fd];
+    const int globalFieldNum = globalIndexer_->getFieldNum(fieldName); // Field number in the aggregate BlockDOFManager
+    productVectorBlockIndex_[fd] = globalIndexer_->getFieldBlock(globalFieldNum);
+    const auto& subGlobalIndexer = globalIndexer_->getFieldDOFManagers()[productVectorBlockIndex_[fd]];
+    fieldIds_[fd] = subGlobalIndexer->getFieldNum(fieldName); // Field number in the sub-global-indexer
+
+    const std::vector<int>& offsets = subGlobalIndexer->getGIDFieldOffsets(blockId,fieldIds_[fd]);
+    fieldOffsets_[fd] = PHX::View<int*>("GatherSolution_BlockedTpetra(Tangent):fieldOffsets",offsets.size());
+    auto hostOffsets = Kokkos::create_mirror_view(fieldOffsets_[fd]);
+    for (std::size_t i=0; i < offsets.size(); ++i)
+      hostOffsets(i) = offsets[i];
+    Kokkos::deep_copy(fieldOffsets_[fd], hostOffsets);
+    maxElementBlockGIDCount = std::max(subGlobalIndexer->getElementBlockGIDCount(blockId),maxElementBlockGIDCount);
+  }
+
+  // We will use one workset lid view for all fields, so it has to be
+  // sized big enough to hold the largest elementBlockGIDCount in the
+  // ProductVector.
+  worksetLIDs_ = PHX::View<LO**>("ScatterResidual_BlockedTpetra(Tangent):worksetLIDs",
+                                                gatherFields_[0].extent(0),
+                                                maxElementBlockGIDCount);
+
+  // Set up storage for tangentFields using view of views
+  // We also need storage for the number of tangent fields associated with
+  // each gatherField
+
+  if (has_tangent_fields_) {
+
+    size_t inner_vector_max_size = 0;
+    for (std::size_t fd = 0; fd < tangentFields_.size(); ++fd)
+      inner_vector_max_size = std::max(inner_vector_max_size,tangentFields_[fd].size());
+    tangentFieldsVoV_.initialize("GatherSolution_BlockedTpetra<Tangent>::tangentFieldsVoV_",gatherFields_.size(),inner_vector_max_size);
+
+    for (std::size_t fd = 0; fd < gatherFields_.size(); ++fd) {
+      for (std::size_t i=0; i<tangentFields_[fd].size(); ++i) {
+        tangentFieldsVoV_.addView(tangentFields_[fd][i].get_static_view(),fd,i);
+      }
+    }
+
+    tangentFieldsVoV_.syncHostToDevice();
   }
 
   indexerNames_.clear();  // Don't need this anymore
@@ -298,75 +341,79 @@ template <typename TRAITS,typename S,typename LO,typename GO,typename NodeT>
 void panzer::GatherSolution_BlockedTpetra<panzer::Traits::Tangent, TRAITS,S,LO,GO,NodeT>::
 evaluateFields(typename TRAITS::EvalData workset)
 {
-   using Teuchos::RCP;
-   using Teuchos::ArrayRCP;
-   using Teuchos::ptrFromRef;
-   using Teuchos::rcp_dynamic_cast;
-
-   using Thyra::VectorBase;
-   using Thyra::SpmdVectorBase;
-   using Thyra::ProductVectorBase;
+  // Copies the blocked solution vector (or its time derivative) into the
+  // gather fields for every cell of the workset. When tangent fields are
+  // present, derivative component i is filled from tangent field i.
+  using Teuchos::RCP;
+  using Teuchos::ArrayRCP;
+  using Teuchos::ptrFromRef;
+  using Teuchos::rcp_dynamic_cast;
 
-   Teuchos::FancyOStream out(Teuchos::rcpFromRef(std::cout));
-   out.setShowProcRank(true);
-   out.setOutputToRootOnly(-1);
+  using Thyra::VectorBase;
+  using Thyra::SpmdVectorBase;
+  using Thyra::ProductVectorBase;
 
-   std::vector<std::pair<int,GO> > GIDs;
-   std::vector<LO> LIDs;
+  Teuchos::FancyOStream out(Teuchos::rcpFromRef(std::cout));
+  out.setShowProcRank(true);
+  out.setOutputToRootOnly(-1);
 
-   // for convenience pull out some objects from workset
-   std::string blockId = this->wda(workset).block_id;
-   const std::vector<std::size_t> & localCellIds = this->wda(workset).cell_local_ids;
+  const PHX::View<const int*> & localCellIds = this->wda(workset).getLocalCellIDs();
 
-   Teuchos::RCP<ProductVectorBase<double> > x;
-   if (useTimeDerivativeSolutionVector_)
-     x = rcp_dynamic_cast<ProductVectorBase<double> >(blockedContainer_->get_dxdt());
-   else
-     x = rcp_dynamic_cast<ProductVectorBase<double> >(blockedContainer_->get_x());
+  // Select either the time-derivative vector or the solution vector
+  Teuchos::RCP<ProductVectorBase<double> > blockedSolution;
+  if (useTimeDerivativeSolutionVector_)
+    blockedSolution = rcp_dynamic_cast<ProductVectorBase<double> >(blockedContainer_->get_dxdt());
+  else
+    blockedSolution = rcp_dynamic_cast<ProductVectorBase<double> >(blockedContainer_->get_x());
 
-   // gather operation for each cell in workset
-   for(std::size_t worksetCellIndex=0;worksetCellIndex<localCellIds.size();++worksetCellIndex) {
-      LO cellLocalId = localCellIds[worksetCellIndex];
+  // Loop over fields to gather
+  int currentWorksetLIDSubBlock = -1;
+  for (std::size_t fieldIndex = 0; fieldIndex < gatherFields_.size(); fieldIndex++) {
+    // workset LIDs only change if in different sub blocks
+    if (productVectorBlockIndex_[fieldIndex] != currentWorksetLIDSubBlock) {
+      const auto& blockIndexer = globalIndexer_->getFieldDOFManagers()[productVectorBlockIndex_[fieldIndex]];
+      const std::string blockId = this->wda(workset).block_id;
+      const int num_dofs = blockIndexer->getElementBlockGIDCount(blockId);
+      blockIndexer->getElementLIDs(localCellIds,worksetLIDs_,num_dofs);
+      currentWorksetLIDSubBlock = productVectorBlockIndex_[fieldIndex];
+    }
 
-      gidIndexer_->getElementGIDsPair(cellLocalId,GIDs,blockId);
+    // Read-only local device view of the solution sub-block holding this field
+    const int blockRowIndex = productVectorBlockIndex_[fieldIndex];
+    const auto& subblockSolution = *((rcp_dynamic_cast<Thyra::TpetraVector<RealT,LO,GO,NodeT>>(blockedSolution->getNonconstVectorBlock(blockRowIndex),true))->getTpetraVector());
+    const auto kokkosSolution = subblockSolution.getLocalViewDevice(Tpetra::Access::ReadOnly);
 
-      // caculate the local IDs for this element
-      LIDs.resize(GIDs.size());
-      for(std::size_t i=0;i<GIDs.size();i++) {
-         // used for doing local ID lookups
-         RCP<const MapType> x_map = blockedContainer_->getMapForBlock(GIDs[i].first);
+    // Class data fields for lambda capture
+    const PHX::View<const int*> fieldOffsets = fieldOffsets_[fieldIndex];
+    const PHX::View<const LO**> worksetLIDs = worksetLIDs_;
+    const PHX::View<ScalarT**> fieldValues = gatherFields_[fieldIndex].get_static_view();
 
-         LIDs[i] = x_map->getLocalElement(GIDs[i].second);
-      }
+    if (has_tangent_fields_) {
+      // Value comes from the solution; derivative i comes from tangent field i
+      const int numTangents = tangentFields_[fieldIndex].size();
+      const auto tangentFieldsDevice = tangentFieldsVoV_.getViewDevice();
+      const auto kokkosTangents = Kokkos::subview(tangentFieldsDevice,fieldIndex,Kokkos::ALL());
+      Kokkos::parallel_for(Kokkos::RangePolicy<PHX::Device>(0,workset.num_cells), KOKKOS_LAMBDA (const int& cell) {
+        for (int basis=0; basis < static_cast<int>(fieldOffsets.size()); ++basis) {
+          const int rowLID = worksetLIDs(cell,fieldOffsets(basis));
+          fieldValues(cell,basis).zero();
+          fieldValues(cell,basis).val() = kokkosSolution(rowLID,0);
+          for (int i_tangent=0; i_tangent<numTangents; ++i_tangent)
+            fieldValues(cell,basis).fastAccessDx(i_tangent) = kokkosTangents(i_tangent)(cell,basis);
+        }
+      });
+    } else {
+      // No tangent fields: assign the solution value only
+      Kokkos::parallel_for(Kokkos::RangePolicy<PHX::Device>(0,workset.num_cells), KOKKOS_LAMBDA (const int& cell) {
+        for (int basis=0; basis < static_cast<int>(fieldOffsets.size()); ++basis) {
+          const int rowLID = worksetLIDs(cell,fieldOffsets(basis));
+          fieldValues(cell,basis).zero();
+          fieldValues(cell,basis) = kokkosSolution(rowLID,0);
+        }
+      });
+    }
+  }
 
-      // loop over the fields to be gathered
-      Teuchos::ArrayRCP<const double> local_x;
-      for (std::size_t fieldIndex=0; fieldIndex<gatherFields_.size();fieldIndex++) {
-         int fieldNum = fieldIds_[fieldIndex];
-         int indexerId = gidIndexer_->getFieldBlock(fieldNum);
-
-         // grab local data for inputing
-         RCP<SpmdVectorBase<double> > block_x = rcp_dynamic_cast<SpmdVectorBase<double> >(x->getNonconstVectorBlock(indexerId));
-         block_x->getLocalData(ptrFromRef(local_x));
-
-         const std::vector<int> & elmtOffset = gidIndexer_->getGIDFieldOffsets(blockId,fieldNum);
-
-         // loop over basis functions and fill the fields
-         for(std::size_t basis=0;basis<elmtOffset.size();basis++) {
-            int offset = elmtOffset[basis];
-            int lid = LIDs[offset];
-
-            if (!has_tangent_fields_)
-              (gatherFields_[fieldIndex])(worksetCellIndex,basis) = local_x[lid];
-            else {
-              (gatherFields_[fieldIndex])(worksetCellIndex,basis).val() = local_x[lid];
-              for (std::size_t i=0; i<tangentFields_[fieldIndex].size(); ++i)
-                (gatherFields_[fieldIndex])(worksetCellIndex,basis).fastAccessDx(i) =
-                  tangentFields_[fieldIndex][i](worksetCellIndex,basis).val();
-            }
-         }
-      }
-   }
 }
 
 // **********************************************************************

From c54fe80458bc5cec1ff52b912f52952779534812 Mon Sep 17 00:00:00 2001
From: Paul Zehner <paul.zehner@cea.fr>
Date: Thu, 7 Nov 2024 10:48:29 -0500
Subject: [PATCH 25/25] Remove use of Kokkos::Impl::DynRankViewFill

Signed-off-by: Paul Zehner <paul.zehner@cea.fr>
---
 .../sacado/src/Kokkos_DynRankView_Fad.hpp     | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/packages/sacado/src/Kokkos_DynRankView_Fad.hpp b/packages/sacado/src/Kokkos_DynRankView_Fad.hpp
index e2c1d78aa81b..7e413cbc9393 100644
--- a/packages/sacado/src/Kokkos_DynRankView_Fad.hpp
+++ b/packages/sacado/src/Kokkos_DynRankView_Fad.hpp
@@ -988,9 +988,16 @@ void deep_copy(
                   typename ViewTraits<DT,DP...>::non_const_value_type >::value
     , "Can only deep copy into non-const type" );
 
-  Kokkos::fence();
-  Kokkos::Impl::DynRankViewFill< DynRankView<DT,DP...> >( view , value );
-  Kokkos::fence();
+  switch(view.rank()) {
+    case 0: deep_copy(Impl::as_view_of_rank_n<0>(view), value); break;
+    case 1: deep_copy(Impl::as_view_of_rank_n<1>(view), value); break;
+    case 2: deep_copy(Impl::as_view_of_rank_n<2>(view), value); break;
+    case 3: deep_copy(Impl::as_view_of_rank_n<3>(view), value); break;
+    case 4: deep_copy(Impl::as_view_of_rank_n<4>(view), value); break;
+    case 5: deep_copy(Impl::as_view_of_rank_n<5>(view), value); break;
+    case 6: deep_copy(Impl::as_view_of_rank_n<6>(view), value); break;
+    case 7: deep_copy(Impl::as_view_of_rank_n<7>(view), value); break;
+  }
 }
 
 // Overload of deep_copy for Fad views intializing to a constant Fad
@@ -1010,9 +1017,16 @@ void deep_copy(
                   typename ViewTraits<DT,DP...>::non_const_value_type >::value
     , "Can only deep copy into non-const type" );
 
-  Kokkos::fence();
-  Kokkos::Impl::DynRankViewFill< DynRankView<DT,DP...> >( view , value );
-  Kokkos::fence();
+  switch(view.rank()) {
+    case 0: deep_copy(Impl::as_view_of_rank_n<0>(view), value); break;
+    case 1: deep_copy(Impl::as_view_of_rank_n<1>(view), value); break;
+    case 2: deep_copy(Impl::as_view_of_rank_n<2>(view), value); break;
+    case 3: deep_copy(Impl::as_view_of_rank_n<3>(view), value); break;
+    case 4: deep_copy(Impl::as_view_of_rank_n<4>(view), value); break;
+    case 5: deep_copy(Impl::as_view_of_rank_n<5>(view), value); break;
+    case 6: deep_copy(Impl::as_view_of_rank_n<6>(view), value); break;
+    case 7: deep_copy(Impl::as_view_of_rank_n<7>(view), value); break;
+  }
 }
 
 template< class DstType , class SrcType >