MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_CoalesceDropFactory_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef MUELU_COALESCEDROPFACTORY_DEF_HPP
11 #define MUELU_COALESCEDROPFACTORY_DEF_HPP
12 
14 #include <Xpetra_CrsGraph.hpp>
15 #include <Xpetra_ImportFactory.hpp>
16 #include <Xpetra_ExportFactory.hpp>
17 #include <Xpetra_MapFactory.hpp>
18 #include <Xpetra_Map.hpp>
19 #include <Xpetra_Matrix.hpp>
20 #include <Xpetra_MultiVectorFactory.hpp>
21 #include <Xpetra_MultiVector.hpp>
22 #include <Xpetra_StridedMap.hpp>
23 #include <Xpetra_VectorFactory.hpp>
24 #include <Xpetra_Vector.hpp>
25 
26 #include <Xpetra_IO.hpp>
27 
28 #include <Kokkos_NestedSort.hpp>
29 #include <Kokkos_StdAlgorithms.hpp>
31 
32 #include "MueLu_AmalgamationFactory.hpp"
33 #include "MueLu_AmalgamationInfo.hpp"
34 #include "MueLu_Exceptions.hpp"
35 #include "MueLu_LWGraph.hpp"
36 
37 #include "MueLu_Level.hpp"
38 #include "MueLu_LWGraph.hpp"
39 #include "MueLu_MasterList.hpp"
40 #include "MueLu_Monitor.hpp"
41 #include "MueLu_PreDropFunctionConstVal.hpp"
42 #include "MueLu_Utilities.hpp"
43 
44 #ifdef HAVE_XPETRA_TPETRA
45 #include "Tpetra_CrsGraphTransposer.hpp"
46 #endif
47 
48 #include <algorithm>
49 #include <cstdlib>
50 #include <string>
51 
52 // If defined, read environment variables.
53 // Should be removed once we are confident that this works.
54 //#define DJS_READ_ENV_VARIABLES
55 
56 namespace MueLu {
57 
58 namespace Details {
59 template <class real_type, class LO>
60 struct DropTol {
61  DropTol() = default;
62  DropTol(DropTol const&) = default;
63  DropTol(DropTol&&) = default;
64 
65  DropTol& operator=(DropTol const&) = default;
66  DropTol& operator=(DropTol&&) = default;
67 
68  DropTol(real_type val_, real_type diag_, LO col_, bool drop_)
69  : val{val_}
70  , diag{diag_}
71  , col{col_}
72  , drop{drop_} {}
73 
77  bool drop{true};
78 
79  // CMS: Auxillary information for debugging info
80  // real_type aux_val {Teuchos::ScalarTraits<real_type>::nan()};
81 };
82 } // namespace Details
83 
88 
89 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
91  RCP<ParameterList> validParamList = rcp(new ParameterList());
92 
93 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
94  SET_VALID_ENTRY("aggregation: drop tol");
95  SET_VALID_ENTRY("aggregation: use ml scaling of drop tol");
96  SET_VALID_ENTRY("aggregation: Dirichlet threshold");
97  SET_VALID_ENTRY("aggregation: greedy Dirichlet");
98  SET_VALID_ENTRY("aggregation: row sum drop tol");
99  SET_VALID_ENTRY("aggregation: drop scheme");
100  SET_VALID_ENTRY("aggregation: block diagonal: interleaved blocksize");
101  SET_VALID_ENTRY("aggregation: distance laplacian directional weights");
102  SET_VALID_ENTRY("aggregation: dropping may create Dirichlet");
103 
104  {
105  // "signed classical" is the Ruge-Stuben style (relative to max off-diagonal), "sign classical sa" is the signed version of the sa criterion (relative to the diagonal values)
106  validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple<std::string>("signed classical sa", "classical", "distance laplacian", "signed classical", "block diagonal", "block diagonal classical", "block diagonal distance laplacian", "block diagonal signed classical", "block diagonal colored signed classical"))));
107  }
108  SET_VALID_ENTRY("aggregation: distance laplacian algo");
109  SET_VALID_ENTRY("aggregation: classical algo");
110  SET_VALID_ENTRY("aggregation: coloring: localize color graph");
111 #undef SET_VALID_ENTRY
112  validParamList->set<bool>("lightweight wrap", true, "Experimental option for lightweight graph access");
113 
114  validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
115  validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
116  validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory for Coordinates");
117  validParamList->set<RCP<const FactoryBase>>("BlockNumber", Teuchos::null, "Generating factory for BlockNUmber");
118 
119  return validParamList;
120 }
121 
122 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
124  : predrop_(Teuchos::null) {}
125 
126 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
128  Input(currentLevel, "A");
129  Input(currentLevel, "UnAmalgamationInfo");
130 
131  const ParameterList& pL = GetParameterList();
132  if (pL.get<bool>("lightweight wrap") == true) {
133  std::string algo = pL.get<std::string>("aggregation: drop scheme");
134  if (algo == "distance laplacian" || algo == "block diagonal distance laplacian") {
135  Input(currentLevel, "Coordinates");
136  }
137  if (algo == "signed classical sa")
138  ;
139  else if (algo.find("block diagonal") != std::string::npos || algo.find("signed classical") != std::string::npos) {
140  Input(currentLevel, "BlockNumber");
141  }
142  }
143 }
144 
145 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
147  FactoryMonitor m(*this, "Build", currentLevel);
148 
149  typedef Teuchos::ScalarTraits<SC> STS;
150  typedef typename STS::magnitudeType real_type;
151  typedef Xpetra::MultiVector<real_type, LO, GO, NO> RealValuedMultiVector;
152  typedef Xpetra::MultiVectorFactory<real_type, LO, GO, NO> RealValuedMultiVectorFactory;
153 
154  if (predrop_ != Teuchos::null)
155  GetOStream(Parameters0) << predrop_->description();
156 
157  RCP<Matrix> realA = Get<RCP<Matrix>>(currentLevel, "A");
158  RCP<AmalgamationInfo> amalInfo = Get<RCP<AmalgamationInfo>>(currentLevel, "UnAmalgamationInfo");
159  const ParameterList& pL = GetParameterList();
160  bool doExperimentalWrap = pL.get<bool>("lightweight wrap");
161 
162  GetOStream(Parameters0) << "lightweight wrap = " << doExperimentalWrap << std::endl;
163  std::string algo = pL.get<std::string>("aggregation: drop scheme");
164  const bool aggregationMayCreateDirichlet = pL.get<bool>("aggregation: dropping may create Dirichlet");
165 
167  RCP<Matrix> A;
168 
169  bool use_block_algorithm = false;
170  LO interleaved_blocksize = as<LO>(pL.get<int>("aggregation: block diagonal: interleaved blocksize"));
171  bool useSignedClassicalRS = false;
172  bool useSignedClassicalSA = false;
173  bool generateColoringGraph = false;
174 
175  // NOTE: If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it
176  // in the block diagonalization). So we'll clobber the rowSumTol with -1.0 in this case
177  typename STS::magnitudeType rowSumTol = as<typename STS::magnitudeType>(pL.get<double>("aggregation: row sum drop tol"));
178 
179  RCP<LocalOrdinalVector> ghostedBlockNumber;
180 
181  if (algo == "distance laplacian") {
182  // Grab the coordinates for distance laplacian
183  Coords = Get<RCP<RealValuedMultiVector>>(currentLevel, "Coordinates");
184  A = realA;
185  } else if (algo == "signed classical sa") {
186  useSignedClassicalSA = true;
187  algo = "classical";
188  A = realA;
189  } else if (algo == "signed classical" || algo == "block diagonal colored signed classical" || algo == "block diagonal signed classical") {
190  useSignedClassicalRS = true;
191  // if(realA->GetFixedBlockSize() > 1) {
192  RCP<LocalOrdinalVector> BlockNumber = Get<RCP<LocalOrdinalVector>>(currentLevel, "BlockNumber");
193  // Ghost the column block numbers if we need to
194  RCP<const Import> importer = realA->getCrsGraph()->getImporter();
195  if (!importer.is_null()) {
196  SubFactoryMonitor m1(*this, "Block Number import", currentLevel);
197  ghostedBlockNumber = Xpetra::VectorFactory<LO, LO, GO, NO>::Build(importer->getTargetMap());
198  ghostedBlockNumber->doImport(*BlockNumber, *importer, Xpetra::INSERT);
199  } else {
200  ghostedBlockNumber = BlockNumber;
201  }
202  // }
203  if (algo == "block diagonal colored signed classical")
204  generateColoringGraph = true;
205  algo = "classical";
206  A = realA;
207 
208  } else if (algo == "block diagonal") {
209  // Handle the "block diagonal" filtering and then leave
210  BlockDiagonalize(currentLevel, realA, false);
211  return;
212  } else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian") {
213  // Handle the "block diagonal" filtering, and then continue onward
214  use_block_algorithm = true;
215  RCP<Matrix> filteredMatrix = BlockDiagonalize(currentLevel, realA, true);
216  if (algo == "block diagonal distance laplacian") {
217  // We now need to expand the coordinates by the interleaved blocksize
218  RCP<RealValuedMultiVector> OldCoords = Get<RCP<RealValuedMultiVector>>(currentLevel, "Coordinates");
219  if (OldCoords->getLocalLength() != realA->getLocalNumRows()) {
220  LO dim = (LO)OldCoords->getNumVectors();
221  Coords = RealValuedMultiVectorFactory::Build(realA->getRowMap(), dim);
222  for (LO k = 0; k < dim; k++) {
223  ArrayRCP<const real_type> old_vec = OldCoords->getData(k);
224  ArrayRCP<real_type> new_vec = Coords->getDataNonConst(k);
225  for (LO i = 0; i < (LO)OldCoords->getLocalLength(); i++) {
226  LO new_base = i * dim;
227  for (LO j = 0; j < interleaved_blocksize; j++)
228  new_vec[new_base + j] = old_vec[i];
229  }
230  }
231  } else {
232  Coords = OldCoords;
233  }
234  algo = "distance laplacian";
235  } else if (algo == "block diagonal classical") {
236  algo = "classical";
237  }
238  // All cases
239  A = filteredMatrix;
240  rowSumTol = -1.0;
241  } else {
242  A = realA;
243  }
244 
245  // Distance Laplacian weights
246  Array<double> dlap_weights = pL.get<Array<double>>("aggregation: distance laplacian directional weights");
247  enum { NO_WEIGHTS = 0,
248  SINGLE_WEIGHTS,
249  BLOCK_WEIGHTS };
250  int use_dlap_weights = NO_WEIGHTS;
251  if (algo == "distance laplacian") {
252  LO dim = (LO)Coords->getNumVectors();
253  // If anything isn't 1.0 we need to turn on the weighting
254  bool non_unity = false;
255  for (LO i = 0; !non_unity && i < (LO)dlap_weights.size(); i++) {
256  if (dlap_weights[i] != 1.0) {
257  non_unity = true;
258  }
259  }
260  if (non_unity) {
261  LO blocksize = use_block_algorithm ? as<LO>(pL.get<int>("aggregation: block diagonal: interleaved blocksize")) : 1;
262  if ((LO)dlap_weights.size() == dim)
263  use_dlap_weights = SINGLE_WEIGHTS;
264  else if ((LO)dlap_weights.size() == blocksize * dim)
265  use_dlap_weights = BLOCK_WEIGHTS;
266  else {
268  "length of 'aggregation: distance laplacian directional weights' must equal the coordinate dimension OR the coordinate dimension times the blocksize");
269  }
270  if (GetVerbLevel() & Statistics1)
271  GetOStream(Statistics1) << "Using distance laplacian weights: " << dlap_weights << std::endl;
272  }
273  }
274 
275  // decide wether to use the fast-track code path for standard maps or the somewhat slower
276  // code path for non-standard maps
277  /*bool bNonStandardMaps = false;
278  if (A->IsView("stridedMaps") == true) {
279  Teuchos::RCP<const Map> myMap = A->getRowMap("stridedMaps");
280  Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
281  TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
282  if (strMap->getStridedBlockId() != -1 || strMap->getOffset() > 0)
283  bNonStandardMaps = true;
284  }*/
285 
286  if (doExperimentalWrap) {
287  TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm");
288  TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)");
289 
290  SC threshold;
291  // If we're doing the ML-style halving of the drop tol at each level, we do that here.
292  if (pL.get<bool>("aggregation: use ml scaling of drop tol"))
293  threshold = pL.get<double>("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID());
294  else
295  threshold = as<SC>(pL.get<double>("aggregation: drop tol"));
296 
297  std::string distanceLaplacianAlgoStr = pL.get<std::string>("aggregation: distance laplacian algo");
298  std::string classicalAlgoStr = pL.get<std::string>("aggregation: classical algo");
299  real_type realThreshold = STS::magnitude(threshold); // CMS: Rename this to "magnitude threshold" sometime
300 
302  // Remove this bit once we are confident that cut-based dropping works.
303 #ifdef HAVE_MUELU_DEBUG
304  int distanceLaplacianCutVerbose = 0;
305 #endif
306 #ifdef DJS_READ_ENV_VARIABLES
307  if (getenv("MUELU_DROP_TOLERANCE_MODE")) {
308  distanceLaplacianAlgoStr = std::string(getenv("MUELU_DROP_TOLERANCE_MODE"));
309  }
310 
311  if (getenv("MUELU_DROP_TOLERANCE_THRESHOLD")) {
312  auto tmp = atoi(getenv("MUELU_DROP_TOLERANCE_THRESHOLD"));
313  realThreshold = 1e-4 * tmp;
314  }
315 
316 #ifdef HAVE_MUELU_DEBUG
317  if (getenv("MUELU_DROP_TOLERANCE_VERBOSE")) {
318  distanceLaplacianCutVerbose = atoi(getenv("MUELU_DROP_TOLERANCE_VERBOSE"));
319  }
320 #endif
321 #endif
322 
324  decisionAlgoType distanceLaplacianAlgo = defaultAlgo;
325  decisionAlgoType classicalAlgo = defaultAlgo;
326  if (algo == "distance laplacian") {
327  if (distanceLaplacianAlgoStr == "default")
328  distanceLaplacianAlgo = defaultAlgo;
329  else if (distanceLaplacianAlgoStr == "unscaled cut")
330  distanceLaplacianAlgo = unscaled_cut;
331  else if (distanceLaplacianAlgoStr == "scaled cut")
332  distanceLaplacianAlgo = scaled_cut;
333  else if (distanceLaplacianAlgoStr == "scaled cut symmetric")
334  distanceLaplacianAlgo = scaled_cut_symmetric;
335  else
336  TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" << distanceLaplacianAlgoStr << "\"");
337  GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
338  } else if (algo == "classical") {
339  if (classicalAlgoStr == "default")
340  classicalAlgo = defaultAlgo;
341  else if (classicalAlgoStr == "unscaled cut")
342  classicalAlgo = unscaled_cut;
343  else if (classicalAlgoStr == "scaled cut")
344  classicalAlgo = scaled_cut;
345  else
346  TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: classical algo\" must be one of (default|unscaled cut|scaled cut), not \"" << classicalAlgoStr << "\"");
347  GetOStream(Runtime0) << "algorithm = \"" << algo << "\" classical algorithm = \"" << classicalAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
348 
349  } else
350  GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
351 
352  if (((algo == "classical") && (classicalAlgoStr.find("scaled") != std::string::npos)) || ((algo == "distance laplacian") && (distanceLaplacianAlgoStr.find("scaled") != std::string::npos)))
353  TEUCHOS_TEST_FOR_EXCEPTION(realThreshold > 1.0, Exceptions::RuntimeError, "For cut-drop algorithms, \"aggregation: drop tol\" = " << threshold << ", needs to be <= 1.0");
354 
355  Set<bool>(currentLevel, "Filtering", (threshold != STS::zero()));
356 
357  const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
358 
359  // NOTE: We don't support signed classical RS or SA with cut drop at present
360  TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalRS && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical aggregation");
361  TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassicalSA && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical sa aggregation");
362 
363  GO numDropped = 0, numTotal = 0;
364  std::string graphType = "unamalgamated"; // for description purposes only
365 
366  /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
367  BlockSize is the number of storage blocks that must kept together during the amalgamation process.
368 
369  Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
370 
371  numPDEs = BlockSize * storageblocksize.
372 
373  If numPDEs==1
374  Matrix is point storage (classical CRS storage). storageblocksize=1 and BlockSize=1
375  No other values makes sense.
376 
377  If numPDEs>1
378  If matrix uses point storage, then storageblocksize=1 and BlockSize=numPDEs.
379  If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n.
380  Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested.
381  */
382  TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
383  const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
384 
385  /************************** RS or SA-style Classical Dropping (and variants) **************************/
386  if (algo == "classical") {
387  if (predrop_ == null) {
388  // ap: this is a hack: had to declare predrop_ as mutable
389  predrop_ = rcp(new PreDropFunctionConstVal(threshold));
390  }
391 
392  if (predrop_ != null) {
393  RCP<PreDropFunctionConstVal> predropConstVal = rcp_dynamic_cast<PreDropFunctionConstVal>(predrop_);
394  TEUCHOS_TEST_FOR_EXCEPTION(predropConstVal == Teuchos::null, Exceptions::BadCast,
395  "MueLu::CoalesceFactory::Build: cast to PreDropFunctionConstVal failed.");
396  // If a user provided a predrop function, it overwrites the XML threshold parameter
397  SC newt = predropConstVal->GetThreshold();
398  if (newt != threshold) {
399  GetOStream(Warnings0) << "switching threshold parameter from " << threshold << " (list) to " << newt << " (user function" << std::endl;
400  threshold = newt;
401  }
402  }
403  // At this points we either have
404  // (predrop_ != null)
405  // Therefore, it is sufficient to check only threshold
406  if (BlockSize == 1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) {
407  // Case 1: scalar problem, no dropping => just use matrix graph
408  RCP<LWGraph> graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A"));
409  // Detect and record rows that correspond to Dirichlet boundary conditions
410  auto boundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
411  if (rowSumTol > 0.)
412  Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes);
413 
414  graph->SetBoundaryNodeMap(boundaryNodes);
415  numTotal = A->getLocalNumEntries();
416 
417  if (GetVerbLevel() & Statistics1) {
418  GO numLocalBoundaryNodes = 0;
419  GO numGlobalBoundaryNodes = 0;
420  for (size_t i = 0; i < boundaryNodes.size(); ++i)
421  if (boundaryNodes[i])
422  numLocalBoundaryNodes++;
423  RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
424  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
425  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
426  }
427 
428  Set(currentLevel, "DofsPerNode", 1);
429  Set(currentLevel, "Graph", graph);
430 
431  } else if ((BlockSize == 1 && threshold != STS::zero()) ||
432  (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) ||
433  (BlockSize == 1 && useSignedClassicalRS) ||
434  (BlockSize == 1 && useSignedClassicalSA)) {
435  // Case 2: scalar problem with dropping => record the column indices of undropped entries, but still use original
436  // graph's map information, e.g., whether index is local
437  // OR a matrix without a CrsGraph
438 
439  // allocate space for the local graph
440  typename LWGraph::row_type::non_const_type rows("rows", A->getLocalNumRows() + 1);
441  typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
442 
443  using MT = typename STS::magnitudeType;
444  RCP<Vector> ghostedDiag;
445  ArrayRCP<const SC> ghostedDiagVals;
446  ArrayRCP<const SC> negMaxOffDiagonal;
447  // RS style needs the max negative off-diagonal, SA style needs the diagonal
448  if (useSignedClassicalRS) {
449  if (ghostedBlockNumber.is_null()) {
451  negMaxOffDiagonal = negMaxOffDiagonalVec->getData(0);
452  if (GetVerbLevel() & Statistics1)
453  GetOStream(Statistics1) << "Calculated max point off-diagonal" << std::endl;
454  } else {
455  auto negMaxOffDiagonalVec = MueLu::Utilities<SC, LO, GO, NO>::GetMatrixMaxMinusOffDiagonal(*A, *ghostedBlockNumber);
456  negMaxOffDiagonal = negMaxOffDiagonalVec->getData(0);
457  if (GetVerbLevel() & Statistics1)
458  GetOStream(Statistics1) << "Calculating max block off-diagonal" << std::endl;
459  }
460  } else {
462  if (classicalAlgo == defaultAlgo) {
463  ghostedDiagVals = ghostedDiag->getData(0);
464  }
465  }
466  auto boundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
467  if (rowSumTol > 0.) {
468  if (ghostedBlockNumber.is_null()) {
469  if (GetVerbLevel() & Statistics1)
470  GetOStream(Statistics1) << "Applying point row sum criterion." << std::endl;
471  Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes);
472  } else {
473  if (GetVerbLevel() & Statistics1)
474  GetOStream(Statistics1) << "Applying block row sum criterion." << std::endl;
475  Utilities::ApplyRowSumCriterionHost(*A, *ghostedBlockNumber, rowSumTol, boundaryNodes);
476  }
477  }
478 
479  ArrayRCP<const LO> g_block_id;
480  if (!ghostedBlockNumber.is_null())
481  g_block_id = ghostedBlockNumber->getData(0);
482 
483  LO realnnz = 0;
484  rows(0) = 0;
485  if (classicalAlgo == defaultAlgo) {
486  SubFactoryMonitor m1(*this, "Classical RS/SA", currentLevel);
487  for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
488  size_t nnz = A->getNumEntriesInLocalRow(row);
489  bool rowIsDirichlet = boundaryNodes[row];
490  ArrayView<const LO> indices;
491  ArrayView<const SC> vals;
492  A->getLocalRowView(row, indices, vals);
493 
494  // FIXME the current predrop function uses the following
495  // FIXME if(std::abs(vals[k]) > std::abs(threshold_) || grow == gcid )
496  // FIXME but the threshold doesn't take into account the rows' diagonal entries
497  // FIXME For now, hardwiring the dropping in here
498 
499  LO rownnz = 0;
500  if (useSignedClassicalRS) {
501  // Signed classical RS style
502  for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
503  LO col = indices[colID];
504  MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]);
505  MT neg_aij = -STS::real(vals[colID]);
506  /* if(row==1326) printf("A(%d,%d) = %6.4e, block = (%d,%d) neg_aij = %6.4e max_neg_aik = %6.4e\n",row,col,vals[colID],
507  g_block_id.is_null() ? -1 : g_block_id[row],
508  g_block_id.is_null() ? -1 : g_block_id[col],
509  neg_aij, max_neg_aik);*/
510  if ((!rowIsDirichlet && (g_block_id.is_null() || g_block_id[row] == g_block_id[col]) && neg_aij > max_neg_aik) || row == col) {
511  columns[realnnz++] = col;
512  rownnz++;
513  } else
514  numDropped++;
515  }
516  rows(row + 1) = realnnz;
517  } else if (useSignedClassicalSA) {
518  // Signed classical SA style
519  for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
520  LO col = indices[colID];
521 
522  bool is_nonpositive = STS::real(vals[colID]) <= 0;
523  MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj|
524  MT aij = is_nonpositive ? STS::magnitude(vals[colID] * vals[colID]) : (-STS::magnitude(vals[colID] * vals[colID])); // + |a_ij|^2, if a_ij < 0, - |a_ij|^2 if a_ij >=0
525  /*
526  if(row==1326) printf("A(%d,%d) = %6.4e, raw_aij = %6.4e aij = %6.4e aiiajj = %6.4e\n",row,col,vals[colID],
527  vals[colID],aij, aiiajj);
528  */
529 
530  if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
531  columns(realnnz++) = col;
532  rownnz++;
533  } else
534  numDropped++;
535  }
536  rows[row + 1] = realnnz;
537  } else {
538  // Standard abs classical
539  for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
540  LO col = indices[colID];
541  MT aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[col] * ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj|
542  MT aij = STS::magnitude(vals[colID] * vals[colID]); // |a_ij|^2
543 
544  if ((!rowIsDirichlet && aij > aiiajj) || row == col) {
545  columns(realnnz++) = col;
546  rownnz++;
547  } else
548  numDropped++;
549  }
550  rows(row + 1) = realnnz;
551  }
552  } // end for row
553  } else {
554  /* Cut Algorithm */
555  SubFactoryMonitor m1(*this, "Cut Drop", currentLevel);
556  using ExecSpace = typename Node::execution_space;
557  using TeamPol = Kokkos::TeamPolicy<ExecSpace>;
558  using TeamMem = typename TeamPol::member_type;
559  using ATS = Kokkos::ArithTraits<Scalar>;
560  using impl_scalar_type = typename ATS::val_type;
561  using implATS = Kokkos::ArithTraits<impl_scalar_type>;
562 
563  // move from host to device
564  auto ghostedDiagValsView = Kokkos::subview(ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadOnly), Kokkos::ALL(), 0);
565  auto thresholdKokkos = static_cast<impl_scalar_type>(threshold);
566  auto realThresholdKokkos = implATS::magnitude(thresholdKokkos);
567  auto columnsDevice = Kokkos::create_mirror_view(ExecSpace(), columns);
568 
569  auto A_device = A->getLocalMatrixDevice();
570  RCP<LWGraph> graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A"));
571  RCP<const Import> importer = A->getCrsGraph()->getImporter();
573  RCP<LocalOrdinalVector> boundaryColumnVector;
574  for (size_t i = 0; i < graph->GetNodeNumVertices(); i++) {
575  boundaryNodesVector->getDataNonConst(0)[i] = boundaryNodes[i];
576  }
577  if (!importer.is_null()) {
578  boundaryColumnVector = Xpetra::VectorFactory<LO, LO, GO, NO>::Build(graph->GetImportMap());
579  boundaryColumnVector->doImport(*boundaryNodesVector, *importer, Xpetra::INSERT);
580  } else {
581  boundaryColumnVector = boundaryNodesVector;
582  }
583  auto boundaryColumn = boundaryColumnVector->getDeviceLocalView(Xpetra::Access::ReadOnly);
584  auto boundary = Kokkos::subview(boundaryColumn, Kokkos::ALL(), 0);
585 
586  Kokkos::View<LO*, ExecSpace> rownnzView("rownnzView", A_device.numRows());
587  auto drop_views = Kokkos::View<bool*, ExecSpace>("drop_views", A_device.nnz());
588  auto index_views = Kokkos::View<size_t*, ExecSpace>("index_views", A_device.nnz());
589 
590  Kokkos::parallel_reduce(
591  "classical_cut", TeamPol(A_device.numRows(), Kokkos::AUTO), KOKKOS_LAMBDA(const TeamMem& teamMember, LO& globalnnz, GO& totalDropped) {
592  LO row = teamMember.league_rank();
593  auto rowView = A_device.rowConst(row);
594  size_t nnz = rowView.length;
595 
596  auto drop_view = Kokkos::subview(drop_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row + 1)));
597  auto index_view = Kokkos::subview(index_views, Kokkos::make_pair(A_device.graph.row_map(row), A_device.graph.row_map(row + 1)));
598 
599  // find magnitudes
600  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, (LO)nnz), [&](const LO colID) {
601  index_view(colID) = colID;
602  LO col = rowView.colidx(colID);
603  // ignore diagonals for now, they are checked again later
604  // Don't aggregate boundaries
605  if (row == col || boundary(col)) {
606  drop_view(colID) = true;
607  } else {
608  drop_view(colID) = false;
609  }
610  });
611 
612  size_t dropStart = nnz;
613  if (classicalAlgo == unscaled_cut) {
614  // push diagonals and boundaries to the right, sort everything else by aij on the left
615  Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
616  if (drop_view(x) || drop_view(y)) {
617  return drop_view(x) < drop_view(y);
618  } else {
619  auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x));
620  auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y));
621  return x_aij > y_aij;
622  }
623  });
624 
625  // find index where dropping starts
626  Kokkos::parallel_reduce(
627  Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) {
628  auto const& x = index_view(i - 1);
629  auto const& y = index_view(i);
630  typename implATS::magnitudeType x_aij = 0;
631  typename implATS::magnitudeType y_aij = 0;
632  if (!drop_view(x)) {
633  x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x));
634  }
635  if (!drop_view(y)) {
636  y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y));
637  }
638 
639  if (realThresholdKokkos * realThresholdKokkos * x_aij > y_aij) {
640  if (i < min) {
641  min = i;
642  }
643  }
644  },
645  Kokkos::Min<size_t>(dropStart));
646  } else if (classicalAlgo == scaled_cut) {
647  // push diagonals and boundaries to the right, sort everything else by aij/aiiajj on the left
648  Kokkos::Experimental::sort_team(teamMember, index_view, [=](size_t& x, size_t& y) -> bool {
649  if (drop_view(x) || drop_view(y)) {
650  return drop_view(x) < drop_view(y);
651  } else {
652  auto x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x));
653  auto y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y));
654  auto x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
655  auto y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
656  return (x_aij / x_aiiajj) > (y_aij / y_aiiajj);
657  }
658  });
659 
660  // find index where dropping starts
661  Kokkos::parallel_reduce(
662  Kokkos::TeamThreadRange(teamMember, 1, nnz), [=](size_t i, size_t& min) {
663  auto const& x = index_view(i - 1);
664  auto const& y = index_view(i);
665  typename implATS::magnitudeType x_val = 0;
666  typename implATS::magnitudeType y_val = 0;
667  if (!drop_view(x)) {
668  typename implATS::magnitudeType x_aij = implATS::magnitude(rowView.value(x) * rowView.value(x));
669  typename implATS::magnitudeType x_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(x)) * ghostedDiagValsView(row));
670  x_val = x_aij / x_aiiajj;
671  }
672  if (!drop_view(y)) {
673  typename implATS::magnitudeType y_aij = implATS::magnitude(rowView.value(y) * rowView.value(y));
674  typename implATS::magnitudeType y_aiiajj = implATS::magnitude(ghostedDiagValsView(rowView.colidx(y)) * ghostedDiagValsView(row));
675  y_val = y_aij / y_aiiajj;
676  }
677 
678  if (realThresholdKokkos * realThresholdKokkos * x_val > y_val) {
679  if (i < min) {
680  min = i;
681  }
682  }
683  },
684  Kokkos::Min<size_t>(dropStart));
685  }
686 
687  // drop everything to the right of where values stop passing threshold
688  if (dropStart < nnz) {
689  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, dropStart, nnz), [=](size_t i) {
690  drop_view(index_view(i)) = true;
691  });
692  }
693 
694  LO rownnz = 0;
695  GO rowDropped = 0;
696  Kokkos::parallel_reduce(
697  Kokkos::TeamThreadRange(teamMember, nnz), [=](const size_t idxID, LO& keep, GO& drop) {
698  LO col = rowView.colidx(idxID);
699  // don't drop diagonal
700  if (row == col || !drop_view(idxID)) {
701  columnsDevice(A_device.graph.row_map(row) + idxID) = col;
702  keep++;
703  } else {
704  columnsDevice(A_device.graph.row_map(row) + idxID) = -1;
705  drop++;
706  }
707  },
708  rownnz, rowDropped);
709 
710  globalnnz += rownnz;
711  totalDropped += rowDropped;
712  rownnzView(row) = rownnz;
713  },
714  realnnz, numDropped);
715 
716  // update column indices so that kept indices are aligned to the left for subview that happens later on
717  Kokkos::Experimental::remove(ExecSpace(), columnsDevice, -1);
718  Kokkos::deep_copy(columns, columnsDevice);
719 
720  // update row indices by adding up new # of nnz in each row
721  auto rowsDevice = Kokkos::create_mirror_view(ExecSpace(), rows);
722  Kokkos::parallel_scan(
723  Kokkos::RangePolicy<ExecSpace>(0, A_device.numRows()), KOKKOS_LAMBDA(const int i, LO& partial_sum, bool is_final) {
724  partial_sum += rownnzView(i);
725  if (is_final) rowsDevice(i + 1) = partial_sum;
726  });
727  Kokkos::deep_copy(rows, rowsDevice);
728  }
729 
730  numTotal = A->getLocalNumEntries();
731 
732  if (aggregationMayCreateDirichlet) {
733  // If the only element remaining after filtering is diagonal, mark node as boundary
734  for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
735  if (rows[row + 1] - rows[row] <= 1)
736  boundaryNodes[row] = true;
737  }
738  }
739 
740  RCP<LWGraph> graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), A->getRowMap(), A->getColMap(), "thresholded graph of A"));
741  graph->SetBoundaryNodeMap(boundaryNodes);
742  if (GetVerbLevel() & Statistics1) {
743  GO numLocalBoundaryNodes = 0;
744  GO numGlobalBoundaryNodes = 0;
745  for (size_t i = 0; i < boundaryNodes.size(); ++i)
746  if (boundaryNodes(i))
747  numLocalBoundaryNodes++;
748  RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
749  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
750  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
751  }
752  Set(currentLevel, "Graph", graph);
753  Set(currentLevel, "DofsPerNode", 1);
754 
755  // If we're doing signed classical, we might want to block-diagonalize *after* the dropping
756  if (generateColoringGraph) {
757  RCP<LWGraph> colorGraph;
758  RCP<const Import> importer = A->getCrsGraph()->getImporter();
759  BlockDiagonalizeGraph(graph, ghostedBlockNumber, colorGraph, importer);
760  Set(currentLevel, "Coloring Graph", colorGraph);
761  // #define CMS_DUMP
762 #ifdef CMS_DUMP
763  {
764  Xpetra::IO<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Write("m_regular_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast<LWGraph>(graph)->GetCrsGraph());
765  Xpetra::IO<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Write("m_color_graph." + std::to_string(currentLevel.GetLevelID()), *rcp_dynamic_cast<LWGraph>(colorGraph)->GetCrsGraph());
766  // int rank = graph->GetDomainMap()->getComm()->getRank();
767  // {
768  // std::ofstream ofs(std::string("m_color_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out);
769  // RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs));
770  // colorGraph->print(*fancy,Debug);
771  // }
772  // {
773  // std::ofstream ofs(std::string("m_regular_graph_") + std::to_string(currentLevel.GetLevelID())+std::string("_") + std::to_string(rank) + std::string(".dat"),std::ofstream::out);
774  // RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs));
775  // graph->print(*fancy,Debug);
776  // }
777  }
778 #endif
779  } // end generateColoringGraph
780  } else if (BlockSize > 1 && threshold == STS::zero()) {
781  // Case 3: Multiple DOF/node problem without dropping
782  const RCP<const Map> rowMap = A->getRowMap();
783  const RCP<const Map> colMap = A->getColMap();
784 
785  graphType = "amalgamated";
786 
787  // build node row map (uniqueMap) and node column map (nonUniqueMap)
788  // the arrays rowTranslation and colTranslation contain the local node id
789  // given a local dof id. The data is calculated by the AmalgamationFactory and
790  // stored in the variable container "UnAmalgamationInfo"
791  RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
792  RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
793  Array<LO> rowTranslation = *(amalInfo->getRowTranslation());
794  Array<LO> colTranslation = *(amalInfo->getColTranslation());
795 
796  // get number of local nodes
797  LO numRows = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
798 
799  // Allocate space for the local graph
800  typename LWGraph::row_type::non_const_type rows("rows", numRows + 1);
801  typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
802 
803  typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows);
804  Kokkos::deep_copy(amalgBoundaryNodes, false);
805 
806  // Detect and record rows that correspond to Dirichlet boundary conditions
807  // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual. Size
808  // TODO the array one bigger than the number of local rows, and the last entry can
809  // TODO hold the actual number of boundary nodes. Clever, huh?
810  ArrayRCP<bool> pointBoundaryNodes;
811  pointBoundaryNodes = Teuchos::arcp_const_cast<bool>(MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows(*A, dirichletThreshold));
812  if (rowSumTol > 0.)
813  Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes);
814 
815  // extract striding information
816  LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
817  LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map)
818  LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map
819  if (A->IsView("stridedMaps") == true) {
820  Teuchos::RCP<const Map> myMap = A->getRowMap("stridedMaps");
821  Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
822  TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
823  blkSize = Teuchos::as<const LO>(strMap->getFixedBlockSize());
824  blkId = strMap->getStridedBlockId();
825  if (blkId > -1)
826  blkPartSize = Teuchos::as<LO>(strMap->getStridingData()[blkId]);
827  }
828 
829  // loop over all local nodes
830  LO realnnz = 0;
831  rows(0) = 0;
832  Array<LO> indicesExtra;
833  for (LO row = 0; row < numRows; row++) {
834  ArrayView<const LO> indices;
835  indicesExtra.resize(0);
836 
837  // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet
838  // Note, that pointBoundaryNodes lives on the dofmap (and not the node map).
839  // Therefore, looping over all dofs is fine here. We use blkPartSize as we work
840  // with local ids.
841  // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet)
842  // node.
843  bool isBoundary = false;
844  if (pL.get<bool>("aggregation: greedy Dirichlet") == true) {
845  for (LO j = 0; j < blkPartSize; j++) {
846  if (pointBoundaryNodes[row * blkPartSize + j]) {
847  isBoundary = true;
848  break;
849  }
850  }
851  } else {
852  isBoundary = true;
853  for (LO j = 0; j < blkPartSize; j++) {
854  if (!pointBoundaryNodes[row * blkPartSize + j]) {
855  isBoundary = false;
856  break;
857  }
858  }
859  }
860 
861  // Merge rows of A
862  // The array indicesExtra contains local column node ids for the current local node "row"
863  if (!isBoundary)
864  MergeRows(*A, row, indicesExtra, colTranslation);
865  else
866  indicesExtra.push_back(row);
867  indices = indicesExtra;
868  numTotal += indices.size();
869 
870  // add the local column node ids to the full columns array which
871  // contains the local column node ids for all local node rows
872  LO nnz = indices.size(), rownnz = 0;
873  for (LO colID = 0; colID < nnz; colID++) {
874  LO col = indices[colID];
875  columns(realnnz++) = col;
876  rownnz++;
877  }
878 
879  if (rownnz == 1) {
880  // If the only element remaining after filtering is diagonal, mark node as boundary
881  // FIXME: this should really be replaced by the following
882  // if (indices.size() == 1 && indices[0] == row)
883  // boundaryNodes[row] = true;
884  // We do not do it this way now because there is no framework for distinguishing isolated
885  // and boundary nodes in the aggregation algorithms
886  amalgBoundaryNodes[row] = true;
887  }
888  rows(row + 1) = realnnz;
889  } // for (LO row = 0; row < numRows; row++)
890 
891  RCP<LWGraph> graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A"));
892  graph->SetBoundaryNodeMap(amalgBoundaryNodes);
893 
894  if (GetVerbLevel() & Statistics1) {
895  GO numLocalBoundaryNodes = 0;
896  GO numGlobalBoundaryNodes = 0;
897 
898  for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
899  if (amalgBoundaryNodes(i))
900  numLocalBoundaryNodes++;
901 
902  RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
903  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
904  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes
905  << " agglomerated Dirichlet nodes" << std::endl;
906  }
907 
908  Set(currentLevel, "Graph", graph);
909  Set(currentLevel, "DofsPerNode", blkSize); // full block size
910 
911  } else if (BlockSize > 1 && threshold != STS::zero()) {
912  // Case 4: Multiple DOF/node problem with dropping
913  const RCP<const Map> rowMap = A->getRowMap();
914  const RCP<const Map> colMap = A->getColMap();
915  graphType = "amalgamated";
916 
917  // build node row map (uniqueMap) and node column map (nonUniqueMap)
918  // the arrays rowTranslation and colTranslation contain the local node id
919  // given a local dof id. The data is calculated by the AmalgamationFactory and
920  // stored in the variable container "UnAmalgamationInfo"
921  RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
922  RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
923  Array<LO> rowTranslation = *(amalInfo->getRowTranslation());
924  Array<LO> colTranslation = *(amalInfo->getColTranslation());
925 
926  // get number of local nodes
927  LO numRows = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
928 
929  // Allocate space for the local graph
930  typename LWGraph::row_type::non_const_type rows("rows", numRows + 1);
931  typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
932 
933  typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows);
934  Kokkos::deep_copy(amalgBoundaryNodes, false);
935 
936  // Detect and record rows that correspond to Dirichlet boundary conditions
937  // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual. Size
938  // TODO the array one bigger than the number of local rows, and the last entry can
939  // TODO hold the actual number of boundary nodes. Clever, huh?
940  auto pointBoundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
941  if (rowSumTol > 0.)
942  Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes);
943 
944  // extract striding information
945  LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
946  LO blkId = -1; //< the block id within the strided map (or -1 if it is a full block map)
947  LO blkPartSize = A->GetFixedBlockSize(); //< stores the size of the block within the strided map
948  if (A->IsView("stridedMaps") == true) {
949  Teuchos::RCP<const Map> myMap = A->getRowMap("stridedMaps");
950  Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
951  TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
952  blkSize = Teuchos::as<const LO>(strMap->getFixedBlockSize());
953  blkId = strMap->getStridedBlockId();
954  if (blkId > -1)
955  blkPartSize = Teuchos::as<LO>(strMap->getStridingData()[blkId]);
956  }
957 
958  // extract diagonal data for dropping strategy
960  const ArrayRCP<const SC> ghostedDiagVals = ghostedDiag->getData(0);
961 
962  // loop over all local nodes
963  LO realnnz = 0;
964  rows[0] = 0;
965  Array<LO> indicesExtra;
966  for (LO row = 0; row < numRows; row++) {
967  ArrayView<const LO> indices;
968  indicesExtra.resize(0);
969 
970  // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet
971  // Note, that pointBoundaryNodes lives on the dofmap (and not the node map).
972  // Therefore, looping over all dofs is fine here. We use blkPartSize as we work
973  // with local ids.
974  // TODO: Here we have different options of how to define a node to be a boundary (or Dirichlet)
975  // node.
976  bool isBoundary = false;
977  if (pL.get<bool>("aggregation: greedy Dirichlet") == true) {
978  for (LO j = 0; j < blkPartSize; j++) {
979  if (pointBoundaryNodes[row * blkPartSize + j]) {
980  isBoundary = true;
981  break;
982  }
983  }
984  } else {
985  isBoundary = true;
986  for (LO j = 0; j < blkPartSize; j++) {
987  if (!pointBoundaryNodes[row * blkPartSize + j]) {
988  isBoundary = false;
989  break;
990  }
991  }
992  }
993 
994  // Merge rows of A
995  // The array indicesExtra contains local column node ids for the current local node "row"
996  if (!isBoundary)
997  MergeRowsWithDropping(*A, row, ghostedDiagVals, threshold, indicesExtra, colTranslation);
998  else
999  indicesExtra.push_back(row);
1000  indices = indicesExtra;
1001  numTotal += indices.size();
1002 
1003  // add the local column node ids to the full columns array which
1004  // contains the local column node ids for all local node rows
1005  LO nnz = indices.size(), rownnz = 0;
1006  for (LO colID = 0; colID < nnz; colID++) {
1007  LO col = indices[colID];
1008  columns[realnnz++] = col;
1009  rownnz++;
1010  }
1011 
1012  if (rownnz == 1) {
1013  // If the only element remaining after filtering is diagonal, mark node as boundary
1014  // FIXME: this should really be replaced by the following
1015  // if (indices.size() == 1 && indices[0] == row)
1016  // boundaryNodes[row] = true;
1017  // We do not do it this way now because there is no framework for distinguishing isolated
1018  // and boundary nodes in the aggregation algorithms
1019  amalgBoundaryNodes[row] = true;
1020  }
1021  rows[row + 1] = realnnz;
1022  } // for (LO row = 0; row < numRows; row++)
1023  // columns.resize(realnnz);
1024 
1025  RCP<LWGraph> graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A"));
1026  graph->SetBoundaryNodeMap(amalgBoundaryNodes);
1027 
1028  if (GetVerbLevel() & Statistics1) {
1029  GO numLocalBoundaryNodes = 0;
1030  GO numGlobalBoundaryNodes = 0;
1031 
1032  for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
1033  if (amalgBoundaryNodes(i))
1034  numLocalBoundaryNodes++;
1035 
1036  RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
1037  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
1038  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes
1039  << " agglomerated Dirichlet nodes" << std::endl;
1040  }
1041 
1042  Set(currentLevel, "Graph", graph);
1043  Set(currentLevel, "DofsPerNode", blkSize); // full block size
1044  }
1045 
1046  } else if (algo == "distance laplacian") {
1047  LO blkSize = A->GetFixedBlockSize();
1048  GO indexBase = A->getRowMap()->getIndexBase();
1049  // [*0*] : FIXME
1050  // ap: somehow, if I move this line to [*1*], Belos throws an error
1051  // I'm not sure what's going on. Do we always have to Get data, if we did
1052  // DeclareInput for it?
1053  // RCP<RealValuedMultiVector> Coords = Get< RCP<RealValuedMultiVector > >(currentLevel, "Coordinates");
1054 
1055  // Detect and record rows that correspond to Dirichlet boundary conditions
1056  // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual. Size
1057  // TODO the array one bigger than the number of local rows, and the last entry can
1058  // TODO hold the actual number of boundary nodes. Clever, huh?
1059  auto pointBoundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
1060  if (rowSumTol > 0.)
1061  Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, pointBoundaryNodes);
1062 
1063  if ((blkSize == 1) && (threshold == STS::zero())) {
1064  // Trivial case: scalar problem, no dropping. Can return original graph
1065  RCP<LWGraph> graph = rcp(new LWGraph(A->getCrsGraph(), "graph of A"));
1066  graph->SetBoundaryNodeMap(pointBoundaryNodes);
1067  graphType = "unamalgamated";
1068  numTotal = A->getLocalNumEntries();
1069 
1070  if (GetVerbLevel() & Statistics1) {
1071  GO numLocalBoundaryNodes = 0;
1072  GO numGlobalBoundaryNodes = 0;
1073  for (size_t i = 0; i < pointBoundaryNodes.size(); ++i)
1074  if (pointBoundaryNodes(i))
1075  numLocalBoundaryNodes++;
1076  RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
1077  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
1078  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
1079  }
1080 
1081  Set(currentLevel, "DofsPerNode", blkSize);
1082  Set(currentLevel, "Graph", graph);
1083 
1084  } else {
1085  // ap: We make quite a few assumptions here; general case may be a lot different,
1086  // but much much harder to implement. We assume that:
1087  // 1) all maps are standard maps, not strided maps
1088  // 2) global indices of dofs in A are related to dofs in coordinates in a simple arithmetic
1089  // way: rows i*blkSize, i*blkSize+1, ..., i*blkSize + (blkSize-1) correspond to node i
1090  //
1091  // NOTE: Potentially, some of the code below could be simplified with UnAmalgamationInfo,
1092  // but as I totally don't understand that code, here is my solution
1093 
1094  // [*1*]: see [*0*]
1095 
1096  // Check that the number of local coordinates is consistent with the #rows in A
1097  TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() / blkSize != Coords->getLocalLength(), Exceptions::Incompatible,
1098  "Coordinate vector length (" << Coords->getLocalLength() << ") is incompatible with number of rows in A (" << A->getRowMap()->getLocalNumElements() << ") by modulo block size (" << blkSize << ").");
1099 
1100  const RCP<const Map> colMap = A->getColMap();
1101  RCP<const Map> uniqueMap, nonUniqueMap;
1102  Array<LO> colTranslation;
1103  if (blkSize == 1) {
1104  uniqueMap = A->getRowMap();
1105  nonUniqueMap = A->getColMap();
1106  graphType = "unamalgamated";
1107 
1108  } else {
1109  uniqueMap = Coords->getMap();
1110  TEUCHOS_TEST_FOR_EXCEPTION(uniqueMap->getIndexBase() != indexBase, Exceptions::Incompatible,
1111  "Different index bases for matrix and coordinates");
1112 
1113  AmalgamationFactory::AmalgamateMap(*(A->getColMap()), *A, nonUniqueMap, colTranslation);
1114 
1115  graphType = "amalgamated";
1116  }
1117  LO numRows = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
1118 
1119  RCP<RealValuedMultiVector> ghostedCoords;
1120  RCP<Vector> ghostedLaplDiag;
1121  Teuchos::ArrayRCP<SC> ghostedLaplDiagData;
1122  if (threshold != STS::zero()) {
1123  // Get ghost coordinates
1124  RCP<const Import> importer;
1125  {
1126  SubFactoryMonitor m1(*this, "Import construction", currentLevel);
1127  if (blkSize == 1 && realA->getCrsGraph()->getImporter() != Teuchos::null) {
1128  GetOStream(Warnings1) << "Using existing importer from matrix graph" << std::endl;
1129  importer = realA->getCrsGraph()->getImporter();
1130  } else {
1131  GetOStream(Warnings0) << "Constructing new importer instance" << std::endl;
1132  importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
1133  }
1134  } // subtimer
1135  ghostedCoords = Xpetra::MultiVectorFactory<real_type, LO, GO, NO>::Build(nonUniqueMap, Coords->getNumVectors());
1136  {
1137  SubFactoryMonitor m1(*this, "Coordinate import", currentLevel);
1138  ghostedCoords->doImport(*Coords, *importer, Xpetra::INSERT);
1139  } // subtimer
1140 
1141  // Construct Distance Laplacian diagonal
1142  RCP<Vector> localLaplDiag = VectorFactory::Build(uniqueMap);
1143  Array<LO> indicesExtra;
1145  if (threshold != STS::zero()) {
1146  const size_t numVectors = ghostedCoords->getNumVectors();
1147  coordData.reserve(numVectors);
1148  for (size_t j = 0; j < numVectors; j++) {
1149  Teuchos::ArrayRCP<const real_type> tmpData = ghostedCoords->getData(j);
1150  coordData.push_back(tmpData);
1151  }
1152  }
1153  {
1154  SubFactoryMonitor m1(*this, "Laplacian local diagonal", currentLevel);
1155  ArrayRCP<SC> localLaplDiagData = localLaplDiag->getDataNonConst(0);
1156  for (LO row = 0; row < numRows; row++) {
1157  ArrayView<const LO> indices;
1158 
1159  if (blkSize == 1) {
1160  ArrayView<const SC> vals;
1161  A->getLocalRowView(row, indices, vals);
1162 
1163  } else {
1164  // Merge rows of A
1165  indicesExtra.resize(0);
1166  MergeRows(*A, row, indicesExtra, colTranslation);
1167  indices = indicesExtra;
1168  }
1169 
1170  LO nnz = indices.size();
1171  bool haveAddedToDiag = false;
1172  for (LO colID = 0; colID < nnz; colID++) {
1173  const LO col = indices[colID];
1174 
1175  if (row != col) {
1176  if (use_dlap_weights == SINGLE_WEIGHTS) {
1177  /*printf("[%d,%d] Unweighted Distance = %6.4e Weighted Distance = %6.4e\n",row,col,
1178  MueLu::Utilities<real_type,LO,GO,NO>::Distance2(coordData, row, col),
1179  MueLu::Utilities<real_type,LO,GO,NO>::Distance2(dlap_weights(),coordData, row, col));*/
1180  localLaplDiagData[row] += STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(), coordData, row, col);
1181  } else if (use_dlap_weights == BLOCK_WEIGHTS) {
1182  int block_id = row % interleaved_blocksize;
1183  int block_start = block_id * interleaved_blocksize;
1184  localLaplDiagData[row] += STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col);
1185  } else {
1186  // printf("[%d,%d] Unweighted Distance = %6.4e\n",row,col,MueLu::Utilities<real_type,LO,GO,NO>::Distance2(coordData, row, col));
1187  localLaplDiagData[row] += STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(coordData, row, col);
1188  }
1189  haveAddedToDiag = true;
1190  }
1191  }
1192  // Deal with the situation where boundary conditions have only been enforced on rows, but not on columns.
1193  // We enforce dropping of these entries by assigning a very large number to the diagonal entries corresponding to BCs.
1194  if (!haveAddedToDiag)
1195  localLaplDiagData[row] = STS::rmax();
1196  }
1197  } // subtimer
1198  {
1199  SubFactoryMonitor m1(*this, "Laplacian distributed diagonal", currentLevel);
1200  ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
1201  ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
1202  ghostedLaplDiagData = ghostedLaplDiag->getDataNonConst(0);
1203  } // subtimer
1204 
1205  } else {
1206  GetOStream(Runtime0) << "Skipping distance laplacian construction due to 0 threshold" << std::endl;
1207  }
1208 
1209  // NOTE: ghostedLaplDiagData might be zero if we don't actually calculate the laplacian
1210 
1211  // allocate space for the local graph
1212  typename LWGraph::row_type::non_const_type rows("rows", numRows + 1);
1213  typename LWGraph::entries_type::non_const_type columns("columns", A->getLocalNumEntries());
1214 
1215 #ifdef HAVE_MUELU_DEBUG
1216  // DEBUGGING
1217  for (LO i = 0; i < (LO)columns.size(); i++) columns[i] = -666;
1218 #endif
1219 
1220  // Extra array for if we're allowing symmetrization with cutting
1221  ArrayRCP<LO> rows_stop;
1222  bool use_stop_array = threshold != STS::zero() && distanceLaplacianAlgo == scaled_cut_symmetric;
1223  if (use_stop_array)
1224  // rows_stop = typename LWGraph::row_type::non_const_type("rows_stop", numRows);
1225  rows_stop.resize(numRows);
1226 
1227  typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numRows);
1228  Kokkos::deep_copy(amalgBoundaryNodes, false);
1229 
1230  LO realnnz = 0;
1231  rows(0) = 0;
1232 
1233  Array<LO> indicesExtra;
1234  {
1235  SubFactoryMonitor m1(*this, "Laplacian dropping", currentLevel);
1237  if (threshold != STS::zero()) {
1238  const size_t numVectors = ghostedCoords->getNumVectors();
1239  coordData.reserve(numVectors);
1240  for (size_t j = 0; j < numVectors; j++) {
1241  Teuchos::ArrayRCP<const real_type> tmpData = ghostedCoords->getData(j);
1242  coordData.push_back(tmpData);
1243  }
1244  }
1245 
1246  ArrayView<const SC> vals; // CMS hackery
1247  for (LO row = 0; row < numRows; row++) {
1248  ArrayView<const LO> indices;
1249  indicesExtra.resize(0);
1250  bool isBoundary = false;
1251 
1252  if (blkSize == 1) {
1253  // ArrayView<const SC> vals;//CMS uncomment
1254  A->getLocalRowView(row, indices, vals);
1255  isBoundary = pointBoundaryNodes[row];
1256  } else {
1257  // The amalgamated row is marked as Dirichlet iff all point rows are Dirichlet
1258  isBoundary = true;
1259  for (LO j = 0; j < blkSize; j++) {
1260  if (!pointBoundaryNodes[row * blkSize + j]) {
1261  isBoundary = false;
1262  break;
1263  }
1264  }
1265 
1266  // Merge rows of A
1267  if (!isBoundary)
1268  MergeRows(*A, row, indicesExtra, colTranslation);
1269  else
1270  indicesExtra.push_back(row);
1271  indices = indicesExtra;
1272  }
1273  numTotal += indices.size();
1274 
1275  LO nnz = indices.size(), rownnz = 0;
1276 
1277  if (use_stop_array) {
1278  rows(row + 1) = rows(row) + nnz;
1279  realnnz = rows(row);
1280  }
1281 
1282  if (threshold != STS::zero()) {
1283  // default
1284  if (distanceLaplacianAlgo == defaultAlgo) {
1285  /* Standard Distance Laplacian */
1286  for (LO colID = 0; colID < nnz; colID++) {
1287  LO col = indices[colID];
1288 
1289  if (row == col) {
1290  columns(realnnz++) = col;
1291  rownnz++;
1292  continue;
1293  }
1294 
1295  // We do not want the distance Laplacian aggregating boundary nodes
1296  if (isBoundary) continue;
1297 
1298  SC laplVal;
1299  if (use_dlap_weights == SINGLE_WEIGHTS) {
1300  laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(), coordData, row, col);
1301  } else if (use_dlap_weights == BLOCK_WEIGHTS) {
1302  int block_id = row % interleaved_blocksize;
1303  int block_start = block_id * interleaved_blocksize;
1304  laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col);
1305  } else {
1306  laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(coordData, row, col);
1307  }
1308  real_type aiiajj = STS::magnitude(realThreshold * realThreshold * ghostedLaplDiagData[row] * ghostedLaplDiagData[col]);
1309  real_type aij = STS::magnitude(laplVal * laplVal);
1310 
1311  if (aij > aiiajj) {
1312  columns(realnnz++) = col;
1313  rownnz++;
1314  } else {
1315  numDropped++;
1316  }
1317  }
1318  } else {
1319  /* Cut Algorithm */
1320  using DropTol = Details::DropTol<real_type, LO>;
1321  std::vector<DropTol> drop_vec;
1322  drop_vec.reserve(nnz);
1323  const real_type zero = Teuchos::ScalarTraits<real_type>::zero();
1324  const real_type one = Teuchos::ScalarTraits<real_type>::one();
1325 
1326  // find magnitudes
1327  for (LO colID = 0; colID < nnz; colID++) {
1328  LO col = indices[colID];
1329 
1330  if (row == col) {
1331  drop_vec.emplace_back(zero, one, colID, false);
1332  continue;
1333  }
1334  // We do not want the distance Laplacian aggregating boundary nodes
1335  if (isBoundary) continue;
1336 
1337  SC laplVal;
1338  if (use_dlap_weights == SINGLE_WEIGHTS) {
1339  laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(), coordData, row, col);
1340  } else if (use_dlap_weights == BLOCK_WEIGHTS) {
1341  int block_id = row % interleaved_blocksize;
1342  int block_start = block_id * interleaved_blocksize;
1343  laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(dlap_weights(block_start, interleaved_blocksize), coordData, row, col);
1344  } else {
1345  laplVal = STS::one() / MueLu::Utilities<real_type, LO, GO, NO>::Distance2(coordData, row, col);
1346  }
1347 
1348  real_type aiiajj = STS::magnitude(ghostedLaplDiagData[row] * ghostedLaplDiagData[col]);
1349  real_type aij = STS::magnitude(laplVal * laplVal);
1350 
1351  drop_vec.emplace_back(aij, aiiajj, colID, false);
1352  }
1353 
1354  const size_t n = drop_vec.size();
1355 
1356  if (distanceLaplacianAlgo == unscaled_cut) {
1357  std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
1358  return a.val > b.val;
1359  });
1360 
1361  bool drop = false;
1362  for (size_t i = 1; i < n; ++i) {
1363  if (!drop) {
1364  auto const& x = drop_vec[i - 1];
1365  auto const& y = drop_vec[i];
1366  auto a = x.val;
1367  auto b = y.val;
1368  if (realThreshold * realThreshold * a > b) {
1369  drop = true;
1370 #ifdef HAVE_MUELU_DEBUG
1371  if (distanceLaplacianCutVerbose) {
1372  std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl;
1373  }
1374 #endif
1375  }
1376  }
1377  drop_vec[i].drop = drop;
1378  }
1379  } else if (distanceLaplacianAlgo == scaled_cut || distanceLaplacianAlgo == scaled_cut_symmetric) {
1380  std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
1381  return a.val / a.diag > b.val / b.diag;
1382  });
1383 
1384  bool drop = false;
1385  for (size_t i = 1; i < n; ++i) {
1386  if (!drop) {
1387  auto const& x = drop_vec[i - 1];
1388  auto const& y = drop_vec[i];
1389  auto a = x.val / x.diag;
1390  auto b = y.val / y.diag;
1391  if (realThreshold * realThreshold * a > b) {
1392  drop = true;
1393 #ifdef HAVE_MUELU_DEBUG
1394  if (distanceLaplacianCutVerbose) {
1395  std::cout << "DJS: KEEP, N, ROW: " << i + 1 << ", " << n << ", " << row << std::endl;
1396  }
1397 #endif
1398  }
1399  }
1400  drop_vec[i].drop = drop;
1401  }
1402  }
1403 
1404  std::sort(drop_vec.begin(), drop_vec.end(), [](DropTol const& a, DropTol const& b) {
1405  return a.col < b.col;
1406  });
1407 
1408  for (LO idxID = 0; idxID < (LO)drop_vec.size(); idxID++) {
1409  LO col = indices[drop_vec[idxID].col];
1410 
1411  // don't drop diagonal
1412  if (row == col) {
1413  columns(realnnz++) = col;
1414  rownnz++;
1415  // printf("(%d,%d) KEEP %13s matrix = %6.4e\n",row,row,"DIAGONAL",drop_vec[idxID].aux_val);
1416  continue;
1417  }
1418 
1419  if (!drop_vec[idxID].drop) {
1420  columns(realnnz++) = col;
1421  // printf("(%d,%d) KEEP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val);
1422  rownnz++;
1423  } else {
1424  // printf("(%d,%d) DROP dlap = %6.4e matrix = %6.4e\n",row,col,drop_vec[idxID].val/drop_vec[idxID].diag,drop_vec[idxID].aux_val);
1425  numDropped++;
1426  }
1427  }
1428  }
1429  } else {
1430  // Skip laplace calculation and threshold comparison for zero threshold
1431  for (LO colID = 0; colID < nnz; colID++) {
1432  LO col = indices[colID];
1433  columns(realnnz++) = col;
1434  rownnz++;
1435  }
1436  }
1437 
1438  if (rownnz == 1) {
1439  // If the only element remaining after filtering is diagonal, mark node as boundary
1440  // FIXME: this should really be replaced by the following
1441  // if (indices.size() == 1 && indices[0] == row)
1442  // boundaryNodes[row] = true;
1443  // We do not do it this way now because there is no framework for distinguishing isolated
1444  // and boundary nodes in the aggregation algorithms
1445  amalgBoundaryNodes[row] = true;
1446  }
1447 
1448  if (use_stop_array)
1449  rows_stop[row] = rownnz + rows[row];
1450  else
1451  rows[row + 1] = realnnz;
1452  } // for (LO row = 0; row < numRows; row++)
1453 
1454  } // subtimer
1455 
1456  if (use_stop_array) {
1457  // Do symmetrization of the cut matrix
1458  // NOTE: We assume nested row/column maps here
1459  for (LO row = 0; row < numRows; row++) {
1460  for (LO colidx = rows[row]; colidx < rows_stop[row]; colidx++) {
1461  LO col = columns[colidx];
1462  if (col >= numRows) continue;
1463 
1464  bool found = false;
1465  for (LO t_col = rows(col); !found && t_col < rows_stop[col]; t_col++) {
1466  if (columns[t_col] == row)
1467  found = true;
1468  }
1469  // We didn't find the transpose buddy, so let's symmetrize, unless we'd be symmetrizing
1470  // into a Dirichlet unknown. In that case don't.
1471  if (!found && !pointBoundaryNodes[col] && Teuchos::as<typename LWGraph::row_type::value_type>(rows_stop[col]) < rows[col + 1]) {
1472  LO new_idx = rows_stop[col];
1473  // printf("(%d,%d) SYMADD entry\n",col,row);
1474  columns[new_idx] = row;
1475  rows_stop[col]++;
1476  numDropped--;
1477  }
1478  }
1479  }
1480 
1481  // Condense everything down
1482  LO current_start = 0;
1483  for (LO row = 0; row < numRows; row++) {
1484  LO old_start = current_start;
1485  for (LO col = rows(row); col < rows_stop[row]; col++) {
1486  if (current_start != col) {
1487  columns(current_start) = columns(col);
1488  }
1489  current_start++;
1490  }
1491  rows[row] = old_start;
1492  }
1493  rows(numRows) = realnnz = current_start;
1494  }
1495 
1496  RCP<LWGraph> graph;
1497  {
1498  SubFactoryMonitor m1(*this, "Build amalgamated graph", currentLevel);
1499  graph = rcp(new LWGraph(rows, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), uniqueMap, nonUniqueMap, "amalgamated graph of A"));
1500  graph->SetBoundaryNodeMap(amalgBoundaryNodes);
1501  } // subtimer
1502 
1503  if (GetVerbLevel() & Statistics1) {
1504  GO numLocalBoundaryNodes = 0;
1505  GO numGlobalBoundaryNodes = 0;
1506 
1507  for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
1508  if (amalgBoundaryNodes(i))
1509  numLocalBoundaryNodes++;
1510 
1511  RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
1512  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
1513  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " agglomerated Dirichlet nodes"
1514  << " using threshold " << dirichletThreshold << std::endl;
1515  }
1516 
1517  Set(currentLevel, "Graph", graph);
1518  Set(currentLevel, "DofsPerNode", blkSize);
1519  }
1520  }
1521 
1522  if (GetVerbLevel() & Statistics1) {
1523  RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
1524  GO numGlobalTotal, numGlobalDropped;
1525  MueLu_sumAll(comm, numTotal, numGlobalTotal);
1526  MueLu_sumAll(comm, numDropped, numGlobalDropped);
1527  GetOStream(Statistics1) << "Number of dropped entries in " << graphType << " matrix graph: " << numGlobalDropped << "/" << numGlobalTotal;
1528  if (numGlobalTotal != 0)
1529  GetOStream(Statistics1) << " (" << 100 * Teuchos::as<double>(numGlobalDropped) / Teuchos::as<double>(numGlobalTotal) << "%)";
1530  GetOStream(Statistics1) << std::endl;
1531  }
1532 
1533  } else {
1534  // what Tobias has implemented
1535 
1536  SC threshold = as<SC>(pL.get<double>("aggregation: drop tol"));
1537  // GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
1538  GetOStream(Runtime0) << "algorithm = \""
1539  << "failsafe"
1540  << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
1541  Set<bool>(currentLevel, "Filtering", (threshold != STS::zero()));
1542 
1543  RCP<const Map> rowMap = A->getRowMap();
1544  RCP<const Map> colMap = A->getColMap();
1545 
1546  LO blockdim = 1; // block dim for fixed size blocks
1547  GO indexBase = rowMap->getIndexBase(); // index base of maps
1548  GO offset = 0;
1549 
1550  // 1) check for blocking/striding information
1551  if (A->IsView("stridedMaps") &&
1552  Teuchos::rcp_dynamic_cast<const StridedMap>(A->getRowMap("stridedMaps")) != Teuchos::null) {
1553  Xpetra::viewLabel_t oldView = A->SwitchToView("stridedMaps"); // note: "stridedMaps are always non-overlapping (correspond to range and domain maps!)
1554  RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(A->getRowMap());
1555  TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null, Exceptions::BadCast, "MueLu::CoalesceFactory::Build: cast to strided row map failed.");
1556  blockdim = strMap->getFixedBlockSize();
1557  offset = strMap->getOffset();
1558  oldView = A->SwitchToView(oldView);
1559  GetOStream(Statistics1) << "CoalesceDropFactory::Build():"
1560  << " found blockdim=" << blockdim << " from strided maps. offset=" << offset << std::endl;
1561  } else
1562  GetOStream(Statistics1) << "CoalesceDropFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl;
1563 
1564  // 2) get row map for amalgamated matrix (graph of A)
1565  // with same distribution over all procs as row map of A
1566  RCP<const Map> nodeMap = amalInfo->getNodeRowMap();
1567  GetOStream(Statistics1) << "CoalesceDropFactory: nodeMap " << nodeMap->getLocalNumElements() << "/" << nodeMap->getGlobalNumElements() << " elements" << std::endl;
1568 
1569  // 3) create graph of amalgamated matrix
1570  RCP<CrsGraph> crsGraph = CrsGraphFactory::Build(nodeMap, A->getLocalMaxNumRowEntries() * blockdim);
1571 
1572  LO numRows = A->getRowMap()->getLocalNumElements();
1573  LO numNodes = nodeMap->getLocalNumElements();
1574  typename LWGraph::boundary_nodes_type amalgBoundaryNodes("amalgBoundaryNodes", numNodes);
1575  Kokkos::deep_copy(amalgBoundaryNodes, false);
1576  const ArrayRCP<int> numberDirichletRowsPerNode(numNodes, 0); // helper array counting the number of Dirichlet nodes associated with node
1577  bool bIsDiagonalEntry = false; // boolean flag stating that grid==gcid
1578 
1579  // 4) do amalgamation. generate graph of amalgamated matrix
1580  // Note, this code is much more inefficient than the leightwight implementation
1581  // Most of the work has already been done in the AmalgamationFactory
1582  for (LO row = 0; row < numRows; row++) {
1583  // get global DOF id
1584  GO grid = rowMap->getGlobalElement(row);
1585 
1586  // reinitialize boolean helper variable
1587  bIsDiagonalEntry = false;
1588 
1589  // translate grid to nodeid
1590  GO nodeId = AmalgamationFactory::DOFGid2NodeId(grid, blockdim, offset, indexBase);
1591 
1592  size_t nnz = A->getNumEntriesInLocalRow(row);
1595  A->getLocalRowView(row, indices, vals);
1596 
1597  RCP<std::vector<GO>> cnodeIds = Teuchos::rcp(new std::vector<GO>); // global column block ids
1598  LO realnnz = 0;
1599  for (LO col = 0; col < Teuchos::as<LO>(nnz); col++) {
1600  GO gcid = colMap->getGlobalElement(indices[col]); // global column id
1601 
1602  if (vals[col] != STS::zero()) {
1603  GO cnodeId = AmalgamationFactory::DOFGid2NodeId(gcid, blockdim, offset, indexBase);
1604  cnodeIds->push_back(cnodeId);
1605  realnnz++; // increment number of nnz in matrix row
1606  if (grid == gcid) bIsDiagonalEntry = true;
1607  }
1608  }
1609 
1610  if (realnnz == 1 && bIsDiagonalEntry == true) {
1611  LO lNodeId = nodeMap->getLocalElement(nodeId);
1612  numberDirichletRowsPerNode[lNodeId] += 1; // increment Dirichlet row counter associated with lNodeId
1613  if (numberDirichletRowsPerNode[lNodeId] == blockdim) // mark full Dirichlet nodes
1614  amalgBoundaryNodes[lNodeId] = true;
1615  }
1616 
1617  Teuchos::ArrayRCP<GO> arr_cnodeIds = Teuchos::arcp(cnodeIds);
1618 
1619  if (arr_cnodeIds.size() > 0)
1620  crsGraph->insertGlobalIndices(nodeId, arr_cnodeIds());
1621  }
1622  // fill matrix graph
1623  crsGraph->fillComplete(nodeMap, nodeMap);
1624 
1625  // 5) create MueLu Graph object
1626  RCP<LWGraph> graph = rcp(new LWGraph(crsGraph, "amalgamated graph of A"));
1627 
1628  // Detect and record rows that correspond to Dirichlet boundary conditions
1629  graph->SetBoundaryNodeMap(amalgBoundaryNodes);
1630 
1631  if (GetVerbLevel() & Statistics1) {
1632  GO numLocalBoundaryNodes = 0;
1633  GO numGlobalBoundaryNodes = 0;
1634  for (size_t i = 0; i < amalgBoundaryNodes.size(); ++i)
1635  if (amalgBoundaryNodes(i))
1636  numLocalBoundaryNodes++;
1637  RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
1638  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
1639  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
1640  }
1641 
1642  // 6) store results in Level
1643  // graph->SetBoundaryNodeMap(gBoundaryNodeMap);
1644  Set(currentLevel, "DofsPerNode", blockdim);
1645  Set(currentLevel, "Graph", graph);
1646 
1647  } // if (doExperimentalWrap) ... else ...
1648 
1649 } // Build
1650 
1651 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1652 void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::MergeRows(const Matrix& A, const LO row, Array<LO>& cols, const Array<LO>& translation) const {
1653  typedef typename ArrayView<const LO>::size_type size_type;
1654 
1655  // extract striding information
1656  LO blkSize = A.GetFixedBlockSize(); //< stores the size of the block within the strided map
1657  if (A.IsView("stridedMaps") == true) {
1658  Teuchos::RCP<const Map> myMap = A.getRowMap("stridedMaps");
1659  Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
1660  TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
1661  if (strMap->getStridedBlockId() > -1)
1662  blkSize = Teuchos::as<LO>(strMap->getStridingData()[strMap->getStridedBlockId()]);
1663  }
1664 
1665  // count nonzero entries in all dof rows associated with node row
1666  size_t nnz = 0, pos = 0;
1667  for (LO j = 0; j < blkSize; j++)
1668  nnz += A.getNumEntriesInLocalRow(row * blkSize + j);
1669 
1670  if (nnz == 0) {
1671  cols.resize(0);
1672  return;
1673  }
1674 
1675  cols.resize(nnz);
1676 
1677  // loop over all local dof rows associated with local node "row"
1678  ArrayView<const LO> inds;
1679  ArrayView<const SC> vals;
1680  for (LO j = 0; j < blkSize; j++) {
1681  A.getLocalRowView(row * blkSize + j, inds, vals);
1682  size_type numIndices = inds.size();
1683 
1684  if (numIndices == 0) // skip empty dof rows
1685  continue;
1686 
1687  // cols: stores all local node ids for current local node id "row"
1688  cols[pos++] = translation[inds[0]];
1689  for (size_type k = 1; k < numIndices; k++) {
1690  LO nodeID = translation[inds[k]];
1691  // Here we try to speed up the process by reducing the size of an array
1692  // to sort. This works if the column nonzeros belonging to the same
1693  // node are stored consequently.
1694  if (nodeID != cols[pos - 1])
1695  cols[pos++] = nodeID;
1696  }
1697  }
1698  cols.resize(pos);
1699  nnz = pos;
1700 
1701  // Sort and remove duplicates
1702  std::sort(cols.begin(), cols.end());
1703  pos = 0;
1704  for (size_t j = 1; j < nnz; j++)
1705  if (cols[j] != cols[pos])
1706  cols[++pos] = cols[j];
1707  cols.resize(pos + 1);
1708 }
1709 
1710 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1711 void CoalesceDropFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::MergeRowsWithDropping(const Matrix& A, const LO row, const ArrayRCP<const SC>& ghostedDiagVals, SC threshold, Array<LO>& cols, const Array<LO>& translation) const {
1712  typedef typename ArrayView<const LO>::size_type size_type;
1713  typedef Teuchos::ScalarTraits<SC> STS;
1714 
1715  // extract striding information
1716  LO blkSize = A.GetFixedBlockSize(); //< stores the size of the block within the strided map
1717  if (A.IsView("stridedMaps") == true) {
1718  Teuchos::RCP<const Map> myMap = A.getRowMap("stridedMaps");
1719  Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
1720  TEUCHOS_TEST_FOR_EXCEPTION(strMap == null, Exceptions::RuntimeError, "Map is not of type StridedMap");
1721  if (strMap->getStridedBlockId() > -1)
1722  blkSize = Teuchos::as<LO>(strMap->getStridingData()[strMap->getStridedBlockId()]);
1723  }
1724 
1725  // count nonzero entries in all dof rows associated with node row
1726  size_t nnz = 0, pos = 0;
1727  for (LO j = 0; j < blkSize; j++)
1728  nnz += A.getNumEntriesInLocalRow(row * blkSize + j);
1729 
1730  if (nnz == 0) {
1731  cols.resize(0);
1732  return;
1733  }
1734 
1735  cols.resize(nnz);
1736 
1737  // loop over all local dof rows associated with local node "row"
1738  ArrayView<const LO> inds;
1739  ArrayView<const SC> vals;
1740  for (LO j = 0; j < blkSize; j++) {
1741  A.getLocalRowView(row * blkSize + j, inds, vals);
1742  size_type numIndices = inds.size();
1743 
1744  if (numIndices == 0) // skip empty dof rows
1745  continue;
1746 
1747  // cols: stores all local node ids for current local node id "row"
1748  LO prevNodeID = -1;
1749  for (size_type k = 0; k < numIndices; k++) {
1750  LO dofID = inds[k];
1751  LO nodeID = translation[inds[k]];
1752 
1753  // we avoid a square root by using squared values
1754  typename STS::magnitudeType aiiajj = STS::magnitude(threshold * threshold * ghostedDiagVals[dofID] * ghostedDiagVals[row * blkSize + j]); // eps^2 * |a_ii| * |a_jj|
1755  typename STS::magnitudeType aij = STS::magnitude(vals[k] * vals[k]);
1756 
1757  // check dropping criterion
1758  if (aij > aiiajj || (row * blkSize + j == dofID)) {
1759  // accept entry in graph
1760 
1761  // Here we try to speed up the process by reducing the size of an array
1762  // to sort. This works if the column nonzeros belonging to the same
1763  // node are stored consequently.
1764  if (nodeID != prevNodeID) {
1765  cols[pos++] = nodeID;
1766  prevNodeID = nodeID;
1767  }
1768  }
1769  }
1770  }
1771  cols.resize(pos);
1772  nnz = pos;
1773 
1774  // Sort and remove duplicates
1775  std::sort(cols.begin(), cols.end());
1776  pos = 0;
1777  for (size_t j = 1; j < nnz; j++)
1778  if (cols[j] != cols[pos])
1779  cols[++pos] = cols[j];
1780  cols.resize(pos + 1);
1781 
1782  return;
1783 }
1784 
1785 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1787  typedef Teuchos::ScalarTraits<SC> STS;
1788 
1789  const ParameterList& pL = GetParameterList();
1790  const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
1791  const typename STS::magnitudeType rowSumTol = as<typename STS::magnitudeType>(pL.get<double>("aggregation: row sum drop tol"));
1792 
1793  RCP<LocalOrdinalVector> BlockNumber = Get<RCP<LocalOrdinalVector>>(currentLevel, "BlockNumber");
1794  RCP<LocalOrdinalVector> ghostedBlockNumber;
1795  GetOStream(Statistics1) << "Using BlockDiagonal Graph before dropping (with provided blocking)" << std::endl;
1796 
1797  // Ghost the column block numbers if we need to
1798  RCP<const Import> importer = A->getCrsGraph()->getImporter();
1799  if (!importer.is_null()) {
1800  SubFactoryMonitor m1(*this, "Block Number import", currentLevel);
1801  ghostedBlockNumber = Xpetra::VectorFactory<LO, LO, GO, NO>::Build(importer->getTargetMap());
1802  ghostedBlockNumber->doImport(*BlockNumber, *importer, Xpetra::INSERT);
1803  } else {
1804  ghostedBlockNumber = BlockNumber;
1805  }
1806 
1807  // Accessors for block numbers
1808  Teuchos::ArrayRCP<const LO> row_block_number = BlockNumber->getData(0);
1809  Teuchos::ArrayRCP<const LO> col_block_number = ghostedBlockNumber->getData(0);
1810 
1811  // allocate space for the local graph
1812  typename CrsMatrix::local_matrix_type::row_map_type::HostMirror::non_const_type rows_mat;
1813  typename LWGraph::row_type::non_const_type rows_graph;
1814  typename LWGraph::entries_type::non_const_type columns;
1815  typename CrsMatrix::local_matrix_type::values_type::HostMirror::non_const_type values;
1816  RCP<CrsMatrixWrap> crs_matrix_wrap;
1817 
1818  if (generate_matrix) {
1819  crs_matrix_wrap = rcp(new CrsMatrixWrap(A->getRowMap(), A->getColMap(), 0));
1820  rows_mat = typename CrsMatrix::local_matrix_type::row_map_type::HostMirror::non_const_type("rows_mat", A->getLocalNumRows() + 1);
1821  } else {
1822  rows_graph = typename LWGraph::row_type::non_const_type("rows_graph", A->getLocalNumRows() + 1);
1823  }
1824  columns = typename LWGraph::entries_type::non_const_type("columns", A->getLocalNumEntries());
1825  values = typename CrsMatrix::local_matrix_type::values_type::HostMirror::non_const_type("values", A->getLocalNumEntries());
1826 
1827  LO realnnz = 0;
1828  GO numDropped = 0, numTotal = 0;
1829  for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
1830  LO row_block = row_block_number[row];
1831  size_t nnz = A->getNumEntriesInLocalRow(row);
1832  ArrayView<const LO> indices;
1833  ArrayView<const SC> vals;
1834  A->getLocalRowView(row, indices, vals);
1835 
1836  LO rownnz = 0;
1837  for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
1838  LO col = indices[colID];
1839  LO col_block = col_block_number[col];
1840 
1841  if (row_block == col_block) {
1842  if (generate_matrix) values[realnnz] = vals[colID];
1843  columns[realnnz++] = col;
1844  rownnz++;
1845  } else
1846  numDropped++;
1847  }
1848  if (generate_matrix)
1849  rows_mat[row + 1] = realnnz;
1850  else
1851  rows_graph[row + 1] = realnnz;
1852  }
1853 
1854  auto boundaryNodes = MueLu::Utilities<SC, LO, GO, NO>::DetectDirichletRows_kokkos_host(*A, dirichletThreshold);
1855  if (rowSumTol > 0.)
1856  Utilities::ApplyRowSumCriterionHost(*A, rowSumTol, boundaryNodes);
1857 
1858  numTotal = A->getLocalNumEntries();
1859 
1860  if (GetVerbLevel() & Statistics1) {
1861  GO numLocalBoundaryNodes = 0;
1862  GO numGlobalBoundaryNodes = 0;
1863  for (size_t i = 0; i < boundaryNodes.size(); ++i)
1864  if (boundaryNodes(i))
1865  numLocalBoundaryNodes++;
1866  RCP<const Teuchos::Comm<int>> comm = A->getRowMap()->getComm();
1867  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
1868  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
1869 
1870  GO numGlobalTotal, numGlobalDropped;
1871  MueLu_sumAll(comm, numTotal, numGlobalTotal);
1872  MueLu_sumAll(comm, numDropped, numGlobalDropped);
1873  GetOStream(Statistics1) << "Number of dropped entries in block-diagonalized matrix graph: " << numGlobalDropped << "/" << numGlobalTotal;
1874  if (numGlobalTotal != 0)
1875  GetOStream(Statistics1) << " (" << 100 * Teuchos::as<double>(numGlobalDropped) / Teuchos::as<double>(numGlobalTotal) << "%)";
1876  GetOStream(Statistics1) << std::endl;
1877  }
1878 
1879  Set(currentLevel, "Filtering", true);
1880 
1881  if (generate_matrix) {
1882  // NOTE: Trying to use A's Import/Export objects will cause the code to segfault back in Build() with errors on the Import
1883  // if you're using Epetra. I'm not really sure why. By using the Col==Domain and Row==Range maps, we get null Import/Export objects
1884  // here, which is legit, because we never use them anyway.
1885  if constexpr (std::is_same<typename LWGraph::row_type,
1886  typename CrsMatrix::local_matrix_type::row_map_type>::value) {
1887  crs_matrix_wrap->getCrsMatrix()->setAllValues(rows_mat, columns, values);
1888  } else {
1889  auto rows_mat2 = typename CrsMatrix::local_matrix_type::row_map_type::non_const_type("rows_mat2", rows_mat.extent(0));
1890  Kokkos::deep_copy(rows_mat2, rows_mat);
1891  auto columns2 = typename CrsMatrix::local_graph_type::entries_type::non_const_type("columns2", columns.extent(0));
1892  Kokkos::deep_copy(columns2, columns);
1893  auto values2 = typename CrsMatrix::local_matrix_type::values_type::non_const_type("values2", values.extent(0));
1894  Kokkos::deep_copy(values2, values);
1895  crs_matrix_wrap->getCrsMatrix()->setAllValues(rows_mat2, columns2, values2);
1896  }
1897  crs_matrix_wrap->getCrsMatrix()->expertStaticFillComplete(A->getColMap(), A->getRowMap());
1898  } else {
1899  RCP<LWGraph> graph = rcp(new LWGraph(rows_graph, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), A->getRowMap(), A->getColMap(), "block-diagonalized graph of A"));
1900  graph->SetBoundaryNodeMap(boundaryNodes);
1901  Set(currentLevel, "Graph", graph);
1902  }
1903 
1904  Set(currentLevel, "DofsPerNode", 1);
1905  return crs_matrix_wrap;
1906 }
1907 
1908 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1910  TEUCHOS_TEST_FOR_EXCEPTION(ghostedBlockNumber.is_null(), Exceptions::RuntimeError, "BlockDiagonalizeGraph(): ghostedBlockNumber is null.");
1911  const ParameterList& pL = GetParameterList();
1912 
1913  const bool localizeColoringGraph = pL.get<bool>("aggregation: coloring: localize color graph");
1914 
1915  GetOStream(Statistics1) << "Using BlockDiagonal Graph after Dropping (with provided blocking)";
1916  if (localizeColoringGraph)
1917  GetOStream(Statistics1) << ", with localization" << std::endl;
1918  else
1919  GetOStream(Statistics1) << ", without localization" << std::endl;
1920 
1921  // Accessors for block numbers
1922  Teuchos::ArrayRCP<const LO> row_block_number = ghostedBlockNumber->getData(0);
1923  Teuchos::ArrayRCP<const LO> col_block_number = ghostedBlockNumber->getData(0);
1924 
1925  // allocate space for the local graph
1926  ArrayRCP<size_t> rows_mat;
1927  typename LWGraph::row_type::non_const_type rows_graph("rows_graph", inputGraph->GetNodeNumVertices() + 1);
1928  typename LWGraph::entries_type::non_const_type columns("columns", inputGraph->GetNodeNumEdges());
1929 
1930  LO realnnz = 0;
1931  GO numDropped = 0, numTotal = 0;
1932  const LO numRows = Teuchos::as<LO>(inputGraph->GetDomainMap()->getLocalNumElements());
1933  if (localizeColoringGraph) {
1934  for (LO row = 0; row < numRows; ++row) {
1935  LO row_block = row_block_number[row];
1936  auto indices = inputGraph->getNeighborVertices(row);
1937 
1938  LO rownnz = 0;
1939  for (LO colID = 0; colID < Teuchos::as<LO>(indices.length); colID++) {
1940  LO col = indices(colID);
1941  LO col_block = col_block_number[col];
1942 
1943  if ((row_block == col_block) && (col < numRows)) {
1944  columns(realnnz++) = col;
1945  rownnz++;
1946  } else
1947  numDropped++;
1948  }
1949  rows_graph(row + 1) = realnnz;
1950  }
1951  } else {
1952  // ghosting of boundary node map
1953  auto boundaryNodes = inputGraph->GetBoundaryNodeMap();
1955  for (size_t i = 0; i < inputGraph->GetNodeNumVertices(); i++)
1956  boundaryNodesVector->getDataNonConst(0)[i] = boundaryNodes[i];
1957  // Xpetra::IO<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Write("boundary",*boundaryNodesVector);
1959  boundaryColumnVector->doImport(*boundaryNodesVector, *importer, Xpetra::INSERT);
1960  auto boundaryColumn = boundaryColumnVector->getData(0);
1961 
1962  for (LO row = 0; row < numRows; ++row) {
1963  LO row_block = row_block_number[row];
1964  auto indices = inputGraph->getNeighborVertices(row);
1965 
1966  LO rownnz = 0;
1967  for (LO colID = 0; colID < Teuchos::as<LO>(indices.length); colID++) {
1968  LO col = indices(colID);
1969  LO col_block = col_block_number[col];
1970 
1971  if ((row_block == col_block) && ((row == col) || (boundaryColumn[col] == 0))) {
1972  columns(realnnz++) = col;
1973  rownnz++;
1974  } else
1975  numDropped++;
1976  }
1977  rows_graph(row + 1) = realnnz;
1978  }
1979  }
1980 
1981  numTotal = inputGraph->GetNodeNumEdges();
1982 
1983  if (GetVerbLevel() & Statistics1) {
1984  RCP<const Teuchos::Comm<int>> comm = inputGraph->GetDomainMap()->getComm();
1985  GO numGlobalTotal, numGlobalDropped;
1986  MueLu_sumAll(comm, numTotal, numGlobalTotal);
1987  MueLu_sumAll(comm, numDropped, numGlobalDropped);
1988  GetOStream(Statistics1) << "Number of dropped entries in block-diagonalized matrix graph: " << numGlobalDropped << "/" << numGlobalTotal;
1989  if (numGlobalTotal != 0)
1990  GetOStream(Statistics1) << " (" << 100 * Teuchos::as<double>(numGlobalDropped) / Teuchos::as<double>(numGlobalTotal) << "%)";
1991  GetOStream(Statistics1) << std::endl;
1992  }
1993 
1994  if (localizeColoringGraph) {
1995  outputGraph = rcp(new LWGraph(rows_graph, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), inputGraph->GetDomainMap(), inputGraph->GetImportMap(), "block-diagonalized graph of A"));
1996  outputGraph->SetBoundaryNodeMap(inputGraph->GetBoundaryNodeMap());
1997  } else {
1998  TEUCHOS_ASSERT(inputGraph->GetDomainMap()->lib() == Xpetra::UseTpetra);
1999 #ifdef HAVE_XPETRA_TPETRA
2000  auto outputGraph2 = rcp(new LWGraph(rows_graph, Kokkos::subview(columns, Kokkos::make_pair(0, realnnz)), inputGraph->GetDomainMap(), inputGraph->GetImportMap(), "block-diagonalized graph of A"));
2001 
2002  auto tpGraph = Xpetra::toTpetra(rcp_const_cast<const CrsGraph>(outputGraph2->GetCrsGraph()));
2004  auto tpGraphSym = sym->symmetrize();
2005  auto lclGraphSym = tpGraphSym->getLocalGraphHost();
2006  auto colIndsSym = lclGraphSym.entries;
2007 
2008  auto rowsSym = tpGraphSym->getLocalRowPtrsHost();
2009  typename LWGraph::row_type::non_const_type rows_graphSym("rows_graphSym", rowsSym.size());
2010  for (size_t row = 0; row < rowsSym.size(); row++)
2011  rows_graphSym(row) = rowsSym(row);
2012  outputGraph = rcp(new LWGraph(rows_graphSym, colIndsSym, inputGraph->GetDomainMap(), Xpetra::toXpetra(tpGraphSym->getColMap()), "block-diagonalized graph of A"));
2013  outputGraph->SetBoundaryNodeMap(inputGraph->GetBoundaryNodeMap());
2014 #endif
2015  }
2016 }
2017 
2018 } // namespace MueLu
2019 
2020 #endif // MUELU_COALESCEDROPFACTORY_DEF_HPP
Important warning messages (one line)
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
Exception indicating invalid cast attempted.
void reserve(size_type n)
void BlockDiagonalizeGraph(const RCP< LWGraph > &inputGraph, const RCP< LocalOrdinalVector > &ghostedBlockNumber, RCP< LWGraph > &outputGraph, RCP< const Import > &importer) const
static Teuchos::RCP< Vector > GetMatrixMaxMinusOffDiagonal(const Xpetra::Matrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Return vector containing: max_{i=k}(-a_ik), for each for i in the matrix.
#define MueLu_sumAll(rcpComm, in, out)
static void ApplyRowSumCriterionHost(const Matrix &A, const typename Teuchos::ScalarTraits< Scalar >::magnitudeType rowSumTol, Kokkos::View< bool *, Kokkos::HostSpace > &dirichletRows)
KOKKOS_INLINE_FUNCTION void SetBoundaryNodeMap(const boundary_nodes_type bndry)
Set boolean array indicating which rows correspond to Dirichlet boundaries.
static Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > Build(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node >> &map, size_t NumVectors, bool zeroOut=true)
void setValidator(RCP< const ParameterEntryValidator > const &validator)
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
static void ApplyRowSumCriterion(const Xpetra::Matrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Magnitude rowSumTol, Teuchos::ArrayRCP< bool > &dirichletRows)
Apply Rowsum Criterion.
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
static void Write(const std::string &fileName, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &M)
Print more statistics.
size_type size() const
DropTol & operator=(DropTol const &)=default
Teuchos::RCP< Xpetra::Matrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > BlockDiagonalize(Level &currentLevel, const RCP< Matrix > &A, bool generate_matrix) const
LocalOrdinal LO
One-liner description of what is happening.
ParameterList & set(std::string const &name, T &&value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
KOKKOS_INLINE_FUNCTION size_type GetNodeNumVertices() const
Return number of graph vertices.
KOKKOS_INLINE_FUNCTION size_type GetNodeNumEdges() const
Return number of graph edges.
KOKKOS_INLINE_FUNCTION const boundary_nodes_type GetBoundaryNodeMap() const
Returns map with global ids of boundary nodes.
size_type size() const
Exception throws to report incompatible objects (like maps).
static const GlobalOrdinal DOFGid2NodeId(GlobalOrdinal gid, LocalOrdinal blockSize, const GlobalOrdinal offset, const GlobalOrdinal indexBase)
Translate global (row/column) id to global amalgamation block id.
Scalar GetThreshold() const
Return threshold value.
Additional warnings.
void resize(const size_type n, const T &val=T())
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:63
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
static Teuchos::ArrayRCP< const bool > DetectDirichletRows(const Xpetra::Matrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Magnitude &tol=Teuchos::ScalarTraits< Magnitude >::zero(), bool count_twos_as_dirichlet=false)
Detect Dirichlet rows.
#define SET_VALID_ENTRY(name)
static void AmalgamateMap(const Map &sourceMap, const Matrix &A, RCP< const Map > &amalgamatedMap, Array< LO > &translation)
Method to create merged map for systems of PDEs.
static RCP< Vector > GetMatrixOverlappedDiagonal(const Matrix &A)
Extract Overlapped Matrix Diagonal.
void resize(size_type new_size, const value_type &x=value_type())
void Build(Level &currentLevel) const
Build an object with this factory.
std::string viewLabel_t
static RCP< Vector > Build(const Teuchos::RCP< const Map > &map, bool zeroOut=true)
iterator end()
void DeclareInput(Level &currentLevel) const
Input.
KOKKOS_INLINE_FUNCTION neighbor_vertices_type getNeighborVertices(LO i) const
Return the list of vertices adjacent to the vertex &#39;v&#39;.
Print class parameters.
const RCP< const Map > GetImportMap() const
Return overlapping import map (nodes).
RCP< const Tpetra::CrsGraph< LocalOrdinal, GlobalOrdinal, Node > > toTpetra(const RCP< const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > > &graph)
void push_back(const value_type &x)
void MergeRows(const Matrix &A, const LO row, Array< LO > &cols, const Array< LO > &translation) const
Method to merge rows of matrix for systems of PDEs.
RCP< const CrsGraph< int, GlobalOrdinal, Node > > toXpetra(const Epetra_CrsGraph &g)
size_type size() const
Scalar SC
const RCP< const Map > GetDomainMap() const
Lightweight MueLu representation of a compressed row storage graph.
DropTol(real_type val_, real_type diag_, LO col_, bool drop_)
Exception throws to report errors in the internal logical of the program.
#define TEUCHOS_ASSERT(assertion_test)
static Kokkos::View< bool *, typename Kokkos::HostSpace > DetectDirichletRows_kokkos_host(const Matrix &A, const Magnitude &tol=Teuchos::ScalarTraits< typename Teuchos::ScalarTraits< SC >::magnitudeType >::zero(), const bool count_twos_as_dirichlet=false)
static Teuchos::ScalarTraits< Scalar >::magnitudeType Distance2(const Teuchos::Array< Teuchos::ArrayRCP< const Scalar >> &v, LocalOrdinal i0, LocalOrdinal i1)
Squared distance between two rows in a multivector.
ParameterEntry & getEntry(const std::string &name)
bool is_null() const
iterator begin()
void MergeRowsWithDropping(const Matrix &A, const LO row, const ArrayRCP< const SC > &ghostedDiagVals, SC threshold, Array< LO > &cols, const Array< LO > &translation) const
bool is_null() const