MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_CoalesceDropFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
11 #define MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
12 
13 #include <Kokkos_Core.hpp>
14 #include <KokkosSparse_CrsMatrix.hpp>
15 #include <sstream>
16 #include <string>
17 #include <tuple>
18 
19 #include "Xpetra_Matrix.hpp"
20 
22 
23 #include "MueLu_AmalgamationInfo.hpp"
24 #include "MueLu_Exceptions.hpp"
25 #include "MueLu_Level.hpp"
26 #include "MueLu_LWGraph_kokkos.hpp"
27 #include "MueLu_MasterList.hpp"
28 #include "MueLu_Monitor.hpp"
29 
30 // #define MUELU_COALESCE_DROP_DEBUG 1
31 
34 #include "MueLu_CutDrop.hpp"
35 #include "MueLu_DroppingCommon.hpp"
38 
39 namespace MueLu {
40 
41 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
43  RCP<ParameterList> validParamList = rcp(new ParameterList());
44 
45 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
46  SET_VALID_ENTRY("aggregation: use blocking");
47  SET_VALID_ENTRY("aggregation: drop tol");
48  SET_VALID_ENTRY("aggregation: use ml scaling of drop tol");
49  SET_VALID_ENTRY("aggregation: Dirichlet threshold");
50  SET_VALID_ENTRY("aggregation: greedy Dirichlet");
51  SET_VALID_ENTRY("aggregation: row sum drop tol");
52  SET_VALID_ENTRY("aggregation: strength-of-connection: matrix");
53  SET_VALID_ENTRY("aggregation: strength-of-connection: measure");
54  SET_VALID_ENTRY("aggregation: drop scheme");
55  SET_VALID_ENTRY("aggregation: block diagonal: interleaved blocksize");
56  SET_VALID_ENTRY("aggregation: distance laplacian metric");
57  SET_VALID_ENTRY("aggregation: distance laplacian directional weights");
58  SET_VALID_ENTRY("aggregation: dropping may create Dirichlet");
59 #ifdef HAVE_MUELU_COALESCEDROP_ALLOW_OLD_PARAMETERS
60  SET_VALID_ENTRY("aggregation: distance laplacian algo");
61  SET_VALID_ENTRY("aggregation: classical algo");
62 #endif
63  SET_VALID_ENTRY("aggregation: symmetrize graph after dropping");
64  SET_VALID_ENTRY("aggregation: coloring: use color graph");
65  SET_VALID_ENTRY("aggregation: coloring: localize color graph");
66 
67  SET_VALID_ENTRY("filtered matrix: use lumping");
68  SET_VALID_ENTRY("filtered matrix: reuse graph");
69  SET_VALID_ENTRY("filtered matrix: reuse eigenvalue");
70 
71  SET_VALID_ENTRY("filtered matrix: use root stencil");
72  SET_VALID_ENTRY("filtered matrix: use spread lumping");
73  SET_VALID_ENTRY("filtered matrix: spread lumping diag dom growth factor");
74  SET_VALID_ENTRY("filtered matrix: spread lumping diag dom cap");
75  SET_VALID_ENTRY("filtered matrix: Dirichlet threshold");
76 
77 #undef SET_VALID_ENTRY
78  validParamList->set<bool>("lightweight wrap", true, "Experimental option for lightweight graph access");
79 #ifndef HAVE_MUELU_COALESCEDROP_ALLOW_OLD_PARAMETERS
80  validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple<std::string>("point-wise", "cut-drop"))));
81 #else
82  validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple<std::string>("point-wise", "cut-drop", "signed classical sa", "classical", "distance laplacian", "signed classical", "block diagonal", "block diagonal classical", "block diagonal distance laplacian", "block diagonal signed classical", "block diagonal colored signed classical", "signed classical distance laplacian", "signed classical sa distance laplacian"))));
83  validParamList->getEntry("aggregation: classical algo").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple<std::string>("default", "unscaled cut", "scaled cut", "scaled cut symmetric"))));
84  validParamList->getEntry("aggregation: distance laplacian algo").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple<std::string>("default", "unscaled cut", "scaled cut", "scaled cut symmetric"))));
85 #endif
86  validParamList->getEntry("aggregation: strength-of-connection: matrix").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple<std::string>("A", "distance laplacian"))));
87  validParamList->getEntry("aggregation: strength-of-connection: measure").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple<std::string>("smoothed aggregation", "signed smoothed aggregation", "signed ruge-stueben", "unscaled"))));
88  validParamList->getEntry("aggregation: distance laplacian metric").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple<std::string>("unweighted", "material"))));
89 
90  validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
91  validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
92  validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory for Coordinates");
93  validParamList->set<RCP<const FactoryBase>>("BlockNumber", Teuchos::null, "Generating factory for BlockNumber");
94  validParamList->set<RCP<const FactoryBase>>("Material", Teuchos::null, "Generating factory for Material");
95 
96  return validParamList;
97 }
98 
99 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
101  Input(currentLevel, "A");
102  Input(currentLevel, "UnAmalgamationInfo");
103 
104  const ParameterList& pL = GetParameterList();
105 
106  std::string socUsesMatrix = pL.get<std::string>("aggregation: strength-of-connection: matrix");
107  bool needCoords = (socUsesMatrix == "distance laplacian");
108 #ifdef HAVE_MUELU_COALESCEDROP_ALLOW_OLD_PARAMETERS
109  std::string droppingMethod = pL.get<std::string>("aggregation: drop scheme");
110  needCoords |= (droppingMethod.find("distance laplacian") != std::string::npos);
111 #endif
112  if (needCoords) {
113  Input(currentLevel, "Coordinates");
114  std::string distLaplMetric = pL.get<std::string>("aggregation: distance laplacian metric");
115  if (distLaplMetric == "material")
116  Input(currentLevel, "Material");
117  }
118 
119  bool useBlocking = pL.get<bool>("aggregation: use blocking");
120 #ifdef HAVE_MUELU_COALESCEDROP_ALLOW_OLD_PARAMETERS
121  useBlocking |= (droppingMethod.find("block diagonal") != std::string::npos);
122 #endif
123  if (useBlocking) {
124  Input(currentLevel, "BlockNumber");
125  }
126 }
127 
128 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
130  Build(Level& currentLevel) const {
131  auto A = Get<RCP<Matrix>>(currentLevel, "A");
132  TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
133  LO blkSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
134 
135  std::tuple<GlobalOrdinal, boundary_nodes_type> results;
136  if (blkSize == 1)
137  results = BuildScalar(currentLevel);
138  else
139  results = BuildVector(currentLevel);
140 
141  if (GetVerbLevel() & Statistics1) {
142  GlobalOrdinal numDropped = std::get<0>(results);
143  auto boundaryNodes = std::get<1>(results);
144 
145  GO numLocalBoundaryNodes = 0;
146 
147  Kokkos::parallel_reduce(
148  "MueLu:CoalesceDropF:Build:bnd", range_type(0, boundaryNodes.extent(0)),
149  KOKKOS_LAMBDA(const LO i, GO& n) {
150  if (boundaryNodes(i))
151  n++;
152  },
153  numLocalBoundaryNodes);
154 
155  if (IsPrint(Statistics1)) {
156  auto comm = A->getRowMap()->getComm();
157 
158  std::vector<GlobalOrdinal> localStats = {numLocalBoundaryNodes, numDropped};
159  std::vector<GlobalOrdinal> globalStats(2);
160  Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, 2, localStats.data(), globalStats.data());
161 
162  GO numGlobalTotal = A->getGlobalNumEntries();
163  GO numGlobalBoundaryNodes = globalStats[0];
164  GO numGlobalDropped = globalStats[1];
165 
166  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
167  if (numGlobalTotal != 0) {
168  GetOStream(Statistics1) << "Number of dropped entries: "
169  << numGlobalDropped << "/" << numGlobalTotal
170  << " (" << 100 * Teuchos::as<double>(numGlobalDropped) / Teuchos::as<double>(numGlobalTotal) << "%)" << std::endl;
171  }
172  }
173  }
174 }
175 
176 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
179  auto material = Get<RCP<MultiVector>>(currentLevel, "Material");
180 
181  if (IsPrint(Runtime0)) {
182  if (material->getNumVectors() == 1) {
183  GetOStream(Runtime0) << "material scalar mean = " << material->getVector(0)->meanValue() << std::endl;
184  } else {
185  TEUCHOS_TEST_FOR_EXCEPTION(spatialDim * spatialDim != material->getNumVectors(), Exceptions::RuntimeError, "Need \"Material\" to have spatialDim^2 vectors.");
186  {
187  Teuchos::Array<Scalar> means(material->getNumVectors());
188  material->meanValue(means());
189  std::stringstream ss;
190  ss << "material tensor mean =" << std::endl;
191  size_t k = 0;
192  for (size_t i = 0; i < spatialDim; ++i) {
193  ss << " ";
194  for (size_t j = 0; j < spatialDim; ++j) {
195  ss << means[k] << " ";
196  ++k;
197  }
198  ss << std::endl;
199  }
200  GetOStream(Runtime0) << ss.str();
201  }
202  }
203  }
204 
205  return material;
206 }
207 
208 template <class magnitudeType>
209 void translateOldAlgoParam(const Teuchos::ParameterList& pL, std::string& droppingMethod, bool& useBlocking, std::string& socUsesMatrix, std::string& socUsesMeasure, bool& symmetrizeDroppedGraph, bool& generateColoringGraph, magnitudeType& threshold) {
210  std::set<std::string> validDroppingMethods = {"piece-wise", "cut-drop"};
211  if (validDroppingMethods.find(droppingMethod) == validDroppingMethods.end()) {
212  std::string algo = droppingMethod;
213  std::string classicalAlgoStr = pL.get<std::string>("aggregation: classical algo");
214  std::string distanceLaplacianAlgoStr = pL.get<std::string>("aggregation: distance laplacian algo");
215 
216  // Remove prefix "block diagonal" from algo
217  if (algo.find("block diagonal") == 0) {
218  useBlocking = true;
219  algo = algo.substr(14);
220  if (algo != "") {
221  algo = algo.substr(1);
222  }
223  }
224 
225  if ((algo == "classical") || (algo == "signed classical sa") || (algo == "signed classical") || (algo == "colored signed classical")) {
226  socUsesMatrix = "A";
227 
228  if (algo == "classical") {
229  socUsesMeasure = "smoothed aggregation";
230  } else if (algo == "signed classical sa") {
231  socUsesMeasure = "signed smoothed aggregation";
232  } else if (algo == "signed classical") {
233  socUsesMeasure = "signed ruge-stueben";
234  } else if (algo == "colored signed classical") {
235  socUsesMeasure = "signed ruge-stueben";
236  generateColoringGraph = true;
237  }
238 
239  if (classicalAlgoStr == "default")
240  droppingMethod = "point-wise";
241  else if (classicalAlgoStr == "unscaled cut") {
242  socUsesMeasure = "unscaled";
243  droppingMethod = "cut-drop";
244  } else if (classicalAlgoStr == "scaled cut") {
245  droppingMethod = "cut-drop";
246  } else if (classicalAlgoStr == "scaled cut symmetric") {
247  droppingMethod = "cut-drop";
248  symmetrizeDroppedGraph = true;
249  }
250  } else if ((algo == "distance laplacian") || (algo == "signed classical sa distance laplacian") || (algo == "signed classical distance laplacian")) {
251  socUsesMatrix = "distance laplacian";
252 
253  if (algo == "distance laplacian") {
254  socUsesMeasure = "smoothed aggregation";
255  } else if (algo == "signed classical sa distance laplacian") {
256  socUsesMeasure = "signed smoothed aggregation";
257  } else if (algo == "signed classical distance laplacian") {
258  socUsesMeasure = "signed ruge-stueben";
259  }
260 
261  if (distanceLaplacianAlgoStr == "default")
262  droppingMethod = "point-wise";
263  else if (distanceLaplacianAlgoStr == "unscaled cut") {
264  socUsesMeasure = "unscaled";
265  droppingMethod = "cut-drop";
266  } else if (distanceLaplacianAlgoStr == "scaled cut") {
267  droppingMethod = "cut-drop";
268  } else if (distanceLaplacianAlgoStr == "scaled cut symmetric") {
269  droppingMethod = "cut-drop";
270  symmetrizeDroppedGraph = true;
271  }
272  } else if (algo == "") {
273  // algo was "block diagonal", but we process and remove the "block diagonal" part
274  socUsesMatrix = "A";
276  }
277  }
278 }
279 
280 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
281 std::tuple<GlobalOrdinal, typename MueLu::LWGraph_kokkos<LocalOrdinal, GlobalOrdinal, Node>::boundary_nodes_type> CoalesceDropFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
282  BuildScalar(Level& currentLevel) const {
283  FactoryMonitor m(*this, "Build", currentLevel);
284 
287  using local_matrix_type = typename MatrixType::local_matrix_type;
288  using local_graph_type = typename GraphType::local_graph_type;
289  using rowptr_type = typename local_graph_type::row_map_type::non_const_type;
290  using entries_type = typename local_graph_type::entries_type::non_const_type;
291  using values_type = typename local_matrix_type::values_type::non_const_type;
292  using device_type = typename Node::device_type;
293  using memory_space = typename device_type::memory_space;
294  using magnitudeType = typename Teuchos::ScalarTraits<Scalar>::magnitudeType;
295  using doubleMultiVector = Xpetra::MultiVector<magnitudeType, LO, GO, NO>;
296 
297  typedef Teuchos::ScalarTraits<Scalar> STS;
298  const magnitudeType zero = Teuchos::ScalarTraits<magnitudeType>::zero();
299 
300  auto A = Get<RCP<Matrix>>(currentLevel, "A");
301 
303  // Process parameterlist
304  const ParameterList& pL = GetParameterList();
305 
306  // Boundary detection
307  const magnitudeType dirichletThreshold = STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
308  const magnitudeType rowSumTol = as<magnitudeType>(pL.get<double>("aggregation: row sum drop tol"));
309  const LocalOrdinal dirichletNonzeroThreshold = 1;
310 
311  // Dropping
312  bool useBlocking = pL.get<bool>("aggregation: use blocking");
313  std::string droppingMethod = pL.get<std::string>("aggregation: drop scheme");
314  std::string socUsesMatrix = pL.get<std::string>("aggregation: strength-of-connection: matrix");
315  std::string socUsesMeasure = pL.get<std::string>("aggregation: strength-of-connection: measure");
316  std::string distanceLaplacianMetric = pL.get<std::string>("aggregation: distance laplacian metric");
317  bool symmetrizeDroppedGraph = pL.get<bool>("aggregation: symmetrize graph after dropping");
318  magnitudeType threshold;
319  // If we're doing the ML-style halving of the drop tol at each level, we do that here.
320  if (pL.get<bool>("aggregation: use ml scaling of drop tol"))
321  threshold = pL.get<double>("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID());
322  else
323  threshold = as<magnitudeType>(pL.get<double>("aggregation: drop tol"));
324  bool aggregationMayCreateDirichlet = pL.get<bool>("aggregation: dropping may create Dirichlet");
325 
326  // Fill
327  const bool lumping = pL.get<bool>("filtered matrix: use lumping");
328  const bool reuseGraph = pL.get<bool>("filtered matrix: reuse graph");
329  const bool reuseEigenvalue = pL.get<bool>("filtered matrix: reuse eigenvalue");
330 
331  const bool useRootStencil = pL.get<bool>("filtered matrix: use root stencil");
332  const bool useSpreadLumping = pL.get<bool>("filtered matrix: use spread lumping");
333 
334  const magnitudeType filteringDirichletThreshold = as<magnitudeType>(pL.get<double>("filtered matrix: Dirichlet threshold"));
335 
336  // coloring graph
337  bool generateColoringGraph = pL.get<bool>("aggregation: coloring: use color graph");
338  const bool localizeColoringGraph = pL.get<bool>("aggregation: coloring: localize color graph");
339  const bool symmetrizeColoringGraph = true;
340 
341 #ifdef HAVE_MUELU_COALESCEDROP_ALLOW_OLD_PARAMETERS
342  translateOldAlgoParam(pL, droppingMethod, useBlocking, socUsesMatrix, socUsesMeasure, symmetrizeDroppedGraph, generateColoringGraph, threshold);
343 #endif
344 
345  {
346  std::stringstream ss;
347  ss << "dropping scheme = \"" << droppingMethod << "\", strength-of-connection measure = \"" << socUsesMeasure << "\", strength-of-connection matrix = \"" << socUsesMatrix << "\", ";
348  if (socUsesMatrix == "distance laplacian")
349  ss << "distance laplacian metric = \"" << distanceLaplacianMetric << "\", ";
350  ss << "threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << ", useBlocking = " << useBlocking;
351  ss << ", symmetrizeDroppedGraph = " << symmetrizeDroppedGraph << std::endl;
352 
353  GetOStream(Runtime0) << ss.str();
354  }
355 
356  TEUCHOS_ASSERT(!useRootStencil);
357  TEUCHOS_ASSERT(!useSpreadLumping);
358  if (droppingMethod == "cut-drop")
359  TEUCHOS_TEST_FOR_EXCEPTION(threshold > 1.0, Exceptions::RuntimeError, "For cut-drop algorithms, \"aggregation: drop tol\" = " << threshold << ", needs to be <= 1.0");
360 
362  // We perform four sweeps over the rows of A:
363  // Pass 1: detection of boundary nodes
364  // Pass 2: diagonal extraction
365  // Pass 3: drop decision for each entry and construction of the rowptr of the filtered matrix
366  // Pass 4: fill of the filtered matrix
367  //
368  // Pass 1 and 3 apply a sequence of criteria to each row of the matrix.
369 
370  // TODO: We could merge pass 1 and 2.
371 
372  auto crsA = toCrsMatrix(A);
373  auto lclA = crsA->getLocalMatrixDevice();
374  auto range = range_type(0, lclA.numRows());
375 
377  // Pass 1: Detect boundary nodes
378  //
379  // The following criteria are available:
380  // - BoundaryDetection::PointDirichletFunctor
381  // Marks rows as Dirichlet based on value threshold and number of off-diagonal entries
382  // - BoundaryDetection::RowSumFunctor
383  // Marks rows as Dirichlet bases on row-sum criterion
384 
385  // Dirichlet nodes
386  auto boundaryNodes = boundary_nodes_type("boundaryNodes", lclA.numRows()); // initialized to false
387  {
388  SubFactoryMonitor mBoundary(*this, "Boundary detection", currentLevel);
389 
390  // macro that applies boundary detection functors
391 #define MueLu_runBoundaryFunctors(...) \
392  { \
393  auto boundaries = BoundaryDetection::BoundaryFunctor(lclA, __VA_ARGS__); \
394  Kokkos::parallel_for("CoalesceDrop::BoundaryDetection", range, boundaries); \
395  }
396 
397  auto dirichlet_detection = BoundaryDetection::PointDirichletFunctor(lclA, boundaryNodes, dirichletThreshold, dirichletNonzeroThreshold);
398 
399  if (rowSumTol <= 0.) {
400  MueLu_runBoundaryFunctors(dirichlet_detection);
401  } else {
402  auto apply_rowsum = BoundaryDetection::RowSumFunctor(lclA, boundaryNodes, rowSumTol);
403  MueLu_runBoundaryFunctors(dirichlet_detection,
404  apply_rowsum);
405  }
406 #undef MueLu_runBoundaryFunctors
407  }
408  // In what follows, boundaryNodes can still still get modified if aggregationMayCreateDirichlet == true.
409  // Otherwise we're now done with it now.
410 
412  // Pass 2 & 3: Diagonal extraction and determine dropping and construct
413  // rowptr of filtered matrix
414  //
415  // The following criteria are available:
416  // - Misc::PointwiseDropBoundaryFunctor
417  // Drop all rows that have been marked as Dirichlet
418  // - Misc::DropOffRankFunctor
419  // Drop all entries that are off-rank
420  // - ClassicalDropping::DropFunctor
421  // Classical dropping
422  // - DistanceLaplacian::DropFunctor
423  // Distance Laplacian dropping
424  // - Misc::KeepDiagonalFunctor
425  // Mark diagonal as KEEP
426  // - Misc::MarkSingletonFunctor
427  // Mark singletons after dropping as Dirichlet
428  // - Misc::BlockDiagonalizeFunctor
429  // Drop coupling between blocks
430  //
431  // For the block diagonal variants we first block diagonalized and then apply "blocksize = 1" algorithms.
432 
433  // Macro that applies dropping functors.
434  // If HAVE_MUELU_DEBUG is true, this runs additional debug checks.
435 #if !defined(HAVE_MUELU_DEBUG)
436 #define MueLu_runDroppingFunctorsImpl(...) \
437  { \
438  auto countingFunctor = MatrixConstruction::PointwiseCountingFunctor(lclA, results, filtered_rowptr, __VA_ARGS__); \
439  Kokkos::parallel_scan("MueLu::CoalesceDrop::CountEntries", range, countingFunctor, nnz_filtered); \
440  }
441 #else
442 #define MueLu_runDroppingFunctorsImpl(...) \
443  { \
444  auto debug = Misc::DebugFunctor(lclA, results); \
445  auto countingFunctor = MatrixConstruction::PointwiseCountingFunctor(lclA, results, filtered_rowptr, __VA_ARGS__, debug); \
446  Kokkos::parallel_scan("MueLu::CoalesceDrop::CountEntries", range, countingFunctor, nnz_filtered); \
447  }
448 #endif
449 
450  // Macro that handles optional block diagonalization.
451  // Calls MueLu_runDroppingFunctorsImpl
452 #define MueLu_runDroppingFunctors(...) \
453  { \
454  if (useBlocking) { \
455  auto BlockNumber = Get<RCP<LocalOrdinalVector>>(currentLevel, "BlockNumber"); \
456  auto block_diagonalize = Misc::BlockDiagonalizeFunctor(*A, *BlockNumber, results); \
457  MueLu_runDroppingFunctorsImpl(block_diagonalize, __VA_ARGS__); \
458  } else \
459  MueLu_runDroppingFunctorsImpl(__VA_ARGS__); \
460  }
461 
462  // Macro that runs dropping for SoC based on A itself, handling of droppingMethod.
463  // Calls MueLu_runDroppingFunctors
464 #define MueLu_runDroppingFunctors_on_A(SoC) \
465  { \
466  if (droppingMethod == "point-wise") { \
467  auto dropping = ClassicalDropping::make_drop_functor<SoC>(*A, threshold, results); \
468  \
469  if (aggregationMayCreateDirichlet) { \
470  MueLu_runDroppingFunctors(dropping, \
471  drop_boundaries, \
472  preserve_diagonals, \
473  mark_singletons_as_boundary); \
474  } else { \
475  MueLu_runDroppingFunctors(dropping, \
476  drop_boundaries, \
477  preserve_diagonals); \
478  } \
479  } else if (droppingMethod == "cut-drop") { \
480  auto comparison = CutDrop::make_comparison_functor<SoC>(*A, results); \
481  auto cut_drop = CutDrop::CutDropFunctor(comparison, threshold); \
482  \
483  MueLu_runDroppingFunctors(drop_boundaries, \
484  preserve_diagonals, \
485  cut_drop); \
486  } \
487  }
488 
489  // Macro that runs on the distance Laplacian, handling of droppingMethod.
490  // Calls MueLu_runDroppingFunctors
491 #define MueLu_runDroppingFunctors_on_dlap_inner(SoC) \
492  { \
493  if (droppingMethod == "point-wise") { \
494  auto dist_laplacian_dropping = DistanceLaplacian::make_drop_functor<SoC>(*A, threshold, dist2, results); \
495  \
496  if (aggregationMayCreateDirichlet) { \
497  MueLu_runDroppingFunctors(dist_laplacian_dropping, \
498  drop_boundaries, \
499  preserve_diagonals, \
500  mark_singletons_as_boundary); \
501  } else { \
502  MueLu_runDroppingFunctors(dist_laplacian_dropping, \
503  drop_boundaries, \
504  preserve_diagonals); \
505  } \
506  } else if (droppingMethod == "cut-drop") { \
507  auto comparison = CutDrop::make_dlap_comparison_functor<SoC>(*A, dist2, results); \
508  auto cut_drop = CutDrop::CutDropFunctor(comparison, threshold); \
509  \
510  MueLu_runDroppingFunctors(drop_boundaries, \
511  preserve_diagonals, \
512  cut_drop); \
513  } \
514  }
515 
516  // Macro that runs on the distance Laplacian, handling of distanceLaplacianMetric.
517  // Calls MueLu_runDroppingFunctors_on_dlap_inner
518 #define MueLu_runDroppingFunctors_on_dlap(SoC) \
519  { \
520  if (distanceLaplacianMetric == "unweighted") { \
521  auto dist2 = DistanceLaplacian::UnweightedDistanceFunctor(*A, coords); \
522  MueLu_runDroppingFunctors_on_dlap_inner(SoC); \
523  } else if (distanceLaplacianMetric == "material") { \
524  auto material = GetMaterial(currentLevel, coords->getNumVectors()); \
525  if (material->getNumVectors() == 1) { \
526  auto dist2 = DistanceLaplacian::ScalarMaterialDistanceFunctor(*A, coords, material); \
527  MueLu_runDroppingFunctors_on_dlap_inner(SoC); \
528  } else { \
529  auto dist2 = DistanceLaplacian::TensorMaterialDistanceFunctor(*A, coords, material); \
530  MueLu_runDroppingFunctors_on_dlap_inner(SoC); \
531  } \
532  } \
533  }
534 
535  // rowptr of filtered A
536  auto filtered_rowptr = rowptr_type("filtered_rowptr", lclA.numRows() + 1);
537  // Number of nonzeros of filtered A
538  LocalOrdinal nnz_filtered = 0;
539  // dropping decisions for each entry
540  auto results = Kokkos::View<DecisionType*, memory_space>("results", lclA.nnz()); // initialized to UNDECIDED
541  {
542  SubFactoryMonitor mDropping(*this, "Dropping decisions", currentLevel);
543 
544  auto drop_boundaries = Misc::PointwiseDropBoundaryFunctor(lclA, boundaryNodes, results);
545 
546  if (threshold != zero) {
547  auto preserve_diagonals = Misc::KeepDiagonalFunctor(lclA, results);
548  auto mark_singletons_as_boundary = Misc::MarkSingletonFunctor(lclA, boundaryNodes, results);
549 
550  if (socUsesMatrix == "A") {
551  if (socUsesMeasure == "unscaled") {
553  } else if (socUsesMeasure == "smoothed aggregation") {
555  } else if (socUsesMeasure == "signed ruge-stueben") {
557  } else if (socUsesMeasure == "signed smoothed aggregation") {
559  }
560  } else if (socUsesMatrix == "distance laplacian") {
561  auto coords = Get<RCP<doubleMultiVector>>(currentLevel, "Coordinates");
562  if (socUsesMeasure == "unscaled") {
564  } else if (socUsesMeasure == "smoothed aggregation") {
566  } else if (socUsesMeasure == "signed ruge-stueben") {
568  } else if (socUsesMeasure == "signed smoothed aggregation") {
570  }
571  }
572  } else {
573  Kokkos::deep_copy(results, KEEP);
574  // FIXME: This seems inconsistent
575  // MueLu_runDroppingFunctors(drop_boundaries);
576  auto no_op = Misc::NoOpFunctor<LocalOrdinal>();
578  }
579 
580  if (symmetrizeDroppedGraph) {
581  auto symmetrize = Misc::SymmetrizeFunctor(lclA, results);
582  MueLu_runDroppingFunctors(symmetrize);
583  }
584  }
585  GO numDropped = lclA.nnz() - nnz_filtered;
586  // We now know the number of entries of filtered A and have the final rowptr.
587 
589  // Pass 4: Create local matrix for filtered A
590  //
591  // Dropped entries are optionally lumped to the diagonal.
592 
593  RCP<Matrix> filteredA;
594  RCP<LWGraph_kokkos> graph;
595  {
596  SubFactoryMonitor mFill(*this, "Filtered matrix fill", currentLevel);
597 
598  local_matrix_type lclFilteredA;
599  local_graph_type lclGraph;
600  if (reuseGraph) {
601  filteredA = MatrixFactory::BuildCopy(A);
602  lclFilteredA = filteredA->getLocalMatrixDevice();
603 
604  auto colidx = entries_type("entries", nnz_filtered);
605  lclGraph = local_graph_type(colidx, filtered_rowptr);
606  } else {
607  auto colidx = entries_type("entries", nnz_filtered);
608  auto values = values_type("values", nnz_filtered);
609  lclFilteredA = local_matrix_type("filteredA",
610  lclA.numRows(), lclA.numCols(),
611  nnz_filtered,
612  values, filtered_rowptr, colidx);
613  }
614 
615  if (lumping) {
616  if (reuseGraph) {
617  auto fillFunctor = MatrixConstruction::PointwiseFillReuseFunctor<local_matrix_type, local_graph_type, true>(lclA, results, lclFilteredA, lclGraph, filteringDirichletThreshold);
618  Kokkos::parallel_for("MueLu::CoalesceDrop::Fill_lumped_reuse", range, fillFunctor);
619  } else {
620  auto fillFunctor = MatrixConstruction::PointwiseFillNoReuseFunctor<local_matrix_type, true>(lclA, results, lclFilteredA, filteringDirichletThreshold);
621  Kokkos::parallel_for("MueLu::CoalesceDrop::Fill_lumped_noreuse", range, fillFunctor);
622  }
623  } else {
624  if (reuseGraph) {
625  auto fillFunctor = MatrixConstruction::PointwiseFillReuseFunctor<local_matrix_type, local_graph_type, false>(lclA, results, lclFilteredA, lclGraph, filteringDirichletThreshold);
626  Kokkos::parallel_for("MueLu::CoalesceDrop::Fill_unlumped_reuse", range, fillFunctor);
627  } else {
628  auto fillFunctor = MatrixConstruction::PointwiseFillNoReuseFunctor<local_matrix_type, false>(lclA, results, lclFilteredA, filteringDirichletThreshold);
629  Kokkos::parallel_for("MueLu::CoalesceDrop::Fill_unlumped_noreuse", range, fillFunctor);
630  }
631  }
632 
633  if (!reuseGraph)
634  filteredA = MatrixFactory::Build(lclFilteredA, A->getRowMap(), A->getColMap(), A->getDomainMap(), A->getRangeMap());
635  filteredA->SetFixedBlockSize(A->GetFixedBlockSize());
636 
637  if (reuseEigenvalue) {
638  // Reuse max eigenvalue from A
639  // It is unclear what eigenvalue is the best for the smoothing, but we already may have
640  // the D^{-1}A estimate in A, may as well use it.
641  // NOTE: ML does that too
642  filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
643  } else {
644  filteredA->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
645  }
646 
647  if (!reuseGraph) {
648  // Use graph of filteredA as graph.
649  lclGraph = filteredA->getCrsGraph()->getLocalGraphDevice();
650  }
651  graph = rcp(new LWGraph_kokkos(lclGraph, filteredA->getRowMap(), filteredA->getColMap(), "amalgamated graph of A"));
652  graph->SetBoundaryNodeMap(boundaryNodes);
653  }
654 
655  // Construct a second graph for coloring
656  if (generateColoringGraph) {
657  SubFactoryMonitor mColoringGraph(*this, "Construct coloring graph", currentLevel);
658 
659  filtered_rowptr = rowptr_type("rowptr_coloring_graph", lclA.numRows() + 1);
660  if (localizeColoringGraph) {
661  auto drop_offrank = Misc::DropOffRankFunctor(lclA, results);
662  MueLu_runDroppingFunctors(drop_offrank);
663  }
664  if (symmetrizeColoringGraph) {
665  auto symmetrize = Misc::SymmetrizeFunctor(lclA, results);
666  MueLu_runDroppingFunctors(symmetrize);
667  }
668  auto colidx = entries_type("entries_coloring_graph", nnz_filtered);
669  auto lclGraph = local_graph_type(colidx, filtered_rowptr);
670  auto graphConstruction = MatrixConstruction::GraphConstruction<local_matrix_type, local_graph_type>(lclA, results, lclGraph);
671  Kokkos::parallel_for("MueLu::CoalesceDrop::Construct_coloring_graph", range, graphConstruction);
672 
673  auto colorGraph = rcp(new LWGraph_kokkos(lclGraph, filteredA->getRowMap(), filteredA->getColMap(), "coloring graph"));
674  Set(currentLevel, "Coloring Graph", colorGraph);
675  }
676 
677 #undef MueLu_runDroppingFunctors_on_A
678 #undef MueLu_runDroppingFunctors_on_dlap_inner
679 #undef MueLu_runDroppingFunctors_on_dlap
680 #undef MueLu_runDroppingFunctors
681 #undef MueLu_runDroppingFunctorsImpl
682 
683  LO dofsPerNode = 1;
684  Set(currentLevel, "DofsPerNode", dofsPerNode);
685  Set(currentLevel, "Graph", graph);
686  Set(currentLevel, "A", filteredA);
687 
688  return std::make_tuple(numDropped, boundaryNodes);
689 }
690 
691 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
692 std::tuple<GlobalOrdinal, typename MueLu::LWGraph_kokkos<LocalOrdinal, GlobalOrdinal, Node>::boundary_nodes_type> CoalesceDropFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
693  BuildVector(Level& currentLevel) const {
694  FactoryMonitor m(*this, "Build", currentLevel);
695 
698  using local_matrix_type = typename MatrixType::local_matrix_type;
699  using local_graph_type = typename GraphType::local_graph_type;
700  using rowptr_type = typename local_graph_type::row_map_type::non_const_type;
701  using entries_type = typename local_graph_type::entries_type::non_const_type;
702  using values_type = typename local_matrix_type::values_type::non_const_type;
703  using device_type = typename Node::device_type;
704  using memory_space = typename device_type::memory_space;
705  using magnitudeType = typename Teuchos::ScalarTraits<Scalar>::magnitudeType;
706  using doubleMultiVector = Xpetra::MultiVector<magnitudeType, LO, GO, NO>;
707 
708  typedef Teuchos::ScalarTraits<Scalar> STS;
709  const magnitudeType zero = Teuchos::ScalarTraits<magnitudeType>::zero();
710 
711  auto A = Get<RCP<Matrix>>(currentLevel, "A");
712 
713  /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
714  blkSize is the number of storage blocks that must kept together during the amalgamation process.
715 
716  Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
717 
718  numPDEs = blkSize * storageblocksize.
719 
720  If numPDEs==1
721  Matrix is point storage (classical CRS storage). storageblocksize=1 and blkSize=1
722  No other values makes sense.
723 
724  If numPDEs>1
725  If matrix uses point storage, then storageblocksize=1 and blkSize=numPDEs.
726  If matrix uses block storage, with block size of n, then storageblocksize=n, and blkSize=numPDEs/n.
727  Thus far, only storageblocksize=numPDEs and blkSize=1 has been tested.
728  */
729 
730  TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
731  LO blkSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
732 
733  auto amalInfo = Get<RCP<AmalgamationInfo>>(currentLevel, "UnAmalgamationInfo");
734 
735  const RCP<const Map> rowMap = A->getRowMap();
736  const RCP<const Map> colMap = A->getColMap();
737 
738  // build a node row map (uniqueMap = non-overlapping) and a node column map
739  // (nonUniqueMap = overlapping). The arrays rowTranslation and colTranslation
740  // stored in the AmalgamationInfo class container contain the local node id
741  // given a local dof id. The data is calculated in the AmalgamationFactory and
742  // stored in the variable "UnAmalgamationInfo" (which is of type AmalagamationInfo)
743  const RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
744  const RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
745  Array<LO> rowTranslationArray = *(amalInfo->getRowTranslation()); // TAW should be transform that into a View?
746  Array<LO> colTranslationArray = *(amalInfo->getColTranslation());
747 
748  Kokkos::View<LO*, Kokkos::MemoryUnmanaged>
749  rowTranslationView(rowTranslationArray.getRawPtr(), rowTranslationArray.size());
750  Kokkos::View<LO*, Kokkos::MemoryUnmanaged>
751  colTranslationView(colTranslationArray.getRawPtr(), colTranslationArray.size());
752 
753  // get number of local nodes
754  LO numNodes = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
755  typedef typename Kokkos::View<LocalOrdinal*, typename Node::device_type> id_translation_type;
756  id_translation_type rowTranslation("dofId2nodeId", rowTranslationArray.size());
757  id_translation_type colTranslation("ov_dofId2nodeId", colTranslationArray.size());
758  Kokkos::deep_copy(rowTranslation, rowTranslationView);
759  Kokkos::deep_copy(colTranslation, colTranslationView);
760 
761  // extract striding information
762  blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
763  LocalOrdinal blkId = -1; //< the block id within a strided map or -1 if it is a full block map
764  LocalOrdinal blkPartSize = A->GetFixedBlockSize(); //< stores block size of part blkId (or the full block size)
765  if (A->IsView("stridedMaps") == true) {
766  const RCP<const Map> myMap = A->getRowMap("stridedMaps");
767  const RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
768  TEUCHOS_TEST_FOR_EXCEPTION(strMap.is_null() == true, Exceptions::RuntimeError, "Map is not of type stridedMap");
769  blkSize = Teuchos::as<const LocalOrdinal>(strMap->getFixedBlockSize());
770  blkId = strMap->getStridedBlockId();
771  if (blkId > -1)
772  blkPartSize = Teuchos::as<LocalOrdinal>(strMap->getStridingData()[blkId]);
773  }
774 
775  TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() % blkPartSize != 0, MueLu::Exceptions::RuntimeError, "MueLu::CoalesceDropFactory: Number of local elements is " << A->getRowMap()->getLocalNumElements() << " but should be a multiple of " << blkPartSize);
776 
778  // Process parameterlist
779  const ParameterList& pL = GetParameterList();
780 
781  // Boundary detection
782  const magnitudeType dirichletThreshold = STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
783  const magnitudeType rowSumTol = as<magnitudeType>(pL.get<double>("aggregation: row sum drop tol"));
784  const LocalOrdinal dirichletNonzeroThreshold = 1;
785  const bool useGreedyDirichlet = pL.get<bool>("aggregation: greedy Dirichlet");
786  TEUCHOS_TEST_FOR_EXCEPTION(rowSumTol > zero, MueLu::Exceptions::RuntimeError, "MueLu::CoalesceDropFactory: RowSum is not implemented for vectorial problems.");
787 
788  // Dropping
789  bool useBlocking = pL.get<bool>("aggregation: use blocking");
790  std::string droppingMethod = pL.get<std::string>("aggregation: drop scheme");
791  std::string socUsesMatrix = pL.get<std::string>("aggregation: strength-of-connection: matrix");
792  std::string socUsesMeasure = pL.get<std::string>("aggregation: strength-of-connection: measure");
793  std::string distanceLaplacianMetric = pL.get<std::string>("aggregation: distance laplacian metric");
794  bool symmetrizeDroppedGraph = pL.get<bool>("aggregation: symmetrize graph after dropping");
795  magnitudeType threshold;
796  // If we're doing the ML-style halving of the drop tol at each level, we do that here.
797  if (pL.get<bool>("aggregation: use ml scaling of drop tol"))
798  threshold = pL.get<double>("aggregation: drop tol") / pow(2.0, currentLevel.GetLevelID());
799  else
800  threshold = as<magnitudeType>(pL.get<double>("aggregation: drop tol"));
801  bool aggregationMayCreateDirichlet = pL.get<bool>("aggregation: dropping may create Dirichlet");
802 
803  // Fill
804  const bool lumping = pL.get<bool>("filtered matrix: use lumping");
805  const bool reuseGraph = pL.get<bool>("filtered matrix: reuse graph");
806  const bool reuseEigenvalue = pL.get<bool>("filtered matrix: reuse eigenvalue");
807 
808  const bool useRootStencil = pL.get<bool>("filtered matrix: use root stencil");
809  const bool useSpreadLumping = pL.get<bool>("filtered matrix: use spread lumping");
810 
811  const magnitudeType filteringDirichletThreshold = as<magnitudeType>(pL.get<double>("filtered matrix: Dirichlet threshold"));
812 
813  // coloring graph
814  bool generateColoringGraph = pL.get<bool>("aggregation: coloring: use color graph");
815  const bool localizeColoringGraph = pL.get<bool>("aggregation: coloring: localize color graph");
816  const bool symmetrizeColoringGraph = true;
817 
818 #ifdef HAVE_MUELU_COALESCEDROP_ALLOW_OLD_PARAMETERS
819  translateOldAlgoParam(pL, droppingMethod, useBlocking, socUsesMatrix, socUsesMeasure, symmetrizeDroppedGraph, generateColoringGraph, threshold);
820 #endif
821  {
822  std::stringstream ss;
823  ss << "dropping scheme = \"" << droppingMethod << "\", strength-of-connection measure = \"" << socUsesMeasure << "\", strength-of-connection matrix = \"" << socUsesMatrix << "\", ";
824  if (socUsesMatrix == "distance laplacian")
825  ss << "distance laplacian metric = \"" << distanceLaplacianMetric << "\", ";
826  ss << "threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << ", useBlocking = " << useBlocking;
827  ss << ", symmetrizeDroppedGraph = " << symmetrizeDroppedGraph << std::endl;
828 
829  GetOStream(Runtime0) << ss.str();
830  }
831 
832  TEUCHOS_ASSERT(!useRootStencil);
833  TEUCHOS_ASSERT(!useSpreadLumping);
834  if (droppingMethod == "cut-drop")
835  TEUCHOS_TEST_FOR_EXCEPTION(threshold > 1.0, Exceptions::RuntimeError, "For cut-drop algorithms, \"aggregation: drop tol\" = " << threshold << ", needs to be <= 1.0");
836 
838  // We perform four sweeps over the rows of A:
839  // Pass 1: detection of boundary nodes
840  // Pass 2: diagonal extraction
841  // Pass 3: drop decision for each entry and construction of the rowptr of the filtered matrix
842  // Pass 4: fill of the filtered matrix
843  //
844  // Pass 1 and 3 apply a sequence of criteria to each row of the matrix.
845 
846  // TODO: We could merge pass 1 and 2.
847 
848  auto crsA = toCrsMatrix(A);
849  auto lclA = crsA->getLocalMatrixDevice();
850  auto range = range_type(0, numNodes);
851 
853  // Pass 1: Detect boundary nodes
854  //
855  // The following criteria are available:
856  // - BoundaryDetection::VectorDirichletFunctor
857  // Marks rows as Dirichlet based on value threshold and number of off-diagonal entries
858 
859  // Dirichlet nodes
860  auto boundaryNodes = boundary_nodes_type("boundaryNodes", numNodes); // initialized to false
861  {
862  SubFactoryMonitor mBoundary(*this, "Boundary detection", currentLevel);
863 
864 #define MueLu_runBoundaryFunctors(...) \
865  { \
866  auto boundaries = BoundaryDetection::BoundaryFunctor(lclA, __VA_ARGS__); \
867  Kokkos::parallel_for("CoalesceDrop::BoundaryDetection", range, boundaries); \
868  }
869 
870  if (useGreedyDirichlet) {
871  auto dirichlet_detection = BoundaryDetection::VectorDirichletFunctor<local_matrix_type, true>(lclA, blkPartSize, boundaryNodes, dirichletThreshold, dirichletNonzeroThreshold);
872  MueLu_runBoundaryFunctors(dirichlet_detection);
873  } else {
874  auto dirichlet_detection = BoundaryDetection::VectorDirichletFunctor<local_matrix_type, false>(lclA, blkPartSize, boundaryNodes, dirichletThreshold, dirichletNonzeroThreshold);
875  MueLu_runBoundaryFunctors(dirichlet_detection);
876  }
877 #undef MueLu_runBoundaryFunctors
878  }
879  // In what follows, boundaryNodes can still still get modified if aggregationMayCreateDirichlet == true.
880  // Otherwise we're now done with it now.
881 
883  // Pass 2 & 3: Diagonal extraction and determine dropping and construct
884  // rowptr of filtered matrix
885  //
886  // The following criteria are available:
887  // - Misc::VectorDropBoundaryFunctor
888  // Drop all rows that have been marked as Dirichlet
889  // - Misc::DropOffRankFunctor
890  // Drop all entries that are off-rank
891  // - ClassicalDropping::DropFunctor
892  // Classical dropping
893  // - DistanceLaplacian::VectorDropFunctor
894  // Distance Laplacian dropping
895  // - Misc::KeepDiagonalFunctor
896  // Mark diagonal as KEEP
897  // - Misc::MarkSingletonFunctor
898  // Mark singletons after dropping as Dirichlet
899 
900 #if !defined(HAVE_MUELU_DEBUG)
901 #define MueLu_runDroppingFunctorsImpl(...) \
902  { \
903  auto countingFunctor = MatrixConstruction::VectorCountingFunctor(lclA, blkPartSize, colTranslation, results, filtered_rowptr, graph_rowptr, __VA_ARGS__); \
904  Kokkos::parallel_scan("MueLu::CoalesceDrop::CountEntries", range, countingFunctor, nnz); \
905  }
906 #else
907 #define MueLu_runDroppingFunctorsImpl(...) \
908  { \
909  auto debug = Misc::DebugFunctor(lclA, results); \
910  auto countingFunctor = MatrixConstruction::VectorCountingFunctor(lclA, blkPartSize, colTranslation, results, filtered_rowptr, graph_rowptr, __VA_ARGS__, debug); \
911  Kokkos::parallel_scan("MueLu::CoalesceDrop::CountEntries", range, countingFunctor, nnz); \
912  }
913 #endif
914 
915  // Macro that handles optional block diagonalization.
916  // Calls MueLu_runDroppingFunctorsImpl
917 #define MueLu_runDroppingFunctors(...) \
918  { \
919  if (useBlocking) { \
920  auto BlockNumber = Get<RCP<LocalOrdinalVector>>(currentLevel, "BlockNumber"); \
921  auto block_diagonalize = Misc::BlockDiagonalizeVectorFunctor(*A, *BlockNumber, nonUniqueMap, results, rowTranslation, colTranslation); \
922  MueLu_runDroppingFunctorsImpl(block_diagonalize, __VA_ARGS__); \
923  } else \
924  MueLu_runDroppingFunctorsImpl(__VA_ARGS__); \
925  }
926 
927  // Macro that runs dropping for SoC based on A itself, handling of droppingMethod.
928  // Calls MueLu_runDroppingFunctors
929 #define MueLu_runDroppingFunctors_on_A(SoC) \
930  { \
931  if (droppingMethod == "point-wise") { \
932  auto dropping = ClassicalDropping::make_drop_functor<SoC>(*A, threshold, results); \
933  \
934  if (aggregationMayCreateDirichlet) { \
935  MueLu_runDroppingFunctors(dropping, \
936  preserve_diagonals, \
937  mark_singletons_as_boundary); \
938  } else { \
939  MueLu_runDroppingFunctors(dropping, \
940  preserve_diagonals); \
941  } \
942  } else if (droppingMethod == "cut-drop") { \
943  auto comparison = CutDrop::make_comparison_functor<SoC>(*A, results); \
944  auto cut_drop = CutDrop::CutDropFunctor(comparison, threshold); \
945  \
946  MueLu_runDroppingFunctors(drop_boundaries, \
947  preserve_diagonals, \
948  cut_drop); \
949  } \
950  }
951 
952  // Macro that runs on the distance Laplacian, handling of droppingMethod.
953  // Calls MueLu_runDroppingFunctors
954 #define MueLu_runDroppingFunctors_on_dlap_inner(SoC) \
955  { \
956  if (droppingMethod == "point-wise") { \
957  auto dist_laplacian_dropping = DistanceLaplacian::make_vector_drop_functor<SoC>(*A, *mergedA, threshold, dist2, results, rowTranslation, colTranslation); \
958  \
959  if (aggregationMayCreateDirichlet) { \
960  MueLu_runDroppingFunctors(dist_laplacian_dropping, \
961  preserve_diagonals, \
962  mark_singletons_as_boundary); \
963  } else { \
964  MueLu_runDroppingFunctors(dist_laplacian_dropping, \
965  preserve_diagonals); \
966  } \
967  } else if (droppingMethod == "cut-drop") { \
968  auto comparison = CutDrop::make_dlap_comparison_functor<SoC>(*A, dist2, results); \
969  auto cut_drop = CutDrop::CutDropFunctor(comparison, threshold); \
970  \
971  MueLu_runDroppingFunctors(drop_boundaries, \
972  preserve_diagonals, \
973  cut_drop); \
974  } \
975  }
976 
977  // Macro that runs on the distance Laplacian, handling of distanceLaplacianMetric.
978  // Calls MueLu_runDroppingFunctors_on_dlap_inner
979 #define MueLu_runDroppingFunctors_on_dlap(SoC) \
980  { \
981  if (distanceLaplacianMetric == "unweighted") { \
982  auto dist2 = DistanceLaplacian::UnweightedDistanceFunctor(*mergedA, coords); \
983  MueLu_runDroppingFunctors_on_dlap_inner(SoC); \
984  } else if (distanceLaplacianMetric == "weighted") { \
985  auto k_dlap_weights_host = Kokkos::View<double*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>(&dlap_weights[0], dlap_weights.size()); \
986  auto k_dlap_weights = Kokkos::View<double*>("dlap_weights", k_dlap_weights_host.extent(0)); \
987  Kokkos::deep_copy(k_dlap_weights, k_dlap_weights_host); \
988  auto dist2 = DistanceLaplacian::WeightedDistanceFunctor(*mergedA, coords, k_dlap_weights); \
989  MueLu_runDroppingFunctors_on_dlap_inner(SoC); \
990  } else if (distanceLaplacianMetric == "block weighted") { \
991  auto k_dlap_weights_host = Kokkos::View<double*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>(&dlap_weights[0], dlap_weights.size()); \
992  auto k_dlap_weights = Kokkos::View<double*>("dlap_weights", k_dlap_weights_host.extent(0)); \
993  Kokkos::deep_copy(k_dlap_weights, k_dlap_weights_host); \
994  auto dist2 = DistanceLaplacian::BlockWeightedDistanceFunctor(*mergedA, coords, k_dlap_weights, interleaved_blocksize); \
995  MueLu_runDroppingFunctors_on_dlap_inner(SoC); \
996  } else if (distanceLaplacianMetric == "material") { \
997  auto material = GetMaterial(currentLevel, coords->getNumVectors()); \
998  if (material->getNumVectors() == 1) { \
999  auto dist2 = DistanceLaplacian::ScalarMaterialDistanceFunctor(*mergedA, coords, material); \
1000  MueLu_runDroppingFunctors_on_dlap_inner(SoC); \
1001  } else { \
1002  auto dist2 = DistanceLaplacian::TensorMaterialDistanceFunctor(*mergedA, coords, material); \
1003  MueLu_runDroppingFunctors_on_dlap_inner(SoC); \
1004  } \
1005  } \
1006  }
1007 
1008  // rowptr of filtered A
1009  auto filtered_rowptr = rowptr_type("rowptr", lclA.numRows() + 1);
1010  auto graph_rowptr = rowptr_type("rowptr", numNodes + 1);
1011  // Number of nonzeros of filtered A and graph
1012  Kokkos::pair<LocalOrdinal, LocalOrdinal> nnz = {0, 0};
1013 
1014  // dropping decisions for each entry
1015  auto results = Kokkos::View<DecisionType*, memory_space>("results", lclA.nnz()); // initialized to UNDECIDED
1016  {
1017  SubFactoryMonitor mDropping(*this, "Dropping decisions", currentLevel);
1018 
1019  auto drop_boundaries = Misc::VectorDropBoundaryFunctor(lclA, rowTranslation, boundaryNodes, results);
1020 
1021  if (threshold != zero) {
1022  auto preserve_diagonals = Misc::KeepDiagonalFunctor(lclA, results);
1023  auto mark_singletons_as_boundary = Misc::MarkSingletonVectorFunctor(lclA, rowTranslation, boundaryNodes, results);
1024 
1025  if (socUsesMatrix == "A") {
1026  if (socUsesMeasure == "unscaled") {
1028  } else if (socUsesMeasure == "smoothed aggregation") {
1030  } else if (socUsesMeasure == "signed ruge-stueben") {
1032  } else if (socUsesMeasure == "signed smoothed aggregation") {
1034  }
1035  } else if (socUsesMatrix == "distance laplacian") {
1036  auto coords = Get<RCP<doubleMultiVector>>(currentLevel, "Coordinates");
1037 
1038  Array<double> dlap_weights = pL.get<Array<double>>("aggregation: distance laplacian directional weights");
1039  LocalOrdinal interleaved_blocksize = as<LocalOrdinal>(pL.get<int>("aggregation: block diagonal: interleaved blocksize"));
1040  if (socUsesMeasure == "distance laplacian") {
1041  LO dim = (LO)coords->getNumVectors();
1042  // If anything isn't 1.0 we need to turn on the weighting
1043  bool non_unity = false;
1044  for (LO i = 0; !non_unity && i < (LO)dlap_weights.size(); i++) {
1045  if (dlap_weights[i] != 1.0) {
1046  non_unity = true;
1047  }
1048  }
1049  if (non_unity) {
1050  if ((LO)dlap_weights.size() == dim) {
1051  distanceLaplacianMetric = "weighted";
1052  } else if ((LO)dlap_weights.size() == interleaved_blocksize * dim)
1053  distanceLaplacianMetric = "block weighted";
1054  else {
1055  TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError,
1056  "length of 'aggregation: distance laplacian directional weights' must equal the coordinate dimension OR the coordinate dimension times the blocksize");
1057  }
1058  if (GetVerbLevel() & Statistics1)
1059  GetOStream(Statistics1) << "Using distance laplacian weights: " << dlap_weights << std::endl;
1060  }
1061  }
1062 
1063  RCP<Matrix> mergedA;
1064  {
1065  // Construct merged A.
1066 
1067  auto merged_rowptr = rowptr_type("rowptr", numNodes + 1);
1068  LocalOrdinal nnz_merged = 0;
1069 
1070  auto functor = MatrixConstruction::MergeCountFunctor(lclA, blkPartSize, colTranslation, merged_rowptr);
1071  Kokkos::parallel_scan("MergeCount", range, functor, nnz_merged);
1072 
1073  local_graph_type lclMergedGraph;
1074  auto colidx_merged = entries_type("entries", nnz_merged);
1075  auto values_merged = values_type("values", nnz_merged);
1076 
1077  local_matrix_type lclMergedA = local_matrix_type("mergedA",
1078  numNodes, nonUniqueMap->getLocalNumElements(),
1079  nnz_merged,
1080  values_merged, merged_rowptr, colidx_merged);
1081 
1082  auto fillFunctor = MatrixConstruction::MergeFillFunctor<local_matrix_type>(lclA, blkSize, colTranslation, lclMergedA);
1083  Kokkos::parallel_for("MueLu::CoalesceDrop::MergeFill", range, fillFunctor);
1084 
1085  mergedA = Xpetra::MatrixFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(lclMergedA, uniqueMap, nonUniqueMap, uniqueMap, uniqueMap);
1086  }
1087 
1088  if (socUsesMeasure == "unscaled") {
1090  } else if (socUsesMeasure == "smoothed aggregation") {
1092  } else if (socUsesMeasure == "signed ruge-stueben") {
1094  } else if (socUsesMeasure == "signed smoothed aggregation") {
1096  }
1097  }
1098  } else {
1099  Kokkos::deep_copy(results, KEEP);
1100  // MueLu_runDroppingFunctors(drop_boundaries);
1101  auto no_op = Misc::NoOpFunctor<LocalOrdinal>();
1103  }
1104 
1105  if (symmetrizeDroppedGraph) {
1106  auto symmetrize = Misc::SymmetrizeFunctor(lclA, results);
1107  MueLu_runDroppingFunctors(symmetrize);
1108  }
1109  }
1110  LocalOrdinal nnz_filtered = nnz.first;
1111  LocalOrdinal nnz_graph = nnz.second;
1112  GO numTotal = lclA.nnz();
1113  GO numDropped = numTotal - nnz_filtered;
1114  // We now know the number of entries of filtered A and have the final rowptr.
1115 
1117  // Pass 4: Create local matrix for filtered A
1118  //
1119  // Dropped entries are optionally lumped to the diagonal.
1120 
1121  RCP<Matrix> filteredA;
1122  RCP<LWGraph_kokkos> graph;
1123  {
1124  SubFactoryMonitor mFill(*this, "Filtered matrix fill", currentLevel);
1125 
1126  local_matrix_type lclFilteredA;
1127  if (reuseGraph) {
1128  lclFilteredA = local_matrix_type("filteredA", lclA.graph, lclA.numCols());
1129  } else {
1130  auto colidx = entries_type("entries", nnz_filtered);
1131  auto values = values_type("values", nnz_filtered);
1132  lclFilteredA = local_matrix_type("filteredA",
1133  lclA.numRows(), lclA.numCols(),
1134  nnz_filtered,
1135  values, filtered_rowptr, colidx);
1136  }
1137 
1138  local_graph_type lclGraph;
1139  {
1140  auto colidx = entries_type("entries", nnz_graph);
1141  lclGraph = local_graph_type(colidx, graph_rowptr);
1142  }
1143 
1144  if (lumping) {
1145  if (reuseGraph) {
1146  auto fillFunctor = MatrixConstruction::VectorFillFunctor<local_matrix_type, true, true>(lclA, blkPartSize, colTranslation, results, lclFilteredA, lclGraph, filteringDirichletThreshold);
1147  Kokkos::parallel_for("MueLu::CoalesceDrop::Fill_lumped_reuse", range, fillFunctor);
1148  } else {
1149  auto fillFunctor = MatrixConstruction::VectorFillFunctor<local_matrix_type, true, false>(lclA, blkPartSize, colTranslation, results, lclFilteredA, lclGraph, filteringDirichletThreshold);
1150  Kokkos::parallel_for("MueLu::CoalesceDrop::Fill_lumped_noreuse", range, fillFunctor);
1151  }
1152  } else {
1153  if (reuseGraph) {
1154  auto fillFunctor = MatrixConstruction::VectorFillFunctor<local_matrix_type, false, true>(lclA, blkSize, colTranslation, results, lclFilteredA, lclGraph, filteringDirichletThreshold);
1155  Kokkos::parallel_for("MueLu::CoalesceDrop::Fill_unlumped_reuse", range, fillFunctor);
1156  } else {
1157  auto fillFunctor = MatrixConstruction::VectorFillFunctor<local_matrix_type, false, false>(lclA, blkSize, colTranslation, results, lclFilteredA, lclGraph, filteringDirichletThreshold);
1158  Kokkos::parallel_for("MueLu::CoalesceDrop::Fill_unlumped_noreuse", range, fillFunctor);
1159  }
1160  }
1161 
1162  filteredA = Xpetra::MatrixFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(lclFilteredA, A->getRowMap(), A->getColMap(), A->getDomainMap(), A->getRangeMap());
1163  filteredA->SetFixedBlockSize(blkSize);
1164 
1165  if (reuseEigenvalue) {
1166  // Reuse max eigenvalue from A
1167  // It is unclear what eigenvalue is the best for the smoothing, but we already may have
1168  // the D^{-1}A estimate in A, may as well use it.
1169  // NOTE: ML does that too
1170  filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
1171  } else {
1172  filteredA->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
1173  }
1174 
1175  graph = rcp(new LWGraph_kokkos(lclGraph, uniqueMap, nonUniqueMap, "amalgamated graph of A"));
1176  graph->SetBoundaryNodeMap(boundaryNodes);
1177  }
1178 
1179  // Construct a second graph for coloring
1180  if (generateColoringGraph) {
1181  SubFactoryMonitor mColoringGraph(*this, "Construct coloring graph", currentLevel);
1182 
1183  filtered_rowptr = rowptr_type("rowptr_coloring_graph", lclA.numRows() + 1);
1184  graph_rowptr = rowptr_type("rowptr", numNodes + 1);
1185  if (localizeColoringGraph) {
1186  auto drop_offrank = Misc::DropOffRankFunctor(lclA, results);
1187  MueLu_runDroppingFunctors(drop_offrank);
1188  }
1189  if (symmetrizeColoringGraph) {
1190  auto symmetrize = Misc::SymmetrizeFunctor(lclA, results);
1191  MueLu_runDroppingFunctors(symmetrize);
1192  }
1193  auto colidx = entries_type("entries_coloring_graph", nnz_filtered);
1194  auto lclGraph = local_graph_type(colidx, filtered_rowptr);
1195  auto graphConstruction = MatrixConstruction::GraphConstruction<local_matrix_type, local_graph_type>(lclA, results, lclGraph);
1196  Kokkos::parallel_for("MueLu::CoalesceDrop::Construct_coloring_graph", range, graphConstruction);
1197 
1198  auto colorGraph = rcp(new LWGraph_kokkos(lclGraph, filteredA->getRowMap(), filteredA->getColMap(), "coloring graph"));
1199  Set(currentLevel, "Coloring Graph", colorGraph);
1200  }
1201 
1202 #undef MueLu_runDroppingFunctors_on_dlap
1203 #undef MueLu_runDroppingFunctors_on_dlap_inner
1204 #undef MueLu_runDroppingFunctors_on_A
1205 #undef MueLu_runDroppingFunctors
1206 #undef MueLu_runDroppingFunctorsImpl
1207 
1208  LO dofsPerNode = blkSize;
1209 
1210  Set(currentLevel, "DofsPerNode", dofsPerNode);
1211  Set(currentLevel, "Graph", graph);
1212  Set(currentLevel, "A", filteredA);
1213 
1214  return std::make_tuple(numDropped, boundaryNodes);
1215 }
1216 
1217 } // namespace MueLu
1218 #endif // MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
MueLu::DefaultLocalOrdinal LocalOrdinal
Lightweight MueLu representation of a compressed row storage graph.
void translateOldAlgoParam(const Teuchos::ParameterList &pL, std::string &droppingMethod, bool &useBlocking, std::string &socUsesMatrix, std::string &socUsesMeasure, bool &symmetrizeDroppedGraph, bool &generateColoringGraph, magnitudeType &threshold)
KOKKOS_INLINE_FUNCTION void SetBoundaryNodeMap(const boundary_nodes_type bndry)
Set boolean array indicating which rows correspond to Dirichlet boundaries.
Teuchos::RCP< MultiVector > GetMaterial(Level &currentLevel, size_t spatialDim) const
void setValidator(RCP< const ParameterEntryValidator > const &validator)
Functor that drops boundary nodes for a blockSize &gt; 1 problem.
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
Functor that marks singletons (all off-diagonal entries in a row are dropped) as boundary.
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
#define MueLu_runDroppingFunctors_on_dlap(SoC)
Print more statistics.
#define MueLu_runBoundaryFunctors(...)
LocalOrdinal LO
One-liner description of what is happening.
ParameterList & set(std::string const &name, T &&value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
Functor that marks singletons (all off-diagonal entries in a row are dropped) as boundary.
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Functor that symmetrizes the dropping decisions.
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Functor that drops boundary nodes for a blockSize == 1 problem.
Functor that drops off-rank entries.
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:63
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
typename MueLu::LWGraph_kokkos< LocalOrdinal, GlobalOrdinal, Node >::boundary_nodes_type boundary_nodes_type
void DeclareInput(Level &currentLevel) const
Input.
Functor that fills the filtered matrix while reusing the graph of the matrix before dropping...
Functor for marking nodes as Dirichlet.
Functor that marks diagonal as kept, unless the are already marked as boundary.
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
std::tuple< GlobalOrdinal, boundary_nodes_type > BuildVector(Level &currentLevel) const
#define MueLu_runDroppingFunctors_on_A(SoC)
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
void Build(Level &currentLevel) const
Build an object with this factory.
Functor for marking nodes as Dirichlet based on rowsum.
size_type size() const
#define SET_VALID_ENTRY(name)
Functor for marking nodes as Dirichlet in a block operator.
std::tuple< GlobalOrdinal, boundary_nodes_type > BuildScalar(Level &currentLevel) const
Functor does not reuse the graph of the matrix for a problem with blockSize == 1. ...
static RCP< Matrix > Build(const RCP< const Map > &rowMap, size_t maxNumEntriesPerRow, Xpetra::ProfileType pftype=Xpetra::DynamicProfile)
Exception throws to report errors in the internal logical of the program.
#define TEUCHOS_ASSERT(assertion_test)
ParameterEntry & getEntry(const std::string &name)
#define MueLu_runDroppingFunctors(...)
bool is_null() const