MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_CoalesceDropFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
11 #define MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
12 
13 #include <Kokkos_Core.hpp>
14 #include <KokkosSparse_CrsMatrix.hpp>
15 
16 #include "Xpetra_Matrix.hpp"
17 
19 
20 #include "MueLu_AmalgamationInfo.hpp"
21 #include "MueLu_Exceptions.hpp"
22 #include "MueLu_Level.hpp"
23 #include "MueLu_LWGraph_kokkos.hpp"
24 #include "MueLu_MasterList.hpp"
25 #include "MueLu_Monitor.hpp"
26 #include "MueLu_Utilities.hpp"
27 
28 namespace MueLu {
29 
30 namespace CoalesceDrop_Kokkos_Details { // anonymous
31 
32 template <class LO, class RowType>
33 class ScanFunctor {
34  public:
35  ScanFunctor(RowType rows_)
36  : rows(rows_) {}
37 
38  KOKKOS_INLINE_FUNCTION
39  void operator()(const LO i, LO& upd, const bool& final) const {
40  upd += rows(i);
41  if (final)
42  rows(i) = upd;
43  }
44 
45  private:
46  RowType rows;
47 };
48 
49 template <class LO, class GhostedViewType>
51  private:
52  typedef typename GhostedViewType::value_type SC;
53  typedef Kokkos::ArithTraits<SC> ATS;
54  typedef typename ATS::magnitudeType magnitudeType;
55 
56  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
58 
59  public:
60  ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold)
61  : diag(ghostedDiag)
62  , eps(threshold) {}
63 
64  // Return true if we drop, false if not
65  KOKKOS_FORCEINLINE_FUNCTION
66  bool operator()(LO row, LO col, SC val) const {
67  // We avoid square root by using squared values
68  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
69  auto aij2 = ATS::magnitude(val) * ATS::magnitude(val); // |a_ij|^2
70 
71  return (aij2 <= eps * eps * aiiajj);
72  }
73 };
74 
75 template <class LO, class CoordsType>
77  private:
78  typedef typename CoordsType::value_type SC;
79  typedef Kokkos::ArithTraits<SC> ATS;
80  typedef typename ATS::magnitudeType magnitudeType;
81 
82  public:
83  typedef SC value_type;
84 
85  public:
86  DistanceFunctor(CoordsType coords_)
87  : coords(coords_) {}
88 
89  KOKKOS_INLINE_FUNCTION
90  magnitudeType distance2(LO row, LO col) const {
91  SC d = ATS::zero(), s;
92  for (size_t j = 0; j < coords.extent(1); j++) {
93  s = coords(row, j) - coords(col, j);
94  d += s * s;
95  }
96  return ATS::magnitude(d);
97  }
98 
99  private:
100  CoordsType coords;
101 };
102 
103 template <class LO, class GhostedViewType, class DistanceFunctor>
105  private:
106  typedef typename GhostedViewType::value_type SC;
107  typedef Kokkos::ArithTraits<SC> ATS;
108  typedef typename ATS::magnitudeType magnitudeType;
109 
110  public:
111  DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold)
112  : diag(ghostedLaplDiag)
113  , distFunctor(distFunctor_)
114  , eps(threshold) {}
115 
116  // Return true if we drop, false if not
117  KOKKOS_INLINE_FUNCTION
118  bool operator()(LO row, LO col, SC /* val */) const {
119  // We avoid square root by using squared values
120 
121  // We ignore incoming value of val as we operate on an auxiliary
122  // distance Laplacian matrix
123  typedef typename DistanceFunctor::value_type dSC;
124  typedef Kokkos::ArithTraits<dSC> dATS;
125  auto fval = dATS::one() / distFunctor.distance2(row, col);
126 
127  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
128  auto aij2 = ATS::magnitude(fval) * ATS::magnitude(fval); // |a_ij|^2
129 
130  return (aij2 <= eps * eps * aiiajj);
131  }
132 
133  private:
134  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
137 };
138 
139 template <class SC, class LO, class MatrixType, class BndViewType, class DropFunctorType>
141  private:
142  typedef typename MatrixType::StaticCrsGraphType graph_type;
143  typedef typename graph_type::row_map_type rows_type;
144  typedef typename graph_type::entries_type cols_type;
145  typedef typename MatrixType::values_type vals_type;
146  typedef Kokkos::ArithTraits<SC> ATS;
147  typedef typename ATS::val_type impl_Scalar;
148  typedef Kokkos::ArithTraits<impl_Scalar> impl_ATS;
149  typedef typename ATS::magnitudeType magnitudeType;
150 
151  public:
152  ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_,
153  typename rows_type::non_const_type rows_,
154  typename cols_type::non_const_type colsAux_,
155  typename vals_type::non_const_type valsAux_,
156  bool reuseGraph_, bool lumping_, SC /* threshold_ */,
157  bool aggregationMayCreateDirichlet_)
158  : A(A_)
159  , bndNodes(bndNodes_)
160  , dropFunctor(dropFunctor_)
161  , rows(rows_)
162  , colsAux(colsAux_)
163  , valsAux(valsAux_)
164  , reuseGraph(reuseGraph_)
165  , lumping(lumping_)
166  , aggregationMayCreateDirichlet(aggregationMayCreateDirichlet_) {
167  rowsA = A.graph.row_map;
168  zero = impl_ATS::zero();
169  }
170 
171  KOKKOS_INLINE_FUNCTION
172  void operator()(const LO row, LO& nnz) const {
173  auto rowView = A.rowConst(row);
174  auto length = rowView.length;
175  auto offset = rowsA(row);
176 
177  impl_Scalar diag = zero;
178  LO rownnz = 0;
179  LO diagID = -1;
180  for (decltype(length) colID = 0; colID < length; colID++) {
181  LO col = rowView.colidx(colID);
182  impl_Scalar val = rowView.value(colID);
183 
184  if ((!bndNodes(row) && !dropFunctor(row, col, rowView.value(colID))) || row == col) {
185  colsAux(offset + rownnz) = col;
186 
187  LO valID = (reuseGraph ? colID : rownnz);
188  valsAux(offset + valID) = val;
189  if (row == col)
190  diagID = valID;
191 
192  rownnz++;
193 
194  } else {
195  // Rewrite with zeros (needed for reuseGraph)
196  valsAux(offset + colID) = zero;
197  diag += val;
198  }
199  }
200  // How to assert on the device?
201  // assert(diagIndex != -1);
202  rows(row + 1) = rownnz;
203  // if (lumping && diagID != -1) {
204  if (lumping) {
205  // Add diag to the diagonal
206 
207  // NOTE_KOKKOS: valsAux was allocated with
208  // ViewAllocateWithoutInitializing. This is not a problem here
209  // because we explicitly set this value above.
210  valsAux(offset + diagID) += diag;
211  }
212 
213  // If the only element remaining after filtering is diagonal, mark node as boundary
214  // FIXME: this should really be replaced by the following
215  // if (indices.size() == 1 && indices[0] == row)
216  // boundaryNodes[row] = true;
217  // We do not do it this way now because there is no framework for distinguishing isolated
218  // and boundary nodes in the aggregation algorithms
219  bndNodes(row) |= (rownnz == 1 && aggregationMayCreateDirichlet);
220 
221  nnz += rownnz;
222  }
223 
224  private:
225  MatrixType A;
226  BndViewType bndNodes;
227  DropFunctorType dropFunctor;
228 
230 
231  typename rows_type::non_const_type rows;
232  typename cols_type::non_const_type colsAux;
233  typename vals_type::non_const_type valsAux;
234 
236  bool lumping;
239 };
240 
241 // collect number nonzeros of blkSize rows in nnz_(row+1)
242 template <class MatrixType, class NnzType, class blkSizeType>
244  private:
245  typedef typename MatrixType::ordinal_type LO;
246 
247  public:
248  Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_)
249  : kokkosMatrix(kokkosMatrix_)
250  , nnz(nnz_)
251  , blkSize(blkSize_) {}
252 
253  KOKKOS_INLINE_FUNCTION
254  void operator()(const LO row, LO& totalnnz) const {
255  // the following code is more or less what MergeRows is doing
256  // count nonzero entries in all dof rows associated with node row
257  LO nodeRowMaxNonZeros = 0;
258  for (LO j = 0; j < blkSize; j++) {
259  auto rowView = kokkosMatrix.row(row * blkSize + j);
260  nodeRowMaxNonZeros += rowView.length;
261  }
262  nnz(row + 1) = nodeRowMaxNonZeros;
263  totalnnz += nodeRowMaxNonZeros;
264  }
265 
266  private:
267  MatrixType kokkosMatrix; //< local matrix part
268  NnzType nnz; //< View containing number of nonzeros for current row
269  blkSizeType blkSize; //< block size (or partial block size in strided maps)
270 };
271 
272 // build the dof-based column map containing the local dof ids belonging to blkSize rows in matrix
273 // sort column ids
274 // translate them into (unique) node ids
275 // count the node column ids per node row
276 template <class MatrixType, class NnzType, class blkSizeType, class ColDofType, class Dof2NodeTranslationType, class BdryNodeTypeConst, class BdryNodeType, class boolType>
278  private:
279  typedef typename MatrixType::ordinal_type LO;
280 
281  private:
282  MatrixType kokkosMatrix; //< local matrix part
283  NnzType coldofnnz; //< view containing start and stop indices for subviews
284  blkSizeType blkSize; //< block size (or partial block size in strided maps)
285  ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
286  Dof2NodeTranslationType dof2node; //< view containing the local node id associated with the local dof id
287  NnzType colnodennz; //< view containing number of column nodes for each node row
288  BdryNodeTypeConst dirichletdof; //< view containing with num dofs booleans. True if dof (not necessarily entire node) is dirichlet boundardy dof.
289  BdryNodeType bdrynode; //< view containing with numNodes booleans. True if node is (full) dirichlet boundardy node.
290  boolType usegreedydirichlet; //< boolean for use of greedy Dirichlet (if any dof is Dirichlet, entire node is dirichlet) default false (need all dofs in node to be Dirichlet for node to be Dirichlet)
291 
292  public:
293  Stage1bcVectorFunctor(MatrixType kokkosMatrix_,
294  NnzType coldofnnz_,
295  blkSizeType blkSize_,
296  ColDofType coldofs_,
297  Dof2NodeTranslationType dof2node_,
298  NnzType colnodennz_,
299  BdryNodeTypeConst dirichletdof_,
300  BdryNodeType bdrynode_,
301  boolType usegreedydirichlet_)
302  : kokkosMatrix(kokkosMatrix_)
303  , coldofnnz(coldofnnz_)
304  , blkSize(blkSize_)
305  , coldofs(coldofs_)
306  , dof2node(dof2node_)
307  , colnodennz(colnodennz_)
308  , dirichletdof(dirichletdof_)
309  , bdrynode(bdrynode_)
310  , usegreedydirichlet(usegreedydirichlet_) {
311  }
312 
313  KOKKOS_INLINE_FUNCTION
314  void operator()(const LO rowNode, LO& nnz) const {
315  LO pos = coldofnnz(rowNode);
316  if (usegreedydirichlet) {
317  bdrynode(rowNode) = false;
318  for (LO j = 0; j < blkSize; j++) {
319  auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
320  auto numIndices = rowView.length;
321 
322  // if any dof in the node is Dirichlet
323  if (dirichletdof(rowNode * blkSize + j))
324  bdrynode(rowNode) = true;
325 
326  for (decltype(numIndices) k = 0; k < numIndices; k++) {
327  auto dofID = rowView.colidx(k);
328  coldofs(pos) = dofID;
329  pos++;
330  }
331  }
332  } else {
333  bdrynode(rowNode) = true;
334  for (LO j = 0; j < blkSize; j++) {
335  auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
336  auto numIndices = rowView.length;
337 
338  // if any dof in the node is not Dirichlet
339  if (dirichletdof(rowNode * blkSize + j) == false)
340  bdrynode(rowNode) = false;
341 
342  for (decltype(numIndices) k = 0; k < numIndices; k++) {
343  auto dofID = rowView.colidx(k);
344  coldofs(pos) = dofID;
345  pos++;
346  }
347  }
348  }
349 
350  // sort coldofs
351  LO begin = coldofnnz(rowNode);
352  LO end = coldofnnz(rowNode + 1);
353  LO n = end - begin;
354  for (LO i = 0; i < (n - 1); i++) {
355  for (LO j = 0; j < (n - i - 1); j++) {
356  if (coldofs(j + begin) > coldofs(j + begin + 1)) {
357  LO temp = coldofs(j + begin);
358  coldofs(j + begin) = coldofs(j + begin + 1);
359  coldofs(j + begin + 1) = temp;
360  }
361  }
362  }
363  size_t cnt = 0;
364  LO lastNodeID = -1;
365  for (LO i = 0; i < n; i++) {
366  LO dofID = coldofs(begin + i);
367  LO nodeID = dof2node(dofID);
368  if (nodeID != lastNodeID) {
369  lastNodeID = nodeID;
370  coldofs(begin + cnt) = nodeID;
371  cnt++;
372  }
373  }
374  colnodennz(rowNode + 1) = cnt;
375  nnz += cnt;
376  }
377 };
378 
379 // fill column node id view
380 template <class MatrixType, class ColDofNnzType, class ColDofType, class ColNodeNnzType, class ColNodeType>
382  private:
383  typedef typename MatrixType::ordinal_type LO;
384  typedef typename MatrixType::value_type SC;
385 
386  private:
387  ColDofType coldofs; //< view containing mixed node and dof indices (only input)
388  ColDofNnzType coldofnnz; //< view containing the start and stop indices for subviews (dofs)
389  ColNodeType colnodes; //< view containing the local node ids associated with columns
390  ColNodeNnzType colnodennz; //< view containing start and stop indices for subviews
391 
392  public:
393  Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_)
394  : coldofs(coldofs_)
395  , coldofnnz(coldofnnz_)
396  , colnodes(colnodes_)
397  , colnodennz(colnodennz_) {
398  }
399 
400  KOKKOS_INLINE_FUNCTION
401  void operator()(const LO rowNode) const {
402  auto dofbegin = coldofnnz(rowNode);
403  auto nodebegin = colnodennz(rowNode);
404  auto nodeend = colnodennz(rowNode + 1);
405  auto n = nodeend - nodebegin;
406 
407  for (decltype(nodebegin) i = 0; i < n; i++) {
408  colnodes(nodebegin + i) = coldofs(dofbegin + i);
409  }
410  }
411 };
412 
413 } // namespace CoalesceDrop_Kokkos_Details
414 
415 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
417  RCP<ParameterList> validParamList = rcp(new ParameterList());
418 
419 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
420  SET_VALID_ENTRY("aggregation: drop tol");
421  SET_VALID_ENTRY("aggregation: Dirichlet threshold");
422  SET_VALID_ENTRY("aggregation: drop scheme");
423  SET_VALID_ENTRY("aggregation: dropping may create Dirichlet");
424  SET_VALID_ENTRY("aggregation: greedy Dirichlet");
425  SET_VALID_ENTRY("filtered matrix: use lumping");
426  SET_VALID_ENTRY("filtered matrix: reuse graph");
427  SET_VALID_ENTRY("filtered matrix: reuse eigenvalue");
428  SET_VALID_ENTRY("aggregation: use ml scaling of drop tol");
429  {
430  validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple<std::string>("classical", "distance laplacian"))));
431  }
432 #undef SET_VALID_ENTRY
433  validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
434  validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
435  validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory for Coordinates");
436 
437  return validParamList;
438 }
439 
440 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
442  Input(currentLevel, "A");
443  Input(currentLevel, "UnAmalgamationInfo");
444 
445  const ParameterList& pL = GetParameterList();
446  if (pL.get<std::string>("aggregation: drop scheme") == "distance laplacian")
447  Input(currentLevel, "Coordinates");
448 }
449 
450 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
452  Build(Level& currentLevel) const {
453  FactoryMonitor m(*this, "Build", currentLevel);
454 
455  typedef Teuchos::ScalarTraits<SC> STS;
456  typedef typename STS::magnitudeType MT;
457  const MT zero = Teuchos::ScalarTraits<MT>::zero();
458 
459  auto A = Get<RCP<Matrix>>(currentLevel, "A");
460 
461  /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
462  blkSize is the number of storage blocks that must kept together during the amalgamation process.
463 
464  Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
465 
466  numPDEs = blkSize * storageblocksize.
467 
468  If numPDEs==1
469  Matrix is point storage (classical CRS storage). storageblocksize=1 and blkSize=1
470  No other values makes sense.
471 
472  If numPDEs>1
473  If matrix uses point storage, then storageblocksize=1 and blkSize=numPDEs.
474  If matrix uses block storage, with block size of n, then storageblocksize=n, and blkSize=numPDEs/n.
475  Thus far, only storageblocksize=numPDEs and blkSize=1 has been tested.
476  */
477 
478  TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
479  LO blkSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
480 
481  auto amalInfo = Get<RCP<AmalgamationInfo>>(currentLevel, "UnAmalgamationInfo");
482 
483  const ParameterList& pL = GetParameterList();
484 
485  // Sanity Checking: ML drop tol scaling is not supported in UncoupledAggregation_Kokkos
486  TEUCHOS_TEST_FOR_EXCEPTION(pL.get<bool>("aggregation: use ml scaling of drop tol"), std::invalid_argument, "Option: 'aggregation: use ml scaling of drop tol' is not supported in the Kokkos version of CoalesceDroPFactory");
487 
488  std::string algo = pL.get<std::string>("aggregation: drop scheme");
489 
490  double threshold = pL.get<double>("aggregation: drop tol");
491  GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold
492  << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
493 
494  const typename STS::magnitudeType dirichletThreshold =
495  STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
496 
497  GO numDropped = 0, numTotal = 0;
498 
499  RCP<LWGraph_kokkos> graph;
500  LO dofsPerNode = -1;
501 
502  typedef typename LWGraph_kokkos::boundary_nodes_type boundary_nodes_type;
503  boundary_nodes_type boundaryNodes;
504 
505  RCP<Matrix> filteredA;
506  if (blkSize == 1 && threshold == zero) {
507  // Scalar problem without dropping
508 
509  // Detect and record rows that correspond to Dirichlet boundary conditions
510  boundaryNodes = Utilities::DetectDirichletRows_kokkos(*A, dirichletThreshold);
511 
512  // Trivial LWGraph construction
513  graph = rcp(new LWGraph_kokkos(A->getCrsGraph()->getLocalGraphDevice(), A->getRowMap(), A->getColMap(), "graph of A"));
514  graph->SetBoundaryNodeMap(boundaryNodes);
515 
516  numTotal = A->getLocalNumEntries();
517  dofsPerNode = 1;
518 
519  filteredA = A;
520 
521  } else if (blkSize == 1 && threshold != zero) {
522  // Scalar problem with dropping
523 
524  // Detect and record rows that correspond to Dirichlet boundary conditions
525  boundaryNodes = Utilities::DetectDirichletRows_kokkos(*A, dirichletThreshold);
526 
527  typedef typename Matrix::local_matrix_type local_matrix_type;
528  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
529  typedef typename kokkos_graph_type::row_map_type::non_const_type rows_type;
530  typedef typename kokkos_graph_type::entries_type::non_const_type cols_type;
531  typedef typename local_matrix_type::values_type::non_const_type vals_type;
532 
533  LO numRows = A->getLocalNumRows();
534  local_matrix_type kokkosMatrix = A->getLocalMatrixDevice();
535  auto nnzA = kokkosMatrix.nnz();
536  auto rowsA = kokkosMatrix.graph.row_map;
537 
538  typedef Kokkos::ArithTraits<SC> ATS;
539  typedef typename ATS::val_type impl_Scalar;
540  typedef Kokkos::ArithTraits<impl_Scalar> impl_ATS;
541 
542  bool reuseGraph = pL.get<bool>("filtered matrix: reuse graph");
543  bool lumping = pL.get<bool>("filtered matrix: use lumping");
544  if (lumping)
545  GetOStream(Runtime0) << "Lumping dropped entries" << std::endl;
546 
547  const bool aggregationMayCreateDirichlet = pL.get<bool>("aggregation: dropping may create Dirichlet");
548 
549  // FIXME_KOKKOS: replace by ViewAllocateWithoutInitializing + setting a single value
550  rows_type rows("FA_rows", numRows + 1);
551  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("FA_aux_cols"), nnzA);
552  vals_type valsAux;
553  if (reuseGraph) {
554  SubFactoryMonitor m2(*this, "CopyMatrix", currentLevel);
555 
556  // Share graph with the original matrix
557  filteredA = MatrixFactory::Build(A->getCrsGraph());
558 
559  // Do a no-op fill-complete
560  RCP<ParameterList> fillCompleteParams(new ParameterList);
561  fillCompleteParams->set("No Nonlocal Changes", true);
562  filteredA->fillComplete(fillCompleteParams);
563 
564  // No need to reuseFill, just modify in place
565  valsAux = filteredA->getLocalMatrixDevice().values;
566 
567  } else {
568  // Need an extra array to compress
569  valsAux = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_aux_vals"), nnzA);
570  }
571 
572  LO nnzFA = 0;
573  {
574  if (algo == "classical") {
575  // Construct overlapped matrix diagonal
576  RCP<Vector> ghostedDiag;
577  {
578  kokkosMatrix = local_matrix_type();
579  SubFactoryMonitor m2(*this, "Ghosted diag construction", currentLevel);
580  ghostedDiag = Utilities::GetMatrixOverlappedDiagonal(*A);
581  kokkosMatrix = A->getLocalMatrixDevice();
582  }
583 
584  // Filter out entries
585  {
586  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
587 
588  auto ghostedDiagView = ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
589 
592  scalarFunctor(kokkosMatrix, boundaryNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold, aggregationMayCreateDirichlet);
593 
594  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0, numRows),
595  scalarFunctor, nnzFA);
596  }
597 
598  } else if (algo == "distance laplacian") {
600  auto coords = Get<RCP<doubleMultiVector>>(currentLevel, "Coordinates");
601 
602  auto uniqueMap = A->getRowMap();
603  auto nonUniqueMap = A->getColMap();
604 
605  // Construct ghosted coordinates
606  RCP<const Import> importer;
607  {
608  SubFactoryMonitor m2(*this, "Coords Import construction", currentLevel);
609  importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
610  }
611  RCP<doubleMultiVector> ghostedCoords;
612  {
613  SubFactoryMonitor m2(*this, "Ghosted coords construction", currentLevel);
614  ghostedCoords = Xpetra::MultiVectorFactory<typename Teuchos::ScalarTraits<Scalar>::magnitudeType, LO, GO, NO>::Build(nonUniqueMap, coords->getNumVectors());
615  ghostedCoords->doImport(*coords, *importer, Xpetra::INSERT);
616  }
617 
618  auto ghostedCoordsView = ghostedCoords->getDeviceLocalView(Xpetra::Access::ReadWrite);
620 
621  // Construct Laplacian diagonal
622  RCP<Vector> localLaplDiag;
623  {
624  SubFactoryMonitor m2(*this, "Local Laplacian diag construction", currentLevel);
625 
626  localLaplDiag = VectorFactory::Build(uniqueMap);
627 
628  auto localLaplDiagView = localLaplDiag->getDeviceLocalView(Xpetra::Access::OverwriteAll);
629  auto kokkosGraph = kokkosMatrix.graph;
630 
631  Kokkos::parallel_for(
632  "MueLu:CoalesceDropF:Build:scalar_filter:laplacian_diag", range_type(0, numRows),
633  KOKKOS_LAMBDA(const LO row) {
634  auto rowView = kokkosGraph.rowConst(row);
635  auto length = rowView.length;
636 
637  impl_Scalar d = impl_ATS::zero();
638  for (decltype(length) colID = 0; colID < length; colID++) {
639  auto col = rowView(colID);
640  if (row != col)
641  d += impl_ATS::one() / distFunctor.distance2(row, col);
642  }
643  localLaplDiagView(row, 0) = d;
644  });
645  }
646 
647  // Construct ghosted Laplacian diagonal
648  RCP<Vector> ghostedLaplDiag;
649  {
650  SubFactoryMonitor m2(*this, "Ghosted Laplacian diag construction", currentLevel);
651  ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
652  ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
653  }
654 
655  // Filter out entries
656  {
657  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
658 
659  auto ghostedLaplDiagView = ghostedLaplDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
660 
662  dropFunctor(ghostedLaplDiagView, distFunctor, threshold);
664  scalarFunctor(kokkosMatrix, boundaryNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold, true);
665 
666  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0, numRows),
667  scalarFunctor, nnzFA);
668  }
669  }
670  }
671  numDropped = nnzA - nnzFA;
672 
673  {
674  SubFactoryMonitor m2(*this, "CompressRows", currentLevel);
675 
676  // parallel_scan (exclusive)
677  Kokkos::parallel_scan(
678  "MueLu:CoalesceDropF:Build:scalar_filter:compress_rows", range_type(0, numRows + 1),
679  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
680  update += rows(i);
681  if (final_pass)
682  rows(i) = update;
683  });
684  }
685 
686  // Compress cols (and optionally vals)
687  // We use a trick here: we moved all remaining elements to the beginning
688  // of the original row in the main loop, so we don't need to check for
689  // INVALID here, and just stop when achieving the new number of elements
690  // per row.
691  cols_type cols(Kokkos::ViewAllocateWithoutInitializing("FA_cols"), nnzFA);
692  vals_type vals;
693  if (reuseGraph) {
694  GetOStream(Runtime1) << "reuse matrix graph for filtering (compress matrix columns only)" << std::endl;
695  // Only compress cols
696  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
697 
698  Kokkos::parallel_for(
699  "MueLu:TentativePF:Build:compress_cols", range_type(0, numRows),
700  KOKKOS_LAMBDA(const LO i) {
701  // Is there Kokkos memcpy?
702  LO rowStart = rows(i);
703  LO rowAStart = rowsA(i);
704  size_t rownnz = rows(i + 1) - rows(i);
705  for (size_t j = 0; j < rownnz; j++)
706  cols(rowStart + j) = colsAux(rowAStart + j);
707  });
708  } else {
709  // Compress cols and vals
710  GetOStream(Runtime1) << "new matrix graph for filtering (compress matrix columns and values)" << std::endl;
711  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
712 
713  vals = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_vals"), nnzFA);
714 
715  Kokkos::parallel_for(
716  "MueLu:TentativePF:Build:compress_cols", range_type(0, numRows),
717  KOKKOS_LAMBDA(const LO i) {
718  LO rowStart = rows(i);
719  LO rowAStart = rowsA(i);
720  size_t rownnz = rows(i + 1) - rows(i);
721  for (size_t j = 0; j < rownnz; j++) {
722  cols(rowStart + j) = colsAux(rowAStart + j);
723  vals(rowStart + j) = valsAux(rowAStart + j);
724  }
725  });
726  }
727 
728  kokkos_graph_type kokkosGraph(cols, rows);
729 
730  {
731  SubFactoryMonitor m2(*this, "LWGraph construction", currentLevel);
732 
733  graph = rcp(new LWGraph_kokkos(kokkosGraph, A->getRowMap(), A->getColMap(), "filtered graph of A"));
734  graph->SetBoundaryNodeMap(boundaryNodes);
735  }
736 
737  numTotal = A->getLocalNumEntries();
738 
739  dofsPerNode = 1;
740 
741  if (!reuseGraph) {
742  SubFactoryMonitor m2(*this, "LocalMatrix+FillComplete", currentLevel);
743 
744  local_matrix_type localFA = local_matrix_type("A", numRows, A->getLocalMatrixDevice().numCols(), nnzFA, vals, rows, cols);
745  auto filteredACrs = CrsMatrixFactory::Build(localFA, A->getRowMap(), A->getColMap(), A->getDomainMap(), A->getRangeMap(),
746  A->getCrsGraph()->getImporter(), A->getCrsGraph()->getExporter());
747  filteredA = rcp(new CrsMatrixWrap(filteredACrs));
748  }
749 
750  filteredA->SetFixedBlockSize(A->GetFixedBlockSize());
751 
752  if (pL.get<bool>("filtered matrix: reuse eigenvalue")) {
753  // Reuse max eigenvalue from A
754  // It is unclear what eigenvalue is the best for the smoothing, but we already may have
755  // the D^{-1}A estimate in A, may as well use it.
756  // NOTE: ML does that too
757  filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
758  } else {
759  filteredA->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
760  }
761 
762  } else if (blkSize > 1 && threshold == zero) {
763  // Case 3: block problem without filtering
764  //
765  // FIXME_KOKKOS: this code is completely unoptimized. It really should do
766  // a very simple thing: merge rows and produce nodal graph. But the code
767  // seems very complicated. Can we do better?
768 
769  TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() % blkSize != 0, MueLu::Exceptions::RuntimeError, "MueLu::CoalesceDropFactory: Number of local elements is " << A->getRowMap()->getLocalNumElements() << " but should be a multiply of " << blkSize);
770 
771  const RCP<const Map> rowMap = A->getRowMap();
772  const RCP<const Map> colMap = A->getColMap();
773 
774  // build a node row map (uniqueMap = non-overlapping) and a node column map
775  // (nonUniqueMap = overlapping). The arrays rowTranslation and colTranslation
776  // stored in the AmalgamationInfo class container contain the local node id
777  // given a local dof id. The data is calculated in the AmalgamationFactory and
778  // stored in the variable "UnAmalgamationInfo" (which is of type AmalagamationInfo)
779  const RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
780  const RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
781  Array<LO> rowTranslationArray = *(amalInfo->getRowTranslation()); // TAW should be transform that into a View?
782  Array<LO> colTranslationArray = *(amalInfo->getColTranslation());
783 
784  Kokkos::View<LO*, Kokkos::MemoryUnmanaged>
785  rowTranslationView(rowTranslationArray.getRawPtr(), rowTranslationArray.size());
786  Kokkos::View<LO*, Kokkos::MemoryUnmanaged>
787  colTranslationView(colTranslationArray.getRawPtr(), colTranslationArray.size());
788 
789  // get number of local nodes
790  LO numNodes = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
791  typedef typename Kokkos::View<LocalOrdinal*, typename Node::device_type> id_translation_type;
792  id_translation_type rowTranslation("dofId2nodeId", rowTranslationArray.size());
793  id_translation_type colTranslation("ov_dofId2nodeId", colTranslationArray.size());
794  Kokkos::deep_copy(rowTranslation, rowTranslationView);
795  Kokkos::deep_copy(colTranslation, colTranslationView);
796 
797  // extract striding information
798  blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
799  LocalOrdinal blkId = -1; //< the block id within a strided map or -1 if it is a full block map
800  LocalOrdinal blkPartSize = A->GetFixedBlockSize(); //< stores block size of part blkId (or the full block size)
801  if (A->IsView("stridedMaps") == true) {
802  const RCP<const Map> myMap = A->getRowMap("stridedMaps");
803  const RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
804  TEUCHOS_TEST_FOR_EXCEPTION(strMap.is_null() == true, Exceptions::RuntimeError, "Map is not of type stridedMap");
805  blkSize = Teuchos::as<const LocalOrdinal>(strMap->getFixedBlockSize());
806  blkId = strMap->getStridedBlockId();
807  if (blkId > -1)
808  blkPartSize = Teuchos::as<LocalOrdinal>(strMap->getStridingData()[blkId]);
809  }
810  auto kokkosMatrix = A->getLocalMatrixDevice(); // access underlying kokkos data
811 
812  //
813  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
814  typedef typename kokkos_graph_type::row_map_type row_map_type;
815  // typedef typename row_map_type::HostMirror row_map_type_h;
816  typedef typename kokkos_graph_type::entries_type entries_type;
817 
818  // Stage 1c: get number of dof-nonzeros per blkSize node rows
819  typename row_map_type::non_const_type dofNnz("nnz_map", numNodes + 1);
820  LO numDofCols = 0;
822  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1a", range_type(0, numNodes), stage1aFunctor, numDofCols);
823  // parallel_scan (exclusive)
825  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0, numNodes + 1), scanFunctor);
826 
827  // Detect and record dof rows that correspond to Dirichlet boundary conditions
828  boundary_nodes_type singleEntryRows = Utilities::DetectDirichletRows_kokkos(*A, dirichletThreshold);
829 
830  typename entries_type::non_const_type dofcols("dofcols", numDofCols /*dofNnz(numNodes)*/); // why does dofNnz(numNodes) work? should be a parallel reduce, i guess
831 
832  // we have dofcols and dofids from Stage1dVectorFunctor
833  LO numNodeCols = 0;
834  typename row_map_type::non_const_type rows("nnz_nodemap", numNodes + 1);
835  typename boundary_nodes_type::non_const_type bndNodes("boundaryNodes", numNodes);
836 
837  CoalesceDrop_Kokkos_Details::Stage1bcVectorFunctor<decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize), decltype(dofcols), decltype(colTranslation), decltype(singleEntryRows), decltype(bndNodes), bool> stage1bcFunctor(kokkosMatrix, dofNnz, blkPartSize, dofcols, colTranslation, rows, singleEntryRows, bndNodes, pL.get<bool>("aggregation: greedy Dirichlet"));
838  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0, numNodes), stage1bcFunctor, numNodeCols);
839 
840  // parallel_scan (exclusive)
842  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0, numNodes + 1), scanNodeFunctor);
843 
844  // create column node view
845  typename entries_type::non_const_type cols("nodecols", numNodeCols);
846 
848  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0, numNodes), stage1dFunctor);
849  kokkos_graph_type kokkosGraph(cols, rows);
850 
851  // create LW graph
852  graph = rcp(new LWGraph_kokkos(kokkosGraph, uniqueMap, nonUniqueMap, "amalgamated graph of A"));
853 
854  boundaryNodes = bndNodes;
855  graph->SetBoundaryNodeMap(boundaryNodes);
856  numTotal = A->getLocalNumEntries();
857 
858  dofsPerNode = blkSize;
859 
860  filteredA = A;
861 
862  } else {
863  TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "MueLu: CoalesceDropFactory_kokkos: Block filtering is not implemented");
864  }
865 
866  if (GetVerbLevel() & Statistics1) {
867  GO numLocalBoundaryNodes = 0;
868  GO numGlobalBoundaryNodes = 0;
869 
870  Kokkos::parallel_reduce(
871  "MueLu:CoalesceDropF:Build:bnd", range_type(0, boundaryNodes.extent(0)),
872  KOKKOS_LAMBDA(const LO i, GO& n) {
873  if (boundaryNodes(i))
874  n++;
875  },
876  numLocalBoundaryNodes);
877 
878  auto comm = A->getRowMap()->getComm();
879  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
880  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
881  }
882 
883  if ((GetVerbLevel() & Statistics1) && threshold != zero) {
884  auto comm = A->getRowMap()->getComm();
885 
886  GO numGlobalTotal, numGlobalDropped;
887  MueLu_sumAll(comm, numTotal, numGlobalTotal);
888  MueLu_sumAll(comm, numDropped, numGlobalDropped);
889 
890  if (numGlobalTotal != 0) {
891  GetOStream(Statistics1) << "Number of dropped entries: "
892  << numGlobalDropped << "/" << numGlobalTotal
893  << " (" << 100 * Teuchos::as<double>(numGlobalDropped) / Teuchos::as<double>(numGlobalTotal) << "%)" << std::endl;
894  }
895  }
896 
897  Set(currentLevel, "DofsPerNode", dofsPerNode);
898  Set(currentLevel, "Graph", graph);
899  Set(currentLevel, "A", filteredA);
900 }
901 } // namespace MueLu
902 #endif // MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
KOKKOS_INLINE_FUNCTION void operator()(const LO rowNode) const
#define MueLu_sumAll(rcpComm, in, out)
MueLu::DefaultLocalOrdinal LocalOrdinal
KOKKOS_INLINE_FUNCTION void operator()(const LO row, LO &nnz) const
Lightweight MueLu representation of a compressed row storage graph.
Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_)
ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_, typename rows_type::non_const_type rows_, typename cols_type::non_const_type colsAux_, typename vals_type::non_const_type valsAux_, bool reuseGraph_, bool lumping_, SC, bool aggregationMayCreateDirichlet_)
KOKKOS_INLINE_FUNCTION void SetBoundaryNodeMap(const boundary_nodes_type bndry)
Set boolean array indicating which rows correspond to Dirichlet boundaries.
Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_)
typename std::conditional< OnHost, typename local_graph_device_type::HostMirror, local_graph_device_type >::type local_graph_type
void setValidator(RCP< const ParameterEntryValidator > const &validator)
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
KOKKOS_FORCEINLINE_FUNCTION bool operator()(LO row, LO col, SC val) const
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold)
KOKKOS_INLINE_FUNCTION magnitudeType distance2(LO row, LO col) const
Print more statistics.
LocalOrdinal LO
One-liner description of what is happening.
ParameterList & set(std::string const &name, T &&value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
KOKKOS_INLINE_FUNCTION void operator()(const LO i, LO &upd, const bool &final) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:63
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold)
void DeclareInput(Level &currentLevel) const
Input.
KOKKOS_INLINE_FUNCTION void operator()(const LO row, LO &totalnnz) const
Stage1bcVectorFunctor(MatrixType kokkosMatrix_, NnzType coldofnnz_, blkSizeType blkSize_, ColDofType coldofs_, Dof2NodeTranslationType dof2node_, NnzType colnodennz_, BdryNodeTypeConst dirichletdof_, BdryNodeType bdrynode_, boolType usegreedydirichlet_)
static RCP< Vector > GetMatrixOverlappedDiagonal(const Matrix &A)
Extract Overlapped Matrix Diagonal.
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
void Build(Level &currentLevel) const
Build an object with this factory.
size_type size() const
Scalar SC
#define SET_VALID_ENTRY(name)
Node NO
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)
ParameterEntry & getEntry(const std::string &name)
TransListIter end
static Kokkos::View< bool *, typename NO::device_type::memory_space > DetectDirichletRows_kokkos(const Matrix &A, const Magnitude &tol=Teuchos::ScalarTraits< typename Teuchos::ScalarTraits< SC >::magnitudeType >::zero(), const bool count_twos_as_dirichlet=false)
Detect Dirichlet rows.
KOKKOS_INLINE_FUNCTION void operator()(const LO rowNode, LO &nnz) const
bool is_null() const