MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_CoalesceDropFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
48 
49 #include <Kokkos_Core.hpp>
50 #include <KokkosSparse_CrsMatrix.hpp>
51 
52 #include "Xpetra_Matrix.hpp"
53 
55 
56 #include "MueLu_AmalgamationInfo.hpp"
57 #include "MueLu_Exceptions.hpp"
58 #include "MueLu_Level.hpp"
59 #include "MueLu_LWGraph_kokkos.hpp"
60 #include "MueLu_MasterList.hpp"
61 #include "MueLu_Monitor.hpp"
62 #include "MueLu_Utilities.hpp"
63 
64 namespace MueLu {
65 
66 namespace CoalesceDrop_Kokkos_Details { // anonymous
67 
68 template <class LO, class RowType>
69 class ScanFunctor {
70  public:
71  ScanFunctor(RowType rows_)
72  : rows(rows_) {}
73 
74  KOKKOS_INLINE_FUNCTION
75  void operator()(const LO i, LO& upd, const bool& final) const {
76  upd += rows(i);
77  if (final)
78  rows(i) = upd;
79  }
80 
81  private:
82  RowType rows;
83 };
84 
85 template <class LO, class GhostedViewType>
87  private:
88  typedef typename GhostedViewType::value_type SC;
89  typedef Kokkos::ArithTraits<SC> ATS;
90  typedef typename ATS::magnitudeType magnitudeType;
91 
92  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
94 
95  public:
96  ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold)
97  : diag(ghostedDiag)
98  , eps(threshold) {}
99 
100  // Return true if we drop, false if not
101  KOKKOS_FORCEINLINE_FUNCTION
102  bool operator()(LO row, LO col, SC val) const {
103  // We avoid square root by using squared values
104  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
105  auto aij2 = ATS::magnitude(val) * ATS::magnitude(val); // |a_ij|^2
106 
107  return (aij2 <= eps * eps * aiiajj);
108  }
109 };
110 
111 template <class LO, class CoordsType>
113  private:
114  typedef typename CoordsType::value_type SC;
115  typedef Kokkos::ArithTraits<SC> ATS;
116  typedef typename ATS::magnitudeType magnitudeType;
117 
118  public:
119  typedef SC value_type;
120 
121  public:
122  DistanceFunctor(CoordsType coords_)
123  : coords(coords_) {}
124 
125  KOKKOS_INLINE_FUNCTION
126  magnitudeType distance2(LO row, LO col) const {
127  SC d = ATS::zero(), s;
128  for (size_t j = 0; j < coords.extent(1); j++) {
129  s = coords(row, j) - coords(col, j);
130  d += s * s;
131  }
132  return ATS::magnitude(d);
133  }
134 
135  private:
136  CoordsType coords;
137 };
138 
139 template <class LO, class GhostedViewType, class DistanceFunctor>
141  private:
142  typedef typename GhostedViewType::value_type SC;
143  typedef Kokkos::ArithTraits<SC> ATS;
144  typedef typename ATS::magnitudeType magnitudeType;
145 
146  public:
147  DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold)
148  : diag(ghostedLaplDiag)
149  , distFunctor(distFunctor_)
150  , eps(threshold) {}
151 
152  // Return true if we drop, false if not
153  KOKKOS_INLINE_FUNCTION
154  bool operator()(LO row, LO col, SC /* val */) const {
155  // We avoid square root by using squared values
156 
157  // We ignore incoming value of val as we operate on an auxiliary
158  // distance Laplacian matrix
159  typedef typename DistanceFunctor::value_type dSC;
160  typedef Kokkos::ArithTraits<dSC> dATS;
161  auto fval = dATS::one() / distFunctor.distance2(row, col);
162 
163  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
164  auto aij2 = ATS::magnitude(fval) * ATS::magnitude(fval); // |a_ij|^2
165 
166  return (aij2 <= eps * eps * aiiajj);
167  }
168 
169  private:
170  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
173 };
174 
175 template <class SC, class LO, class MatrixType, class BndViewType, class DropFunctorType>
177  private:
178  typedef typename MatrixType::StaticCrsGraphType graph_type;
179  typedef typename graph_type::row_map_type rows_type;
180  typedef typename graph_type::entries_type cols_type;
181  typedef typename MatrixType::values_type vals_type;
182  typedef Kokkos::ArithTraits<SC> ATS;
183  typedef typename ATS::val_type impl_Scalar;
184  typedef Kokkos::ArithTraits<impl_Scalar> impl_ATS;
185  typedef typename ATS::magnitudeType magnitudeType;
186 
187  public:
188  ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_,
189  typename rows_type::non_const_type rows_,
190  typename cols_type::non_const_type colsAux_,
191  typename vals_type::non_const_type valsAux_,
192  bool reuseGraph_, bool lumping_, SC /* threshold_ */,
193  bool aggregationMayCreateDirichlet_)
194  : A(A_)
195  , bndNodes(bndNodes_)
196  , dropFunctor(dropFunctor_)
197  , rows(rows_)
198  , colsAux(colsAux_)
199  , valsAux(valsAux_)
200  , reuseGraph(reuseGraph_)
201  , lumping(lumping_)
202  , aggregationMayCreateDirichlet(aggregationMayCreateDirichlet_) {
203  rowsA = A.graph.row_map;
204  zero = impl_ATS::zero();
205  }
206 
207  KOKKOS_INLINE_FUNCTION
208  void operator()(const LO row, LO& nnz) const {
209  auto rowView = A.rowConst(row);
210  auto length = rowView.length;
211  auto offset = rowsA(row);
212 
213  impl_Scalar diag = zero;
214  LO rownnz = 0;
215  LO diagID = -1;
216  for (decltype(length) colID = 0; colID < length; colID++) {
217  LO col = rowView.colidx(colID);
218  impl_Scalar val = rowView.value(colID);
219 
220  if (!dropFunctor(row, col, rowView.value(colID)) || row == col) {
221  colsAux(offset + rownnz) = col;
222 
223  LO valID = (reuseGraph ? colID : rownnz);
224  valsAux(offset + valID) = val;
225  if (row == col)
226  diagID = valID;
227 
228  rownnz++;
229 
230  } else {
231  // Rewrite with zeros (needed for reuseGraph)
232  valsAux(offset + colID) = zero;
233  diag += val;
234  }
235  }
236  // How to assert on the device?
237  // assert(diagIndex != -1);
238  rows(row + 1) = rownnz;
239  // if (lumping && diagID != -1) {
240  if (lumping) {
241  // Add diag to the diagonal
242 
243  // NOTE_KOKKOS: valsAux was allocated with
244  // ViewAllocateWithoutInitializing. This is not a problem here
245  // because we explicitly set this value above.
246  valsAux(offset + diagID) += diag;
247  }
248 
249  // If the only element remaining after filtering is diagonal, mark node as boundary
250  // FIXME: this should really be replaced by the following
251  // if (indices.size() == 1 && indices[0] == row)
252  // boundaryNodes[row] = true;
253  // We do not do it this way now because there is no framework for distinguishing isolated
254  // and boundary nodes in the aggregation algorithms
255  bndNodes(row) = (rownnz == 1 && aggregationMayCreateDirichlet);
256 
257  nnz += rownnz;
258  }
259 
260  private:
261  MatrixType A;
262  BndViewType bndNodes;
263  DropFunctorType dropFunctor;
264 
266 
267  typename rows_type::non_const_type rows;
268  typename cols_type::non_const_type colsAux;
269  typename vals_type::non_const_type valsAux;
270 
272  bool lumping;
275 };
276 
277 // collect number nonzeros of blkSize rows in nnz_(row+1)
278 template <class MatrixType, class NnzType, class blkSizeType>
280  private:
281  typedef typename MatrixType::ordinal_type LO;
282 
283  public:
284  Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_)
285  : kokkosMatrix(kokkosMatrix_)
286  , nnz(nnz_)
287  , blkSize(blkSize_) {}
288 
289  KOKKOS_INLINE_FUNCTION
290  void operator()(const LO row, LO& totalnnz) const {
291  // the following code is more or less what MergeRows is doing
292  // count nonzero entries in all dof rows associated with node row
293  LO nodeRowMaxNonZeros = 0;
294  for (LO j = 0; j < blkSize; j++) {
295  auto rowView = kokkosMatrix.row(row * blkSize + j);
296  nodeRowMaxNonZeros += rowView.length;
297  }
298  nnz(row + 1) = nodeRowMaxNonZeros;
299  totalnnz += nodeRowMaxNonZeros;
300  }
301 
302  private:
303  MatrixType kokkosMatrix; //< local matrix part
304  NnzType nnz; //< View containing number of nonzeros for current row
305  blkSizeType blkSize; //< block size (or partial block size in strided maps)
306 };
307 
308 // build the dof-based column map containing the local dof ids belonging to blkSize rows in matrix
309 // sort column ids
310 // translate them into (unique) node ids
311 // count the node column ids per node row
312 template <class MatrixType, class NnzType, class blkSizeType, class ColDofType, class Dof2NodeTranslationType, class BdryNodeTypeConst, class BdryNodeType, class boolType>
314  private:
315  typedef typename MatrixType::ordinal_type LO;
316 
317  private:
318  MatrixType kokkosMatrix; //< local matrix part
319  NnzType coldofnnz; //< view containing start and stop indices for subviews
320  blkSizeType blkSize; //< block size (or partial block size in strided maps)
321  ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
322  Dof2NodeTranslationType dof2node; //< view containing the local node id associated with the local dof id
323  NnzType colnodennz; //< view containing number of column nodes for each node row
324  BdryNodeTypeConst dirichletdof; //< view containing with num dofs booleans. True if dof (not necessarily entire node) is dirichlet boundardy dof.
325  BdryNodeType bdrynode; //< view containing with numNodes booleans. True if node is (full) dirichlet boundardy node.
326  boolType usegreedydirichlet; //< boolean for use of greedy Dirichlet (if any dof is Dirichlet, entire node is dirichlet) default false (need all dofs in node to be Dirichlet for node to be Dirichlet)
327 
328  public:
329  Stage1bcVectorFunctor(MatrixType kokkosMatrix_,
330  NnzType coldofnnz_,
331  blkSizeType blkSize_,
332  ColDofType coldofs_,
333  Dof2NodeTranslationType dof2node_,
334  NnzType colnodennz_,
335  BdryNodeTypeConst dirichletdof_,
336  BdryNodeType bdrynode_,
337  boolType usegreedydirichlet_)
338  : kokkosMatrix(kokkosMatrix_)
339  , coldofnnz(coldofnnz_)
340  , blkSize(blkSize_)
341  , coldofs(coldofs_)
342  , dof2node(dof2node_)
343  , colnodennz(colnodennz_)
344  , dirichletdof(dirichletdof_)
345  , bdrynode(bdrynode_)
346  , usegreedydirichlet(usegreedydirichlet_) {
347  }
348 
349  KOKKOS_INLINE_FUNCTION
350  void operator()(const LO rowNode, LO& nnz) const {
351  LO pos = coldofnnz(rowNode);
352  if (usegreedydirichlet) {
353  bdrynode(rowNode) = false;
354  for (LO j = 0; j < blkSize; j++) {
355  auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
356  auto numIndices = rowView.length;
357 
358  // if any dof in the node is Dirichlet
359  if (dirichletdof(rowNode * blkSize + j))
360  bdrynode(rowNode) = true;
361 
362  for (decltype(numIndices) k = 0; k < numIndices; k++) {
363  auto dofID = rowView.colidx(k);
364  coldofs(pos) = dofID;
365  pos++;
366  }
367  }
368  } else {
369  bdrynode(rowNode) = true;
370  for (LO j = 0; j < blkSize; j++) {
371  auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
372  auto numIndices = rowView.length;
373 
374  // if any dof in the node is not Dirichlet
375  if (dirichletdof(rowNode * blkSize + j) == false)
376  bdrynode(rowNode) = false;
377 
378  for (decltype(numIndices) k = 0; k < numIndices; k++) {
379  auto dofID = rowView.colidx(k);
380  coldofs(pos) = dofID;
381  pos++;
382  }
383  }
384  }
385 
386  // sort coldofs
387  LO begin = coldofnnz(rowNode);
388  LO end = coldofnnz(rowNode + 1);
389  LO n = end - begin;
390  for (LO i = 0; i < (n - 1); i++) {
391  for (LO j = 0; j < (n - i - 1); j++) {
392  if (coldofs(j + begin) > coldofs(j + begin + 1)) {
393  LO temp = coldofs(j + begin);
394  coldofs(j + begin) = coldofs(j + begin + 1);
395  coldofs(j + begin + 1) = temp;
396  }
397  }
398  }
399  size_t cnt = 0;
400  LO lastNodeID = -1;
401  for (LO i = 0; i < n; i++) {
402  LO dofID = coldofs(begin + i);
403  LO nodeID = dof2node(dofID);
404  if (nodeID != lastNodeID) {
405  lastNodeID = nodeID;
406  coldofs(begin + cnt) = nodeID;
407  cnt++;
408  }
409  }
410  colnodennz(rowNode + 1) = cnt;
411  nnz += cnt;
412  }
413 };
414 
415 // fill column node id view
416 template <class MatrixType, class ColDofNnzType, class ColDofType, class ColNodeNnzType, class ColNodeType>
418  private:
419  typedef typename MatrixType::ordinal_type LO;
420  typedef typename MatrixType::value_type SC;
421 
422  private:
423  ColDofType coldofs; //< view containing mixed node and dof indices (only input)
424  ColDofNnzType coldofnnz; //< view containing the start and stop indices for subviews (dofs)
425  ColNodeType colnodes; //< view containing the local node ids associated with columns
426  ColNodeNnzType colnodennz; //< view containing start and stop indices for subviews
427 
428  public:
429  Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_)
430  : coldofs(coldofs_)
431  , coldofnnz(coldofnnz_)
432  , colnodes(colnodes_)
433  , colnodennz(colnodennz_) {
434  }
435 
436  KOKKOS_INLINE_FUNCTION
437  void operator()(const LO rowNode) const {
438  auto dofbegin = coldofnnz(rowNode);
439  auto nodebegin = colnodennz(rowNode);
440  auto nodeend = colnodennz(rowNode + 1);
441  auto n = nodeend - nodebegin;
442 
443  for (decltype(nodebegin) i = 0; i < n; i++) {
444  colnodes(nodebegin + i) = coldofs(dofbegin + i);
445  }
446  }
447 };
448 
449 } // namespace CoalesceDrop_Kokkos_Details
450 
451 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
453  RCP<ParameterList> validParamList = rcp(new ParameterList());
454 
455 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
456  SET_VALID_ENTRY("aggregation: drop tol");
457  SET_VALID_ENTRY("aggregation: Dirichlet threshold");
458  SET_VALID_ENTRY("aggregation: drop scheme");
459  SET_VALID_ENTRY("aggregation: dropping may create Dirichlet");
460  SET_VALID_ENTRY("aggregation: greedy Dirichlet");
461  SET_VALID_ENTRY("filtered matrix: use lumping");
462  SET_VALID_ENTRY("filtered matrix: reuse graph");
463  SET_VALID_ENTRY("filtered matrix: reuse eigenvalue");
464  SET_VALID_ENTRY("aggregation: use ml scaling of drop tol");
465  {
466  validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple<std::string>("classical", "distance laplacian"))));
467  }
468 #undef SET_VALID_ENTRY
469  validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
470  validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
471  validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory for Coordinates");
472 
473  return validParamList;
474 }
475 
476 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
478  Input(currentLevel, "A");
479  Input(currentLevel, "UnAmalgamationInfo");
480 
481  const ParameterList& pL = GetParameterList();
482  if (pL.get<std::string>("aggregation: drop scheme") == "distance laplacian")
483  Input(currentLevel, "Coordinates");
484 }
485 
486 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
488  Build(Level& currentLevel) const {
489  FactoryMonitor m(*this, "Build", currentLevel);
490 
491  typedef Teuchos::ScalarTraits<SC> STS;
492  typedef typename STS::magnitudeType MT;
493  const MT zero = Teuchos::ScalarTraits<MT>::zero();
494 
495  auto A = Get<RCP<Matrix>>(currentLevel, "A");
496 
497  /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
498  blkSize is the number of storage blocks that must kept together during the amalgamation process.
499 
500  Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
501 
502  numPDEs = blkSize * storageblocksize.
503 
504  If numPDEs==1
505  Matrix is point storage (classical CRS storage). storageblocksize=1 and blkSize=1
506  No other values makes sense.
507 
508  If numPDEs>1
509  If matrix uses point storage, then storageblocksize=1 and blkSize=numPDEs.
510  If matrix uses block storage, with block size of n, then storageblocksize=n, and blkSize=numPDEs/n.
511  Thus far, only storageblocksize=numPDEs and blkSize=1 has been tested.
512  */
513 
514  TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
515  LO blkSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
516 
517  auto amalInfo = Get<RCP<AmalgamationInfo>>(currentLevel, "UnAmalgamationInfo");
518 
519  const ParameterList& pL = GetParameterList();
520 
521  // Sanity Checking: ML drop tol scaling is not supported in UncoupledAggregation_Kokkos
522  TEUCHOS_TEST_FOR_EXCEPTION(pL.get<bool>("aggregation: use ml scaling of drop tol"), std::invalid_argument, "Option: 'aggregation: use ml scaling of drop tol' is not supported in the Kokkos version of CoalesceDroPFactory");
523 
524  std::string algo = pL.get<std::string>("aggregation: drop scheme");
525 
526  double threshold = pL.get<double>("aggregation: drop tol");
527  GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold
528  << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
529 
530  const typename STS::magnitudeType dirichletThreshold =
531  STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
532 
533  GO numDropped = 0, numTotal = 0;
534 
535  RCP<LWGraph_kokkos> graph;
536  LO dofsPerNode = -1;
537 
538  typedef typename LWGraph_kokkos::boundary_nodes_type boundary_nodes_type;
539  boundary_nodes_type boundaryNodes;
540 
541  RCP<Matrix> filteredA;
542  if (blkSize == 1 && threshold == zero) {
543  // Scalar problem without dropping
544 
545  // Detect and record rows that correspond to Dirichlet boundary conditions
546  boundaryNodes = Utilities::DetectDirichletRows_kokkos(*A, dirichletThreshold);
547 
548  // Trivial LWGraph construction
549  graph = rcp(new LWGraph_kokkos(A->getCrsGraph()->getLocalGraphDevice(), A->getRowMap(), A->getColMap(), "graph of A"));
550  graph->SetBoundaryNodeMap(boundaryNodes);
551 
552  numTotal = A->getLocalNumEntries();
553  dofsPerNode = 1;
554 
555  filteredA = A;
556 
557  } else if (blkSize == 1 && threshold != zero) {
558  // Scalar problem with dropping
559 
560  typedef typename Matrix::local_matrix_type local_matrix_type;
561  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
562  typedef typename kokkos_graph_type::row_map_type::non_const_type rows_type;
563  typedef typename kokkos_graph_type::entries_type::non_const_type cols_type;
564  typedef typename local_matrix_type::values_type::non_const_type vals_type;
565 
566  LO numRows = A->getLocalNumRows();
567  local_matrix_type kokkosMatrix = A->getLocalMatrixDevice();
568  auto nnzA = kokkosMatrix.nnz();
569  auto rowsA = kokkosMatrix.graph.row_map;
570 
571  typedef Kokkos::ArithTraits<SC> ATS;
572  typedef typename ATS::val_type impl_Scalar;
573  typedef Kokkos::ArithTraits<impl_Scalar> impl_ATS;
574 
575  bool reuseGraph = pL.get<bool>("filtered matrix: reuse graph");
576  bool lumping = pL.get<bool>("filtered matrix: use lumping");
577  if (lumping)
578  GetOStream(Runtime0) << "Lumping dropped entries" << std::endl;
579 
580  const bool aggregationMayCreateDirichlet = pL.get<bool>("aggregation: dropping may create Dirichlet");
581 
582  // FIXME_KOKKOS: replace by ViewAllocateWithoutInitializing + setting a single value
583  rows_type rows("FA_rows", numRows + 1);
584  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("FA_aux_cols"), nnzA);
585  vals_type valsAux;
586  if (reuseGraph) {
587  SubFactoryMonitor m2(*this, "CopyMatrix", currentLevel);
588 
589  // Share graph with the original matrix
590  filteredA = MatrixFactory::Build(A->getCrsGraph());
591 
592  // Do a no-op fill-complete
593  RCP<ParameterList> fillCompleteParams(new ParameterList);
594  fillCompleteParams->set("No Nonlocal Changes", true);
595  filteredA->fillComplete(fillCompleteParams);
596 
597  // No need to reuseFill, just modify in place
598  valsAux = filteredA->getLocalMatrixDevice().values;
599 
600  } else {
601  // Need an extra array to compress
602  valsAux = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_aux_vals"), nnzA);
603  }
604 
605  typename boundary_nodes_type::non_const_type bndNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows);
606 
607  LO nnzFA = 0;
608  {
609  if (algo == "classical") {
610  // Construct overlapped matrix diagonal
611  RCP<Vector> ghostedDiag;
612  {
613  kokkosMatrix = local_matrix_type();
614  SubFactoryMonitor m2(*this, "Ghosted diag construction", currentLevel);
615  ghostedDiag = Utilities::GetMatrixOverlappedDiagonal(*A);
616  kokkosMatrix = A->getLocalMatrixDevice();
617  }
618 
619  // Filter out entries
620  {
621  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
622 
623  auto ghostedDiagView = ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
624 
627  scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold, aggregationMayCreateDirichlet);
628 
629  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0, numRows),
630  scalarFunctor, nnzFA);
631  }
632 
633  } else if (algo == "distance laplacian") {
635  auto coords = Get<RCP<doubleMultiVector>>(currentLevel, "Coordinates");
636 
637  auto uniqueMap = A->getRowMap();
638  auto nonUniqueMap = A->getColMap();
639 
640  // Construct ghosted coordinates
641  RCP<const Import> importer;
642  {
643  SubFactoryMonitor m2(*this, "Coords Import construction", currentLevel);
644  importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
645  }
646  RCP<doubleMultiVector> ghostedCoords;
647  {
648  SubFactoryMonitor m2(*this, "Ghosted coords construction", currentLevel);
649  ghostedCoords = Xpetra::MultiVectorFactory<typename Teuchos::ScalarTraits<Scalar>::magnitudeType, LO, GO, NO>::Build(nonUniqueMap, coords->getNumVectors());
650  ghostedCoords->doImport(*coords, *importer, Xpetra::INSERT);
651  }
652 
653  auto ghostedCoordsView = ghostedCoords->getDeviceLocalView(Xpetra::Access::ReadWrite);
655 
656  // Construct Laplacian diagonal
657  RCP<Vector> localLaplDiag;
658  {
659  SubFactoryMonitor m2(*this, "Local Laplacian diag construction", currentLevel);
660 
661  localLaplDiag = VectorFactory::Build(uniqueMap);
662 
663  auto localLaplDiagView = localLaplDiag->getDeviceLocalView(Xpetra::Access::OverwriteAll);
664  auto kokkosGraph = kokkosMatrix.graph;
665 
666  Kokkos::parallel_for(
667  "MueLu:CoalesceDropF:Build:scalar_filter:laplacian_diag", range_type(0, numRows),
668  KOKKOS_LAMBDA(const LO row) {
669  auto rowView = kokkosGraph.rowConst(row);
670  auto length = rowView.length;
671 
672  impl_Scalar d = impl_ATS::zero();
673  for (decltype(length) colID = 0; colID < length; colID++) {
674  auto col = rowView(colID);
675  if (row != col)
676  d += impl_ATS::one() / distFunctor.distance2(row, col);
677  }
678  localLaplDiagView(row, 0) = d;
679  });
680  }
681 
682  // Construct ghosted Laplacian diagonal
683  RCP<Vector> ghostedLaplDiag;
684  {
685  SubFactoryMonitor m2(*this, "Ghosted Laplacian diag construction", currentLevel);
686  ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
687  ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
688  }
689 
690  // Filter out entries
691  {
692  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
693 
694  auto ghostedLaplDiagView = ghostedLaplDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
695 
697  dropFunctor(ghostedLaplDiagView, distFunctor, threshold);
699  scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold, true);
700 
701  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0, numRows),
702  scalarFunctor, nnzFA);
703  }
704  }
705  }
706  numDropped = nnzA - nnzFA;
707 
708  boundaryNodes = bndNodes;
709 
710  {
711  SubFactoryMonitor m2(*this, "CompressRows", currentLevel);
712 
713  // parallel_scan (exclusive)
714  Kokkos::parallel_scan(
715  "MueLu:CoalesceDropF:Build:scalar_filter:compress_rows", range_type(0, numRows + 1),
716  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
717  update += rows(i);
718  if (final_pass)
719  rows(i) = update;
720  });
721  }
722 
723  // Compress cols (and optionally vals)
724  // We use a trick here: we moved all remaining elements to the beginning
725  // of the original row in the main loop, so we don't need to check for
726  // INVALID here, and just stop when achieving the new number of elements
727  // per row.
728  cols_type cols(Kokkos::ViewAllocateWithoutInitializing("FA_cols"), nnzFA);
729  vals_type vals;
730  if (reuseGraph) {
731  GetOStream(Runtime1) << "reuse matrix graph for filtering (compress matrix columns only)" << std::endl;
732  // Only compress cols
733  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
734 
735  Kokkos::parallel_for(
736  "MueLu:TentativePF:Build:compress_cols", range_type(0, numRows),
737  KOKKOS_LAMBDA(const LO i) {
738  // Is there Kokkos memcpy?
739  LO rowStart = rows(i);
740  LO rowAStart = rowsA(i);
741  size_t rownnz = rows(i + 1) - rows(i);
742  for (size_t j = 0; j < rownnz; j++)
743  cols(rowStart + j) = colsAux(rowAStart + j);
744  });
745  } else {
746  // Compress cols and vals
747  GetOStream(Runtime1) << "new matrix graph for filtering (compress matrix columns and values)" << std::endl;
748  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
749 
750  vals = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_vals"), nnzFA);
751 
752  Kokkos::parallel_for(
753  "MueLu:TentativePF:Build:compress_cols", range_type(0, numRows),
754  KOKKOS_LAMBDA(const LO i) {
755  LO rowStart = rows(i);
756  LO rowAStart = rowsA(i);
757  size_t rownnz = rows(i + 1) - rows(i);
758  for (size_t j = 0; j < rownnz; j++) {
759  cols(rowStart + j) = colsAux(rowAStart + j);
760  vals(rowStart + j) = valsAux(rowAStart + j);
761  }
762  });
763  }
764 
765  kokkos_graph_type kokkosGraph(cols, rows);
766 
767  {
768  SubFactoryMonitor m2(*this, "LWGraph construction", currentLevel);
769 
770  graph = rcp(new LWGraph_kokkos(kokkosGraph, A->getRowMap(), A->getColMap(), "filtered graph of A"));
771  graph->SetBoundaryNodeMap(boundaryNodes);
772  }
773 
774  numTotal = A->getLocalNumEntries();
775 
776  dofsPerNode = 1;
777 
778  if (!reuseGraph) {
779  SubFactoryMonitor m2(*this, "LocalMatrix+FillComplete", currentLevel);
780 
781  local_matrix_type localFA = local_matrix_type("A", numRows, A->getLocalMatrixDevice().numCols(), nnzFA, vals, rows, cols);
782  auto filteredACrs = CrsMatrixFactory::Build(localFA, A->getRowMap(), A->getColMap(), A->getDomainMap(), A->getRangeMap(),
783  A->getCrsGraph()->getImporter(), A->getCrsGraph()->getExporter());
784  filteredA = rcp(new CrsMatrixWrap(filteredACrs));
785  }
786 
787  filteredA->SetFixedBlockSize(A->GetFixedBlockSize());
788 
789  if (pL.get<bool>("filtered matrix: reuse eigenvalue")) {
790  // Reuse max eigenvalue from A
791  // It is unclear what eigenvalue is the best for the smoothing, but we already may have
792  // the D^{-1}A estimate in A, may as well use it.
793  // NOTE: ML does that too
794  filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
795  } else {
796  filteredA->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
797  }
798 
799  } else if (blkSize > 1 && threshold == zero) {
800  // Case 3: block problem without filtering
801  //
802  // FIXME_KOKKOS: this code is completely unoptimized. It really should do
803  // a very simple thing: merge rows and produce nodal graph. But the code
804  // seems very complicated. Can we do better?
805 
806  TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() % blkSize != 0, MueLu::Exceptions::RuntimeError, "MueLu::CoalesceDropFactory: Number of local elements is " << A->getRowMap()->getLocalNumElements() << " but should be a multiply of " << blkSize);
807 
808  const RCP<const Map> rowMap = A->getRowMap();
809  const RCP<const Map> colMap = A->getColMap();
810 
811  // build a node row map (uniqueMap = non-overlapping) and a node column map
812  // (nonUniqueMap = overlapping). The arrays rowTranslation and colTranslation
813  // stored in the AmalgamationInfo class container contain the local node id
814  // given a local dof id. The data is calculated in the AmalgamationFactory and
815  // stored in the variable "UnAmalgamationInfo" (which is of type AmalagamationInfo)
816  const RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
817  const RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
818  Array<LO> rowTranslationArray = *(amalInfo->getRowTranslation()); // TAW should be transform that into a View?
819  Array<LO> colTranslationArray = *(amalInfo->getColTranslation());
820 
821  Kokkos::View<LO*, Kokkos::MemoryUnmanaged>
822  rowTranslationView(rowTranslationArray.getRawPtr(), rowTranslationArray.size());
823  Kokkos::View<LO*, Kokkos::MemoryUnmanaged>
824  colTranslationView(colTranslationArray.getRawPtr(), colTranslationArray.size());
825 
826  // get number of local nodes
827  LO numNodes = Teuchos::as<LocalOrdinal>(uniqueMap->getLocalNumElements());
828  typedef typename Kokkos::View<LocalOrdinal*, typename Node::device_type> id_translation_type;
829  id_translation_type rowTranslation("dofId2nodeId", rowTranslationArray.size());
830  id_translation_type colTranslation("ov_dofId2nodeId", colTranslationArray.size());
831  Kokkos::deep_copy(rowTranslation, rowTranslationView);
832  Kokkos::deep_copy(colTranslation, colTranslationView);
833 
834  // extract striding information
835  blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
836  LocalOrdinal blkId = -1; //< the block id within a strided map or -1 if it is a full block map
837  LocalOrdinal blkPartSize = A->GetFixedBlockSize(); //< stores block size of part blkId (or the full block size)
838  if (A->IsView("stridedMaps") == true) {
839  const RCP<const Map> myMap = A->getRowMap("stridedMaps");
840  const RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
841  TEUCHOS_TEST_FOR_EXCEPTION(strMap.is_null() == true, Exceptions::RuntimeError, "Map is not of type stridedMap");
842  blkSize = Teuchos::as<const LocalOrdinal>(strMap->getFixedBlockSize());
843  blkId = strMap->getStridedBlockId();
844  if (blkId > -1)
845  blkPartSize = Teuchos::as<LocalOrdinal>(strMap->getStridingData()[blkId]);
846  }
847  auto kokkosMatrix = A->getLocalMatrixDevice(); // access underlying kokkos data
848 
849  //
850  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
851  typedef typename kokkos_graph_type::row_map_type row_map_type;
852  // typedef typename row_map_type::HostMirror row_map_type_h;
853  typedef typename kokkos_graph_type::entries_type entries_type;
854 
855  // Stage 1c: get number of dof-nonzeros per blkSize node rows
856  typename row_map_type::non_const_type dofNnz("nnz_map", numNodes + 1);
857  LO numDofCols = 0;
859  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1a", range_type(0, numNodes), stage1aFunctor, numDofCols);
860  // parallel_scan (exclusive)
862  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0, numNodes + 1), scanFunctor);
863 
864  // Detect and record dof rows that correspond to Dirichlet boundary conditions
865  boundary_nodes_type singleEntryRows = Utilities::DetectDirichletRows_kokkos(*A, dirichletThreshold);
866 
867  typename entries_type::non_const_type dofcols("dofcols", numDofCols /*dofNnz(numNodes)*/); // why does dofNnz(numNodes) work? should be a parallel reduce, i guess
868 
869  // we have dofcols and dofids from Stage1dVectorFunctor
870  LO numNodeCols = 0;
871  typename row_map_type::non_const_type rows("nnz_nodemap", numNodes + 1);
872  typename boundary_nodes_type::non_const_type bndNodes("boundaryNodes", numNodes);
873 
874  CoalesceDrop_Kokkos_Details::Stage1bcVectorFunctor<decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize), decltype(dofcols), decltype(colTranslation), decltype(singleEntryRows), decltype(bndNodes), bool> stage1bcFunctor(kokkosMatrix, dofNnz, blkPartSize, dofcols, colTranslation, rows, singleEntryRows, bndNodes, pL.get<bool>("aggregation: greedy Dirichlet"));
875  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0, numNodes), stage1bcFunctor, numNodeCols);
876 
877  // parallel_scan (exclusive)
879  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0, numNodes + 1), scanNodeFunctor);
880 
881  // create column node view
882  typename entries_type::non_const_type cols("nodecols", numNodeCols);
883 
885  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0, numNodes), stage1dFunctor);
886  kokkos_graph_type kokkosGraph(cols, rows);
887 
888  // create LW graph
889  graph = rcp(new LWGraph_kokkos(kokkosGraph, uniqueMap, nonUniqueMap, "amalgamated graph of A"));
890 
891  boundaryNodes = bndNodes;
892  graph->SetBoundaryNodeMap(boundaryNodes);
893  numTotal = A->getLocalNumEntries();
894 
895  dofsPerNode = blkSize;
896 
897  filteredA = A;
898 
899  } else {
900  TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "MueLu: CoalesceDropFactory_kokkos: Block filtering is not implemented");
901  }
902 
903  if (GetVerbLevel() & Statistics1) {
904  GO numLocalBoundaryNodes = 0;
905  GO numGlobalBoundaryNodes = 0;
906 
907  Kokkos::parallel_reduce(
908  "MueLu:CoalesceDropF:Build:bnd", range_type(0, boundaryNodes.extent(0)),
909  KOKKOS_LAMBDA(const LO i, GO& n) {
910  if (boundaryNodes(i))
911  n++;
912  },
913  numLocalBoundaryNodes);
914 
915  auto comm = A->getRowMap()->getComm();
916  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
917  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
918  }
919 
920  if ((GetVerbLevel() & Statistics1) && threshold != zero) {
921  auto comm = A->getRowMap()->getComm();
922 
923  GO numGlobalTotal, numGlobalDropped;
924  MueLu_sumAll(comm, numTotal, numGlobalTotal);
925  MueLu_sumAll(comm, numDropped, numGlobalDropped);
926 
927  if (numGlobalTotal != 0) {
928  GetOStream(Statistics1) << "Number of dropped entries: "
929  << numGlobalDropped << "/" << numGlobalTotal
930  << " (" << 100 * Teuchos::as<double>(numGlobalDropped) / Teuchos::as<double>(numGlobalTotal) << "%)" << std::endl;
931  }
932  }
933 
934  Set(currentLevel, "DofsPerNode", dofsPerNode);
935  Set(currentLevel, "Graph", graph);
936  Set(currentLevel, "A", filteredA);
937 }
938 } // namespace MueLu
939 #endif // MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
KOKKOS_INLINE_FUNCTION void operator()(const LO rowNode) const
#define MueLu_sumAll(rcpComm, in, out)
MueLu::DefaultLocalOrdinal LocalOrdinal
KOKKOS_INLINE_FUNCTION void operator()(const LO row, LO &nnz) const
Lightweight MueLu representation of a compressed row storage graph.
Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_)
ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_, typename rows_type::non_const_type rows_, typename cols_type::non_const_type colsAux_, typename vals_type::non_const_type valsAux_, bool reuseGraph_, bool lumping_, SC, bool aggregationMayCreateDirichlet_)
KOKKOS_INLINE_FUNCTION void SetBoundaryNodeMap(const boundary_nodes_type bndry)
Set boolean array indicating which rows correspond to Dirichlet boundaries.
Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_)
typename std::conditional< OnHost, typename local_graph_device_type::HostMirror, local_graph_device_type >::type local_graph_type
void setValidator(RCP< const ParameterEntryValidator > const &validator)
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
KOKKOS_FORCEINLINE_FUNCTION bool operator()(LO row, LO col, SC val) const
ParameterList & set(std::string const &name, T const &value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold)
KOKKOS_INLINE_FUNCTION magnitudeType distance2(LO row, LO col) const
Print more statistics.
LocalOrdinal LO
One-liner description of what is happening.
T * get() const
KOKKOS_INLINE_FUNCTION void operator()(const LO i, LO &upd, const bool &final) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:99
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold)
void DeclareInput(Level &currentLevel) const
Input.
KOKKOS_INLINE_FUNCTION void operator()(const LO row, LO &totalnnz) const
Stage1bcVectorFunctor(MatrixType kokkosMatrix_, NnzType coldofnnz_, blkSizeType blkSize_, ColDofType coldofs_, Dof2NodeTranslationType dof2node_, NnzType colnodennz_, BdryNodeTypeConst dirichletdof_, BdryNodeType bdrynode_, boolType usegreedydirichlet_)
static RCP< Vector > GetMatrixOverlappedDiagonal(const Matrix &A)
Extract Overlapped Matrix Diagonal.
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
void Build(Level &currentLevel) const
Build an object with this factory.
size_type size() const
Scalar SC
#define SET_VALID_ENTRY(name)
Node NO
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)
ParameterEntry & getEntry(const std::string &name)
TransListIter end
static Kokkos::View< bool *, typename NO::device_type::memory_space > DetectDirichletRows_kokkos(const Matrix &A, const Magnitude &tol=Teuchos::ScalarTraits< typename Teuchos::ScalarTraits< SC >::magnitudeType >::zero(), const bool count_twos_as_dirichlet=false)
Detect Dirichlet rows.
KOKKOS_INLINE_FUNCTION void operator()(const LO rowNode, LO &nnz) const
bool is_null() const