MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_CoalesceDropFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
48 
49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
50 #include <Kokkos_Core.hpp>
51 #include <KokkosSparse_CrsMatrix.hpp>
52 
53 #include "Xpetra_Matrix.hpp"
54 
56 
57 #include "MueLu_AmalgamationInfo_kokkos.hpp"
58 #include "MueLu_Exceptions.hpp"
59 #include "MueLu_Level.hpp"
60 #include "MueLu_LWGraph_kokkos.hpp"
61 #include "MueLu_MasterList.hpp"
62 #include "MueLu_Monitor.hpp"
63 #include "MueLu_Utilities_kokkos.hpp"
64 
65 namespace MueLu {
66 
67 
68  namespace CoalesceDrop_Kokkos_Details { // anonymous
69 
70  template<class LO, class RowType>
71  class ScanFunctor {
72  public:
73  ScanFunctor(RowType rows_) : rows(rows_) { }
74 
75  KOKKOS_INLINE_FUNCTION
76  void operator()(const LO i, LO& upd, const bool& final) const {
77  upd += rows(i);
78  if (final)
79  rows(i) = upd;
80  }
81 
82  private:
83  RowType rows;
84  };
85 
86  template<class LO, class GhostedViewType>
87  class ClassicalDropFunctor {
88  private:
89  typedef typename GhostedViewType::value_type SC;
90  typedef Kokkos::ArithTraits<SC> ATS;
91  typedef typename ATS::magnitudeType magnitudeType;
92 
93  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
94  magnitudeType eps;
95 
96  public:
97  ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold) :
98  diag(ghostedDiag),
99  eps(threshold)
100  { }
101 
102  // Return true if we drop, false if not
103  KOKKOS_FORCEINLINE_FUNCTION
104  bool operator()(LO row, LO col, SC val) const {
105  // We avoid square root by using squared values
106  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
107  auto aij2 = ATS::magnitude(val) * ATS::magnitude(val); // |a_ij|^2
108 
109  return (aij2 <= eps*eps * aiiajj);
110  }
111  };
112 
113  template<class LO, class CoordsType>
114  class DistanceFunctor {
115  private:
116  typedef typename CoordsType::value_type SC;
117  typedef Kokkos::ArithTraits<SC> ATS;
118  typedef typename ATS::magnitudeType magnitudeType;
119 
120  public:
121  typedef SC value_type;
122 
123  public:
124  DistanceFunctor(CoordsType coords_) : coords(coords_) { }
125 
126  KOKKOS_INLINE_FUNCTION
127  magnitudeType distance2(LO row, LO col) const {
128  SC d = ATS::zero(), s;
129  for (size_t j = 0; j < coords.extent(1); j++) {
130  s = coords(row,j) - coords(col,j);
131  d += s*s;
132  }
133  return ATS::magnitude(d);
134  }
135  private:
136  CoordsType coords;
137  };
138 
139  template<class LO, class GhostedViewType, class DistanceFunctor>
140  class DistanceLaplacianDropFunctor {
141  private:
142  typedef typename GhostedViewType::value_type SC;
143  typedef Kokkos::ArithTraits<SC> ATS;
144  typedef typename ATS::magnitudeType magnitudeType;
145 
146  public:
147  DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold) :
148  diag(ghostedLaplDiag),
149  distFunctor(distFunctor_),
150  eps(threshold)
151  { }
152 
153  // Return true if we drop, false if not
154  KOKKOS_INLINE_FUNCTION
155  bool operator()(LO row, LO col, SC /* val */) const {
156  // We avoid square root by using squared values
157 
158  // We ignore incoming value of val as we operate on an auxiliary
159  // distance Laplacian matrix
160  typedef typename DistanceFunctor::value_type dSC;
161  typedef Kokkos::ArithTraits<dSC> dATS;
162  auto fval = dATS::one() / distFunctor.distance2(row, col);
163 
164  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
165  auto aij2 = ATS::magnitude(fval) * ATS::magnitude(fval); // |a_ij|^2
166 
167  return (aij2 <= eps*eps * aiiajj);
168  }
169 
170  private:
171  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
172  DistanceFunctor distFunctor;
173  magnitudeType eps;
174  };
175 
176  template<class SC, class LO, class MatrixType, class BndViewType, class DropFunctorType>
177  class ScalarFunctor {
178  private:
179  typedef typename MatrixType::StaticCrsGraphType graph_type;
180  typedef typename graph_type::row_map_type rows_type;
181  typedef typename graph_type::entries_type cols_type;
182  typedef typename MatrixType::values_type vals_type;
183  typedef Kokkos::ArithTraits<SC> ATS;
184  typedef typename ATS::magnitudeType magnitudeType;
185 
186  public:
187  ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_,
188  typename rows_type::non_const_type rows_,
189  typename cols_type::non_const_type colsAux_,
190  typename vals_type::non_const_type valsAux_,
191  bool reuseGraph_, bool lumping_, SC /* threshold_ */) :
192  A(A_),
193  bndNodes(bndNodes_),
194  dropFunctor(dropFunctor_),
195  rows(rows_),
196  colsAux(colsAux_),
197  valsAux(valsAux_),
198  reuseGraph(reuseGraph_),
199  lumping(lumping_)
200  {
201  rowsA = A.graph.row_map;
202  zero = ATS::zero();
203  }
204 
205  KOKKOS_INLINE_FUNCTION
206  void operator()(const LO row, LO& nnz) const {
207  auto rowView = A.rowConst(row);
208  auto length = rowView.length;
209  auto offset = rowsA(row);
210 
211  SC diag = zero;
212  LO rownnz = 0;
213  LO diagID = -1;
214  for (decltype(length) colID = 0; colID < length; colID++) {
215  LO col = rowView.colidx(colID);
216  SC val = rowView.value (colID);
217 
218  if (!dropFunctor(row, col, rowView.value(colID)) || row == col) {
219  colsAux(offset+rownnz) = col;
220 
221  LO valID = (reuseGraph ? colID : rownnz);
222  valsAux(offset+valID) = val;
223  if (row == col)
224  diagID = valID;
225 
226  rownnz++;
227 
228  } else {
229  // Rewrite with zeros (needed for reuseGraph)
230  valsAux(offset+colID) = zero;
231  diag += val;
232  }
233  }
234  // How to assert on the device?
235  // assert(diagIndex != -1);
236  rows(row+1) = rownnz;
237  // if (lumping && diagID != -1) {
238  if (lumping) {
239  // Add diag to the diagonal
240 
241  // NOTE_KOKKOS: valsAux was allocated with
242  // ViewAllocateWithoutInitializing. This is not a problem here
243  // because we explicitly set this value above.
244  valsAux(offset+diagID) += diag;
245  }
246 
247  // If the only element remaining after filtering is diagonal, mark node as boundary
248  // FIXME: this should really be replaced by the following
249  // if (indices.size() == 1 && indices[0] == row)
250  // boundaryNodes[row] = true;
251  // We do not do it this way now because there is no framework for distinguishing isolated
252  // and boundary nodes in the aggregation algorithms
253  bndNodes(row) = (rownnz == 1);
254 
255  nnz += rownnz;
256  }
257 
258  private:
259  MatrixType A;
260  BndViewType bndNodes;
261  DropFunctorType dropFunctor;
262 
263  rows_type rowsA;
264 
265  typename rows_type::non_const_type rows;
266  typename cols_type::non_const_type colsAux;
267  typename vals_type::non_const_type valsAux;
268 
269  bool reuseGraph;
270  bool lumping;
271  SC zero;
272  };
273 
274  // collect number nonzeros of blkSize rows in nnz_(row+1)
275  template<class MatrixType, class NnzType, class blkSizeType>
276  class Stage1aVectorFunctor {
277  private:
278  typedef typename MatrixType::ordinal_type LO;
279 
280  public:
281  Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_) :
282  kokkosMatrix(kokkosMatrix_),
283  nnz(nnz_),
284  blkSize(blkSize_) { }
285 
286  KOKKOS_INLINE_FUNCTION
287  void operator()(const LO row, LO& totalnnz) const {
288 
289  // the following code is more or less what MergeRows is doing
290  // count nonzero entries in all dof rows associated with node row
291  LO nodeRowMaxNonZeros = 0;
292  for (LO j = 0; j < blkSize; j++) {
293  auto rowView = kokkosMatrix.row(row * blkSize + j);
294  nodeRowMaxNonZeros += rowView.length;
295  }
296  nnz(row + 1) = nodeRowMaxNonZeros;
297  totalnnz += nodeRowMaxNonZeros;
298  }
299 
300 
301  private:
302  MatrixType kokkosMatrix; //< local matrix part
303  NnzType nnz; //< View containing number of nonzeros for current row
304  blkSizeType blkSize; //< block size (or partial block size in strided maps)
305  };
306 
307 
308  // build the dof-based column map containing the local dof ids belonging to blkSize rows in matrix
309  // the DofIds may not be sorted.
310  template<class MatrixType, class NnzType, class blkSizeType, class ColDofType>
311  class Stage1bVectorFunctor {
312  private:
313  typedef typename MatrixType::ordinal_type LO;
314  //typedef typename MatrixType::value_type SC;
315  //typedef typename MatrixType::device_type NO;
316 
317  private:
318  MatrixType kokkosMatrix; //< local matrix part
319  NnzType nnz; //< View containing dof offsets for dof columns
320  blkSizeType blkSize; //< block size (or partial block size in strided maps)
321  ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
322 
323  public:
324  Stage1bVectorFunctor(MatrixType kokkosMatrix_,
325  NnzType nnz_,
326  blkSizeType blkSize_,
327  ColDofType coldofs_) :
328  kokkosMatrix(kokkosMatrix_),
329  nnz(nnz_),
330  blkSize(blkSize_),
331  coldofs(coldofs_) {
332  }
333 
334  KOKKOS_INLINE_FUNCTION
335  void operator()(const LO rowNode) const {
336 
337  LO pos = nnz(rowNode);
338  for (LO j = 0; j < blkSize; j++) {
339  auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
340  auto numIndices = rowView.length;
341 
342  for (decltype(numIndices) k = 0; k < numIndices; k++) {
343  auto dofID = rowView.colidx(k);
344  coldofs(pos) = dofID;
345  pos ++;
346  }
347  }
348  }
349 
350  };
351 
352  // sort column ids
353  // translate them into (unique) node ids
354  // count the node column ids per node row
355  template<class MatrixType, class ColDofNnzType, class ColDofType, class Dof2NodeTranslationType, class BdryNodeType>
356  class Stage1cVectorFunctor {
357  private:
358  typedef typename MatrixType::ordinal_type LO;
359 
360  private:
361  ColDofNnzType coldofnnz; //< view containing start and stop indices for subviews
362  ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
363  Dof2NodeTranslationType dof2node; //< view containing the local node id associated with the local dof id
364  ColDofNnzType colnodennz; //< view containing number of column nodes for each node row
365  BdryNodeType bdrynode; //< view containing with numNodes booleans. True if node is (full) dirichlet boundardy node.
366 
367  public:
368  Stage1cVectorFunctor(ColDofNnzType coldofnnz_, ColDofType coldofs_, Dof2NodeTranslationType dof2node_, ColDofNnzType colnodennz_, BdryNodeType bdrynode_) :
369  coldofnnz(coldofnnz_),
370  coldofs(coldofs_),
371  dof2node(dof2node_),
372  colnodennz(colnodennz_),
373  bdrynode(bdrynode_) {
374  }
375 
376  KOKKOS_INLINE_FUNCTION
377  void operator()(const LO rowNode, LO& nnz) const {
378  LO begin = coldofnnz(rowNode);
379  LO end = coldofnnz(rowNode+1);
380  LO n = end - begin;
381  for (LO i = 0; i < (n-1); i++) {
382  for (LO j = 0; j < (n-i-1); j++) {
383  if (coldofs(j+begin) > coldofs(j+begin+1)) {
384  LO temp = coldofs(j+begin);
385  coldofs(j+begin) = coldofs(j+begin+1);
386  coldofs(j+begin+1) = temp;
387  }
388  }
389  }
390  size_t cnt = 0;
391  LO lastNodeID = -1;
392  for (LO i = 0; i < n; i++) {
393  LO dofID = coldofs(begin + i);
394  LO nodeID = dof2node(dofID);
395  if(nodeID != lastNodeID) {
396  lastNodeID = nodeID;
397  coldofs(begin+cnt) = nodeID;
398  cnt++;
399  }
400  }
401  if(cnt == 1)
402  bdrynode(rowNode) = true;
403  else
404  bdrynode(rowNode) = false;
405  colnodennz(rowNode+1) = cnt;
406  nnz += cnt;
407  }
408 
409  };
410 
411  // fill column node id view
412  template<class MatrixType, class ColDofNnzType, class ColDofType, class ColNodeNnzType, class ColNodeType>
413  class Stage1dVectorFunctor {
414  private:
415  typedef typename MatrixType::ordinal_type LO;
416  typedef typename MatrixType::value_type SC;
417 
418  private:
419  ColDofType coldofs; //< view containing mixed node and dof indices (only input)
420  ColDofNnzType coldofnnz; //< view containing the start and stop indices for subviews (dofs)
421  ColNodeType colnodes; //< view containing the local node ids associated with columns
422  ColNodeNnzType colnodennz; //< view containing start and stop indices for subviews
423 
424  public:
425  Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_) :
426  coldofs(coldofs_),
427  coldofnnz(coldofnnz_),
428  colnodes(colnodes_),
429  colnodennz(colnodennz_) {
430  }
431 
432  KOKKOS_INLINE_FUNCTION
433  void operator()(const LO rowNode) const {
434  auto dofbegin = coldofnnz(rowNode);
435  auto nodebegin = colnodennz(rowNode);
436  auto nodeend = colnodennz(rowNode+1);
437  auto n = nodeend - nodebegin;
438 
439  for (decltype(nodebegin) i = 0; i < n; i++) {
440  colnodes(nodebegin + i) = coldofs(dofbegin + i);
441  }
442  }
443  };
444 
445 
446  } // namespace
447 
448  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
449  RCP<const ParameterList> CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList() const {
450  RCP<ParameterList> validParamList = rcp(new ParameterList());
451 
452 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
453  SET_VALID_ENTRY("aggregation: drop tol");
454  SET_VALID_ENTRY("aggregation: Dirichlet threshold");
455  SET_VALID_ENTRY("aggregation: drop scheme");
456  SET_VALID_ENTRY("filtered matrix: use lumping");
457  SET_VALID_ENTRY("filtered matrix: reuse graph");
458  SET_VALID_ENTRY("filtered matrix: reuse eigenvalue");
459  {
461  validParamList->getEntry("aggregation: drop scheme").setValidator(
462  rcp(new validatorType(Teuchos::tuple<std::string>("classical", "distance laplacian"), "aggregation: drop scheme")));
463  }
464 #undef SET_VALID_ENTRY
465  validParamList->set< bool > ("lightweight wrap", true, "Experimental option for lightweight graph access");
466 
467  validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
468  validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
469  validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory for Coordinates");
470 
471  return validParamList;
472  }
473 
474  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
475  void CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level &currentLevel) const {
476  Input(currentLevel, "A");
477  Input(currentLevel, "UnAmalgamationInfo");
478 
479  const ParameterList& pL = GetParameterList();
480  if (pL.get<bool>("lightweight wrap") == true) {
481  if (pL.get<std::string>("aggregation: drop scheme") == "distance laplacian")
482  Input(currentLevel, "Coordinates");
483  }
484  }
485 
486  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
487  void CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
488  Build(Level& currentLevel) const {
489  FactoryMonitor m(*this, "Build", currentLevel);
490 
491  typedef Teuchos::ScalarTraits<SC> STS;
492  const SC zero = STS::zero();
493 
494  auto A = Get< RCP<Matrix> >(currentLevel, "A");
495  LO blkSize = A->GetFixedBlockSize();
496 
497  auto amalInfo = Get< RCP<AmalgamationInfo_kokkos> >(currentLevel, "UnAmalgamationInfo");
498 
499  const ParameterList& pL = GetParameterList();
500 
501  bool doLightWeightWrap = pL.get<bool>("lightweight wrap");
502  GetOStream(Warnings0) << "lightweight wrap is deprecated" << std::endl;
503  TEUCHOS_TEST_FOR_EXCEPTION(!doLightWeightWrap, Exceptions::RuntimeError,
504  "MueLu KokkosRefactor only supports \"lightweight wrap\"=\"true\"");
505 
506  std::string algo = pL.get<std::string>("aggregation: drop scheme");
507 
508  double threshold = pL.get<double>("aggregation: drop tol");
509  GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold
510  << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
511 
512  const typename STS::magnitudeType dirichletThreshold =
513  STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
514 
515  GO numDropped = 0, numTotal = 0;
516 
517  RCP<LWGraph_kokkos> graph;
518  LO dofsPerNode = -1;
519 
520  typedef typename LWGraph_kokkos::boundary_nodes_type boundary_nodes_type;
521  boundary_nodes_type boundaryNodes;
522 
523  RCP<Matrix> filteredA;
524  if (blkSize == 1 && threshold == zero) {
525  // Scalar problem without dropping
526 
527  // Detect and record rows that correspond to Dirichlet boundary conditions
528  boundaryNodes = Utilities_kokkos::DetectDirichletRows(*A, dirichletThreshold);
529 
530  // Trivial LWGraph construction
531  graph = rcp(new LWGraph_kokkos(A->getLocalMatrix().graph, A->getRowMap(), A->getColMap(), "graph of A"));
532  graph->SetBoundaryNodeMap(boundaryNodes);
533 
534  numTotal = A->getNodeNumEntries();
535  dofsPerNode = 1;
536 
537  filteredA = A;
538 
539  } else if (blkSize == 1 && threshold != zero) {
540  // Scalar problem with dropping
541 
542  typedef typename Matrix::local_matrix_type local_matrix_type;
543  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
544  typedef typename kokkos_graph_type::row_map_type::non_const_type rows_type;
545  typedef typename kokkos_graph_type::entries_type::non_const_type cols_type;
546  typedef typename local_matrix_type::values_type::non_const_type vals_type;
547 
548  LO numRows = A->getNodeNumRows();
549  auto kokkosMatrix = A->getLocalMatrix();
550  auto nnzA = kokkosMatrix.nnz();
551  auto rowsA = kokkosMatrix.graph.row_map;
552 
553  typedef Kokkos::ArithTraits<SC> ATS;
554 
555  bool reuseGraph = pL.get<bool>("filtered matrix: reuse graph");
556  bool lumping = pL.get<bool>("filtered matrix: use lumping");
557  if (lumping)
558  GetOStream(Runtime0) << "Lumping dropped entries" << std::endl;
559 
560  // FIXME_KOKKOS: replace by ViewAllocateWithoutInitializing + setting a single value
561  rows_type rows ("FA_rows", numRows+1);
562  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("FA_aux_cols"), nnzA);
563  vals_type valsAux;
564  if (reuseGraph) {
565  SubFactoryMonitor m2(*this, "CopyMatrix", currentLevel);
566 
567  // Share graph with the original matrix
568  filteredA = MatrixFactory::Build(A->getCrsGraph());
569 
570  // Do a no-op fill-complete
571  RCP<ParameterList> fillCompleteParams(new ParameterList);
572  fillCompleteParams->set("No Nonlocal Changes", true);
573  filteredA->fillComplete(fillCompleteParams);
574 
575  // No need to reuseFill, just modify in place
576  valsAux = filteredA->getLocalMatrix().values;
577 
578  } else {
579  // Need an extra array to compress
580  valsAux = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_aux_vals"), nnzA);
581  }
582 
583  typename boundary_nodes_type::non_const_type bndNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows);
584 
585  LO nnzFA = 0;
586  {
587  if (algo == "classical") {
588  // Construct overlapped matrix diagonal
589  RCP<Vector> ghostedDiag;
590  {
591  SubFactoryMonitor m2(*this, "Ghosted diag construction", currentLevel);
592  ghostedDiag = Utilities_kokkos::GetMatrixOverlappedDiagonal(*A);
593  }
594 
595  // Filter out entries
596  {
597  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
598 
599  auto ghostedDiagView = ghostedDiag->template getLocalView<DeviceType>();
600 
601  CoalesceDrop_Kokkos_Details::ClassicalDropFunctor<LO, decltype(ghostedDiagView)> dropFunctor(ghostedDiagView, threshold);
602  CoalesceDrop_Kokkos_Details::ScalarFunctor<SC, LO, local_matrix_type, decltype(bndNodes), decltype(dropFunctor)>
603  scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold);
604 
605  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
606  scalarFunctor, nnzFA);
607  }
608 
609  } else if (algo == "distance laplacian") {
610  typedef Xpetra::MultiVector<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,LO,GO,NO> doubleMultiVector;
611  auto coords = Get<RCP<doubleMultiVector> >(currentLevel, "Coordinates");
612 
613  auto uniqueMap = A->getRowMap();
614  auto nonUniqueMap = A->getColMap();
615 
616  // Construct ghosted coordinates
617  RCP<const Import> importer;
618  {
619  SubFactoryMonitor m2(*this, "Coords Import construction", currentLevel);
620  importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
621  }
622  RCP<doubleMultiVector> ghostedCoords;
623  {
624  SubFactoryMonitor m2(*this, "Ghosted coords construction", currentLevel);
625  ghostedCoords = Xpetra::MultiVectorFactory<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,LO,GO,NO>::Build(nonUniqueMap, coords->getNumVectors());
626  ghostedCoords->doImport(*coords, *importer, Xpetra::INSERT);
627  }
628 
629  auto ghostedCoordsView = ghostedCoords->template getLocalView<DeviceType>();
630  CoalesceDrop_Kokkos_Details::DistanceFunctor<LO, decltype(ghostedCoordsView)> distFunctor(ghostedCoordsView);
631 
632  // Construct Laplacian diagonal
633  RCP<Vector> localLaplDiag;
634  {
635  SubFactoryMonitor m2(*this, "Local Laplacian diag construction", currentLevel);
636 
637  localLaplDiag = VectorFactory::Build(uniqueMap);
638 
639  auto localLaplDiagView = localLaplDiag->template getLocalView<DeviceType>();
640  auto kokkosGraph = kokkosMatrix.graph;
641 
642  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:laplacian_diag", range_type(0,numRows),
643  KOKKOS_LAMBDA(const LO row) {
644  auto rowView = kokkosGraph.rowConst(row);
645  auto length = rowView.length;
646 
647  SC d = ATS::zero();
648  for (decltype(length) colID = 0; colID < length; colID++) {
649  auto col = rowView(colID);
650  if (row != col)
651  d += ATS::one()/distFunctor.distance2(row, col);
652  }
653  localLaplDiagView(row,0) = d;
654  });
655  }
656 
657  // Construct ghosted Laplacian diagonal
658  RCP<Vector> ghostedLaplDiag;
659  {
660  SubFactoryMonitor m2(*this, "Ghosted Laplacian diag construction", currentLevel);
661  ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
662  ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
663  }
664 
665  // Filter out entries
666  {
667  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
668 
669  auto ghostedLaplDiagView = ghostedLaplDiag->template getLocalView<DeviceType>();
670 
671  CoalesceDrop_Kokkos_Details::DistanceLaplacianDropFunctor<LO, decltype(ghostedLaplDiagView), decltype(distFunctor)>
672  dropFunctor(ghostedLaplDiagView, distFunctor, threshold);
673  CoalesceDrop_Kokkos_Details::ScalarFunctor<SC, LO, local_matrix_type, decltype(bndNodes), decltype(dropFunctor)>
674  scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold);
675 
676  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
677  scalarFunctor, nnzFA);
678  }
679  }
680 
681  }
682  numDropped = nnzA - nnzFA;
683 
684  boundaryNodes = bndNodes;
685 
686  {
687  SubFactoryMonitor m2(*this, "CompressRows", currentLevel);
688 
689  // parallel_scan (exclusive)
690  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:compress_rows", range_type(0,numRows+1),
691  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
692  update += rows(i);
693  if (final_pass)
694  rows(i) = update;
695  });
696  }
697 
698  // Compress cols (and optionally vals)
699  // We use a trick here: we moved all remaining elements to the beginning
700  // of the original row in the main loop, so we don't need to check for
701  // INVALID here, and just stop when achieving the new number of elements
702  // per row.
703  cols_type cols(Kokkos::ViewAllocateWithoutInitializing("FA_cols"), nnzFA);
704  vals_type vals;
705  if (reuseGraph) {
706  // Only compress cols
707  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
708 
709  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
710  KOKKOS_LAMBDA(const LO i) {
711  // Is there Kokkos memcpy?
712  LO rowStart = rows(i);
713  LO rowAStart = rowsA(i);
714  size_t rownnz = rows(i+1) - rows(i);
715  for (size_t j = 0; j < rownnz; j++)
716  cols(rowStart+j) = colsAux(rowAStart+j);
717  });
718  } else {
719  // Compress cols and vals
720  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
721 
722  vals = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_vals"), nnzFA);
723 
724  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
725  KOKKOS_LAMBDA(const LO i) {
726  LO rowStart = rows(i);
727  LO rowAStart = rowsA(i);
728  size_t rownnz = rows(i+1) - rows(i);
729  for (size_t j = 0; j < rownnz; j++) {
730  cols(rowStart+j) = colsAux(rowAStart+j);
731  vals(rowStart+j) = colsAux(rowAStart+j);
732  }
733  });
734  }
735 
736  kokkos_graph_type kokkosGraph(cols, rows);
737 
738  {
739  SubFactoryMonitor m2(*this, "LWGraph construction", currentLevel);
740 
741  graph = rcp(new LWGraph_kokkos(kokkosGraph, A->getRowMap(), A->getColMap(), "filtered graph of A"));
742  graph->SetBoundaryNodeMap(boundaryNodes);
743  }
744 
745  numTotal = A->getNodeNumEntries();
746 
747  dofsPerNode = 1;
748 
749  if (!reuseGraph) {
750  SubFactoryMonitor m2(*this, "LocalMatrix+FillComplete", currentLevel);
751 
752  local_matrix_type localFA = local_matrix_type("A", numRows, kokkosMatrix.numCols(), nnzFA, vals, rows, cols);
753 #if 1
754  // FIXME: this should be gone once Xpetra propagate "local matrix + 4 maps" constructor
755  // FIXME: we don't need to rebuild importers/exporters. We should reuse A's
756  auto filteredACrs = CrsMatrixFactory::Build(A->getRowMap(), A->getColMap(), localFA);
757  filteredACrs->resumeFill(); // we need that for rectangular matrices
758  filteredACrs->expertStaticFillComplete(A->getDomainMap(), A->getRangeMap());
759 #endif
760  filteredA = rcp(new CrsMatrixWrap(filteredACrs));
761  }
762 
763  filteredA->SetFixedBlockSize(A->GetFixedBlockSize());
764 
765  if (pL.get<bool>("filtered matrix: reuse eigenvalue")) {
766  // Reuse max eigenvalue from A
767  // It is unclear what eigenvalue is the best for the smoothing, but we already may have
768  // the D^{-1}A estimate in A, may as well use it.
769  // NOTE: ML does that too
770  filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
771  } else {
772  filteredA->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
773  }
774 
775  } else if (blkSize > 1 && threshold == zero) {
776  // Case 3: block problem without filtering
777  //
778  // FIXME_KOKKOS: this code is completely unoptimized. It really should do
779  // a very simple thing: merge rows and produce nodal graph. But the code
780  // seems very complicated. Can we do better?
781 
782  TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getNodeNumElements() % blkSize != 0, MueLu::Exceptions::RuntimeError, "MueLu::CoalesceDropFactory: Number of local elements is " << A->getRowMap()->getNodeNumElements() << " but should be a multiply of " << blkSize);
783 
784  const RCP<const Map> rowMap = A->getRowMap();
785  const RCP<const Map> colMap = A->getColMap();
786 
787  // build a node row map (uniqueMap = non-overlapping) and a node column map
788  // (nonUniqueMap = overlapping). The arrays rowTranslation and colTranslation
789  // stored in the AmalgamationInfo class container contain the local node id
790  // given a local dof id. The data is calculated in the AmalgamationFactory and
791  // stored in the variable "UnAmalgamationInfo" (which is of type AmalagamationInfo)
792  const RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
793  const RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
794  Array<LO> rowTranslationArray = *(amalInfo->getRowTranslation()); // TAW should be transform that into a View?
795  Array<LO> colTranslationArray = *(amalInfo->getColTranslation());
796 
797  // get number of local nodes
798  LO numNodes = Teuchos::as<LocalOrdinal>(uniqueMap->getNodeNumElements());
799  typedef typename Kokkos::View<LocalOrdinal*, DeviceType> id_translation_type;
800  id_translation_type rowTranslation("dofId2nodeId",rowTranslationArray.size());
801  id_translation_type colTranslation("ov_dofId2nodeId",colTranslationArray.size());
802 
803  // TODO change this to lambdas
804  for (decltype(rowTranslationArray.size()) i = 0; i < rowTranslationArray.size(); ++i)
805  rowTranslation(i) = rowTranslationArray[i];
806  for (decltype(colTranslationArray.size()) i = 0; i < colTranslationArray.size(); ++i)
807  colTranslation(i) = colTranslationArray[i];
808  // extract striding information
809  blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
810  LocalOrdinal blkId = -1; //< the block id within a strided map or -1 if it is a full block map
811  LocalOrdinal blkPartSize = A->GetFixedBlockSize(); //< stores block size of part blkId (or the full block size)
812  if(A->IsView("stridedMaps") == true) {
813  const RCP<const Map> myMap = A->getRowMap("stridedMaps");
814  const RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
815  TEUCHOS_TEST_FOR_EXCEPTION(strMap.is_null() == true, Exceptions::RuntimeError, "Map is not of type stridedMap");
816  blkSize = Teuchos::as<const LocalOrdinal>(strMap->getFixedBlockSize());
817  blkId = strMap->getStridedBlockId();
818  if (blkId > -1)
819  blkPartSize = Teuchos::as<LocalOrdinal>(strMap->getStridingData()[blkId]);
820  }
821  auto kokkosMatrix = A->getLocalMatrix(); // access underlying kokkos data
822 
823  //
824  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
825  typedef typename kokkos_graph_type::row_map_type row_map_type;
826  //typedef typename row_map_type::HostMirror row_map_type_h;
827  typedef typename kokkos_graph_type::entries_type entries_type;
828 
829  // Stage 1c: get number of dof-nonzeros per blkSize node rows
830  typename row_map_type::non_const_type dofNnz("nnz_map", numNodes + 1);
831  LO numDofCols = 0;
832  CoalesceDrop_Kokkos_Details::Stage1aVectorFunctor<decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize)> stage1aFunctor(kokkosMatrix, dofNnz, blkPartSize);
833  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1a", range_type(0,numNodes), stage1aFunctor, numDofCols);
834  // parallel_scan (exclusive)
835  CoalesceDrop_Kokkos_Details::ScanFunctor<LO,decltype(dofNnz)> scanFunctor(dofNnz);
836  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanFunctor);
837 
838  typename entries_type::non_const_type dofcols("dofcols", numDofCols/*dofNnz(numNodes)*/); // why does dofNnz(numNodes) work? should be a parallel reduce, i guess
839  CoalesceDrop_Kokkos_Details::Stage1bVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize), decltype(dofcols)> stage1bFunctor(kokkosMatrix, dofNnz, blkPartSize, dofcols);
840  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1b", range_type(0,numNodes), stage1bFunctor);
841 
842  // we have dofcols and dofids from Stage1dVectorFunctor
843  LO numNodeCols = 0;
844  typename row_map_type::non_const_type rows("nnz_nodemap", numNodes + 1);
845  typename boundary_nodes_type::non_const_type bndNodes("boundaryNodes", numNodes);
846  CoalesceDrop_Kokkos_Details::Stage1cVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(dofcols), decltype(colTranslation), decltype(bndNodes)> stage1cFunctor(dofNnz, dofcols, colTranslation,rows,bndNodes);
847  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1cFunctor,numNodeCols);
848 
849  // parallel_scan (exclusive)
850  CoalesceDrop_Kokkos_Details::ScanFunctor<LO,decltype(rows)> scanNodeFunctor(rows);
851  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanNodeFunctor);
852 
853  // create column node view
854  typename entries_type::non_const_type cols("nodecols", numNodeCols);
855 
856 
857  CoalesceDrop_Kokkos_Details::Stage1dVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(dofcols), decltype(rows), decltype(cols)> stage1dFunctor(dofcols, dofNnz, cols, rows);
858  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1dFunctor);
859  kokkos_graph_type kokkosGraph(cols, rows);
860 
861  // create LW graph
862  graph = rcp(new LWGraph_kokkos(kokkosGraph, uniqueMap, nonUniqueMap, "amalgamated graph of A"));
863 
864  boundaryNodes = bndNodes;
865  graph->SetBoundaryNodeMap(boundaryNodes);
866  numTotal = A->getNodeNumEntries();
867 
868  dofsPerNode = blkSize;
869 
870  filteredA = A;
871 
872  } else {
873  TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "MueLu: CoalesceDropFactory_kokkos: Block filtering is not implemented");
874  }
875 
876  if (GetVerbLevel() & Statistics1) {
877  GO numLocalBoundaryNodes = 0;
878  GO numGlobalBoundaryNodes = 0;
879 
880  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:bnd", range_type(0, boundaryNodes.extent(0)),
881  KOKKOS_LAMBDA(const LO i, GO& n) {
882  if (boundaryNodes(i))
883  n++;
884  }, numLocalBoundaryNodes);
885 
886  auto comm = A->getRowMap()->getComm();
887  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
888  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
889  }
890 
891  if ((GetVerbLevel() & Statistics1) && threshold != zero) {
892  auto comm = A->getRowMap()->getComm();
893 
894  GO numGlobalTotal, numGlobalDropped;
895  MueLu_sumAll(comm, numTotal, numGlobalTotal);
896  MueLu_sumAll(comm, numDropped, numGlobalDropped);
897 
898  if (numGlobalTotal != 0) {
899  GetOStream(Statistics1) << "Number of dropped entries: "
900  << numGlobalDropped << "/" << numGlobalTotal
901  << " (" << 100*Teuchos::as<double>(numGlobalDropped)/Teuchos::as<double>(numGlobalTotal) << "%)" << std::endl;
902  }
903  }
904 
905  Set(currentLevel, "DofsPerNode", dofsPerNode);
906  Set(currentLevel, "Graph", graph);
907  Set(currentLevel, "A", filteredA);
908  }
909 }
910 #endif // HAVE_MUELU_KOKKOS_REFACTOR
911 #endif // MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
#define MueLu_sumAll(rcpComm, in, out)
MueLu::DefaultLocalOrdinal LocalOrdinal
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Print more statistics.
One-liner description of what is happening.
Kokkos::View< bool *, typename NO::device_type > DetectDirichletRows(const Xpetra::Matrix< SC, LO, GO, NO > &A, const typename Teuchos::ScalarTraits< SC >::magnitudeType &tol, const bool count_twos_as_dirichlet)
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename std::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=nullptr)
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename std::enable_if< Kokkos::Impl::is_execution_policy< ExecPolicy >::value >::type *=nullptr)
Exception throws to report errors in the internal logical of the program.
TransListIter end
#define SET_VALID_ENTRY(name)