MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
11 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
12 
13 #include "Kokkos_UnorderedMap.hpp"
15 
17 
18 #include "MueLu_Aggregates.hpp"
19 #include "MueLu_AmalgamationInfo.hpp"
20 #include "MueLu_AmalgamationFactory.hpp"
21 
22 #include "MueLu_MasterList.hpp"
23 #include "MueLu_PerfUtils.hpp"
24 #include "MueLu_Monitor.hpp"
25 
26 #include "Xpetra_IO.hpp"
27 
28 namespace MueLu {
29 
30 namespace { // anonymous
31 
32 template <class LocalOrdinal, class View>
33 class ReduceMaxFunctor {
34  public:
35  ReduceMaxFunctor(View view)
36  : view_(view) {}
37 
38  KOKKOS_INLINE_FUNCTION
39  void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const {
40  if (vmax < view_(i))
41  vmax = view_(i);
42  }
43 
44  KOKKOS_INLINE_FUNCTION
45  void join(LocalOrdinal& dst, const LocalOrdinal& src) const {
46  if (dst < src) {
47  dst = src;
48  }
49  }
50 
51  KOKKOS_INLINE_FUNCTION
52  void init(LocalOrdinal& dst) const {
53  dst = 0;
54  }
55 
56  private:
58 };
59 
60 // local QR decomposition
61 template <class LOType, class GOType, class SCType, class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
62 class LocalQRDecompFunctor {
63  private:
64  typedef LOType LO;
65  typedef GOType GO;
66  typedef SCType SC;
67 
68  typedef typename DeviceType::execution_space execution_space;
69  typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
70  typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
71  typedef typename impl_ATS::magnitudeType Magnitude;
72 
73  typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
74  typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
75 
76  private:
77  NspType fineNS;
78  NspType coarseNS;
79  aggRowsType aggRows;
80  maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
81  agg2RowMapLOType agg2RowMapLO;
82  statusType statusAtomic;
83  rowsType rows;
84  rowsAuxType rowsAux;
85  colsAuxType colsAux;
86  valsAuxType valsAux;
87  bool doQRStep;
88 
89  public:
90  LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_)
91  : fineNS(fineNS_)
92  , coarseNS(coarseNS_)
93  , aggRows(aggRows_)
94  , maxAggDofSize(maxAggDofSize_)
95  , agg2RowMapLO(agg2RowMapLO_)
96  , statusAtomic(statusAtomic_)
97  , rows(rows_)
98  , rowsAux(rowsAux_)
99  , colsAux(colsAux_)
100  , valsAux(valsAux_)
101  , doQRStep(doQRStep_) {}
102 
103  KOKKOS_INLINE_FUNCTION
104  void operator()(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread, size_t& nnz) const {
105  auto agg = thread.league_rank();
106 
107  // size of aggregate: number of DOFs in aggregate
108  auto aggSize = aggRows(agg + 1) - aggRows(agg);
109 
110  const impl_SC one = impl_ATS::one();
111  const impl_SC two = one + one;
112  const impl_SC zero = impl_ATS::zero();
113  const auto zeroM = impl_ATS::magnitude(zero);
114 
115  int m = aggSize;
116  int n = fineNS.extent(1);
117 
118  // calculate row offset for coarse nullspace
119  Xpetra::global_size_t offset = agg * n;
120 
121  if (doQRStep) {
122  // Extract the piece of the nullspace corresponding to the aggregate
123  shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
124  for (int j = 0; j < n; j++)
125  for (int k = 0; k < m; k++)
126  r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j);
127 #if 0
128  printf("A\n");
129  for (int i = 0; i < m; i++) {
130  for (int j = 0; j < n; j++)
131  printf(" %5.3lf ", r(i,j));
132  printf("\n");
133  }
134 #endif
135 
136  // Calculate QR decomposition (standard)
137  shared_matrix q(thread.team_shmem(), m, m); // Q
138  if (m >= n) {
139  bool isSingular = false;
140 
141  // Initialize Q^T
142  auto qt = q;
143  for (int i = 0; i < m; i++) {
144  for (int j = 0; j < m; j++)
145  qt(i, j) = zero;
146  qt(i, i) = one;
147  }
148 
149  for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
150  // FIXME_KOKKOS: use team
151  Magnitude s = zeroM, norm, norm_x;
152  for (int i = k + 1; i < m; i++)
153  s += pow(impl_ATS::magnitude(r(i, k)), 2);
154  norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
155 
156  if (norm == zero) {
157  isSingular = true;
158  break;
159  }
160 
161  r(k, k) -= norm * one;
162 
163  norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
164  if (norm_x == zeroM) {
165  // We have a single diagonal element in the column.
166  // No reflections required. Just need to restor r(k,k).
167  r(k, k) = norm * one;
168  continue;
169  }
170 
171  // FIXME_KOKKOS: use team
172  for (int i = k; i < m; i++)
173  r(i, k) /= norm_x;
174 
175  // Update R(k:m,k+1:n)
176  for (int j = k + 1; j < n; j++) {
177  // FIXME_KOKKOS: use team in the loops
178  impl_SC si = zero;
179  for (int i = k; i < m; i++)
180  si += r(i, k) * r(i, j);
181  for (int i = k; i < m; i++)
182  r(i, j) -= two * si * r(i, k);
183  }
184 
185  // Update Q^T (k:m,k:m)
186  for (int j = k; j < m; j++) {
187  // FIXME_KOKKOS: use team in the loops
188  impl_SC si = zero;
189  for (int i = k; i < m; i++)
190  si += r(i, k) * qt(i, j);
191  for (int i = k; i < m; i++)
192  qt(i, j) -= two * si * r(i, k);
193  }
194 
195  // Fix R(k:m,k)
196  r(k, k) = norm * one;
197  for (int i = k + 1; i < m; i++)
198  r(i, k) = zero;
199  }
200 
201 #if 0
202  // Q = (Q^T)^T
203  for (int i = 0; i < m; i++)
204  for (int j = 0; j < i; j++) {
205  impl_SC tmp = qt(i,j);
206  qt(i,j) = qt(j,i);
207  qt(j,i) = tmp;
208  }
209 #endif
210 
211  // Build coarse nullspace using the upper triangular part of R
212  for (int j = 0; j < n; j++)
213  for (int k = 0; k <= j; k++)
214  coarseNS(offset + k, j) = r(k, j);
215 
216  if (isSingular) {
217  statusAtomic(1) = true;
218  return;
219  }
220 
221  } else {
222  // Special handling for m < n (i.e. single node aggregates in structural mechanics)
223 
224  // The local QR decomposition is not possible in the "overconstrained"
225  // case (i.e. number of columns in qr > number of rowsAux), which
226  // corresponds to #DOFs in Aggregate < n. For usual problems this
227  // is only possible for single node aggregates in structural mechanics.
228  // (Similar problems may arise in discontinuous Galerkin problems...)
229  // We bypass the QR decomposition and use an identity block in the
230  // tentative prolongator for the single node aggregate and transfer the
231  // corresponding fine level null space information 1-to-1 to the coarse
232  // level null space part.
233 
234  // NOTE: The resulting tentative prolongation operator has
235  // (m*DofsPerNode-n) zero columns leading to a singular
236  // coarse level operator A. To deal with that one has the following
237  // options:
238  // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
239  // false) to add some identity block to the diagonal of the zero rowsAux
240  // in the coarse level operator A, such that standard level smoothers
241  // can be used again.
242  // - Use special (projection-based) level smoothers, which can deal
243  // with singular matrices (very application specific)
244  // - Adapt the code below to avoid zero columns. However, we do not
245  // support a variable number of DOFs per node in MueLu/Xpetra which
246  // makes the implementation really hard.
247  //
248  // FIXME: do we need to check for singularity here somehow? Zero
249  // columns would be easy but linear dependency would require proper QR.
250 
251  // R = extended (by adding identity rowsAux) qr
252  for (int j = 0; j < n; j++)
253  for (int k = 0; k < n; k++)
254  if (k < m)
255  coarseNS(offset + k, j) = r(k, j);
256  else
257  coarseNS(offset + k, j) = (k == j ? one : zero);
258 
259  // Q = I (rectangular)
260  for (int i = 0; i < m; i++)
261  for (int j = 0; j < n; j++)
262  q(i, j) = (j == i ? one : zero);
263  }
264 
265  // Process each row in the local Q factor and fill helper arrays to assemble P
266  for (int j = 0; j < m; j++) {
267  LO localRow = agg2RowMapLO(aggRows(agg) + j);
268  size_t rowStart = rowsAux(localRow);
269  size_t lnnz = 0;
270  for (int k = 0; k < n; k++) {
271  // skip zeros
272  if (q(j, k) != zero) {
273  colsAux(rowStart + lnnz) = offset + k;
274  valsAux(rowStart + lnnz) = q(j, k);
275  lnnz++;
276  }
277  }
278  rows(localRow + 1) = lnnz;
279  nnz += lnnz;
280  }
281 
282 #if 0
283  printf("R\n");
284  for (int i = 0; i < m; i++) {
285  for (int j = 0; j < n; j++)
286  printf(" %5.3lf ", coarseNS(i,j));
287  printf("\n");
288  }
289 
290  printf("Q\n");
291  for (int i = 0; i < aggSize; i++) {
292  for (int j = 0; j < aggSize; j++)
293  printf(" %5.3lf ", q(i,j));
294  printf("\n");
295  }
296 #endif
297  } else {
299  // "no-QR" option //
301  // Local Q factor is just the fine nullspace support over the current aggregate.
302  // Local R factor is the identity.
303  // TODO I have not implemented any special handling for aggregates that are too
304  // TODO small to locally support the nullspace, as is done in the standard QR
305  // TODO case above.
306 
307  for (int j = 0; j < m; j++) {
308  LO localRow = agg2RowMapLO(aggRows(agg) + j);
309  size_t rowStart = rowsAux(localRow);
310  size_t lnnz = 0;
311  for (int k = 0; k < n; k++) {
312  const impl_SC qr_jk = fineNS(localRow, k);
313  // skip zeros
314  if (qr_jk != zero) {
315  colsAux(rowStart + lnnz) = offset + k;
316  valsAux(rowStart + lnnz) = qr_jk;
317  lnnz++;
318  }
319  }
320  rows(localRow + 1) = lnnz;
321  nnz += lnnz;
322  }
323 
324  for (int j = 0; j < n; j++)
325  coarseNS(offset + j, j) = one;
326  }
327  }
328 
329  // amount of shared memory
330  size_t team_shmem_size(int /* team_size */) const {
331  if (doQRStep) {
332  int m = maxAggDofSize;
333  int n = fineNS.extent(1);
334  return shared_matrix::shmem_size(m, n) + // r
335  shared_matrix::shmem_size(m, m); // q
336  } else
337  return 0;
338  }
339 };
340 
341 } // namespace
342 
343 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
345  RCP<ParameterList> validParamList = rcp(new ParameterList());
346 
347 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
348  SET_VALID_ENTRY("tentative: calculate qr");
349  SET_VALID_ENTRY("tentative: build coarse coordinates");
350 #undef SET_VALID_ENTRY
351 
352  validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
353  validParamList->set<RCP<const FactoryBase>>("Aggregates", Teuchos::null, "Generating factory of the aggregates");
354  validParamList->set<RCP<const FactoryBase>>("Nullspace", Teuchos::null, "Generating factory of the nullspace");
355  validParamList->set<RCP<const FactoryBase>>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
356  validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
357  validParamList->set<RCP<const FactoryBase>>("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
358  validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory of the coordinates");
359  validParamList->set<RCP<const FactoryBase>>("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
360 
361  // Make sure we don't recursively validate options for the matrixmatrix kernels
362  ParameterList norecurse;
363  norecurse.disableRecursiveValidation();
364  validParamList->set<ParameterList>("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
365 
366  return validParamList;
367 }
368 
369 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
371  const ParameterList& pL = GetParameterList();
372  // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
373  std::string nspName = "Nullspace";
374  if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
375 
376  Input(fineLevel, "A");
377  Input(fineLevel, "Aggregates");
378  Input(fineLevel, nspName);
379  Input(fineLevel, "UnAmalgamationInfo");
380  Input(fineLevel, "CoarseMap");
381  if (fineLevel.GetLevelID() == 0 &&
382  fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
383  pL.get<bool>("tentative: build coarse coordinates")) { // and we want coordinates on other levels
384  bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
385  Input(fineLevel, "Coordinates");
386  } else if (bTransferCoordinates_) {
387  Input(fineLevel, "Coordinates");
388  }
389 }
390 
391 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
393  return BuildP(fineLevel, coarseLevel);
394 }
395 
396 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
398  FactoryMonitor m(*this, "Build", coarseLevel);
399 
400  typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
401  typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
402  const ParameterList& pL = GetParameterList();
403  std::string nspName = "Nullspace";
404  if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
405 
406  auto A = Get<RCP<Matrix>>(fineLevel, "A");
407  auto aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
408  auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel, "UnAmalgamationInfo");
409  auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
410  auto coarseMap = Get<RCP<const Map>>(fineLevel, "CoarseMap");
411  RCP<RealValuedMultiVector> fineCoords;
412  if (bTransferCoordinates_) {
413  fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel, "Coordinates");
414  }
415 
416  RCP<Matrix> Ptentative;
417  // No coarse DoFs so we need to bail by setting Ptentattive to null and returning
418  // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
419  if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
420  Ptentative = Teuchos::null;
421  Set(coarseLevel, "P", Ptentative);
422  return;
423  }
424  RCP<MultiVector> coarseNullspace;
425  RCP<RealValuedMultiVector> coarseCoords;
426 
427  if (bTransferCoordinates_) {
428  RCP<const Map> coarseCoordMap;
429 
430  LO blkSize = 1;
431  if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
432  blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
433 
434  if (blkSize == 1) {
435  // Scalar system
436  // No amalgamation required, we can use the coarseMap
437  coarseCoordMap = coarseMap;
438  } else {
439  // Vector system
440  AmalgamationFactory<SC, LO, GO, NO>::AmalgamateMap(rcp_dynamic_cast<const StridedMap>(coarseMap), coarseCoordMap);
441  }
442 
443  coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
444 
445  // Create overlapped fine coordinates to reduce global communication
446  auto uniqueMap = fineCoords->getMap();
447  RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
448  if (aggregates->AggregatesCrossProcessors()) {
449  auto nonUniqueMap = aggregates->GetMap();
450  auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
451 
452  ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
453  ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
454  }
455 
456  // The good new is that his graph has already been constructed for the
457  // TentativePFactory and was cached in Aggregates. So this is a no-op.
458  auto aggGraph = aggregates->GetGraph();
459  auto numAggs = aggGraph.numRows();
460 
461  auto fineCoordsView = fineCoords->getDeviceLocalView(Xpetra::Access::ReadOnly);
462  auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
463 
464  // Fill in coarse coordinates
465  {
466  SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
467 
468  const auto dim = fineCoords->getNumVectors();
469 
470  typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
471  for (size_t j = 0; j < dim; j++) {
472  Kokkos::parallel_for(
473  "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
474  KOKKOS_LAMBDA(const LO i) {
475  // A row in this graph represents all node ids in the aggregate
476  // Therefore, averaging is very easy
477 
478  auto aggregate = aggGraph.rowConst(i);
479 
480  coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
481  for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
482  sum += fineCoordsRandomView(aggregate(colID), j);
483 
484  coarseCoordsView(i, j) = sum / aggregate.length;
485  });
486  }
487  }
488  }
489 
490  if (!aggregates->AggregatesCrossProcessors()) {
492  BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
493  coarseLevel.GetLevelID());
494  } else {
495  BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
496  }
497  } else
498  BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
499 
500  // If available, use striding information of fine level matrix A for range
501  // map and coarseMap as domain map; otherwise use plain range map of
502  // Ptent = plain range map of A for range map and coarseMap as domain map.
503  // NOTE:
504  // The latter is not really safe, since there is no striding information
505  // for the range map. This is not really a problem, since striding
506  // information is always available on the intermedium levels and the
507  // coarsest levels.
508  if (A->IsView("stridedMaps") == true)
509  Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
510  else
511  Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
512 
513  if (bTransferCoordinates_) {
514  Set(coarseLevel, "Coordinates", coarseCoords);
515  }
516 
517  // FIXME: We should remove the NodeComm on levels past the threshold
518  if (fineLevel.IsAvailable("Node Comm")) {
519  RCP<const Teuchos::Comm<int>> nodeComm = Get<RCP<const Teuchos::Comm<int>>>(fineLevel, "Node Comm");
520  Set<RCP<const Teuchos::Comm<int>>>(coarseLevel, "Node Comm", nodeComm);
521  }
522 
523  Set(coarseLevel, "Nullspace", coarseNullspace);
524  Set(coarseLevel, "P", Ptentative);
525 
526  if (IsPrint(Statistics2)) {
527  RCP<ParameterList> params = rcp(new ParameterList());
528  params->set("printLoadBalancingInfo", true);
529  GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
530  }
531 }
532 
533 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
535  BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
536  RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
537  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
538  RCP<MultiVector>& coarseNullspace, const int levelID) const {
539  auto rowMap = A->getRowMap();
540  auto colMap = A->getColMap();
541 
542  const size_t numRows = rowMap->getLocalNumElements();
543  const size_t NSDim = fineNullspace->getNumVectors();
544 
545  typedef Kokkos::ArithTraits<SC> ATS;
546  using impl_SC = typename ATS::val_type;
547  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
548  const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
549 
550  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
551 
552  typename Aggregates::local_graph_type aggGraph;
553  {
554  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
555  aggGraph = aggregates->GetGraph();
556  }
557  auto aggRows = aggGraph.row_map;
558  auto aggCols = aggGraph.entries;
559 
560  // Aggregates map is based on the amalgamated column map
561  // We can skip global-to-local conversion if LIDs in row map are
562  // same as LIDs in column map
563  bool goodMap;
564  {
565  SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
566  goodMap = isGoodMap(*rowMap, *colMap);
567  }
568  // FIXME_KOKKOS: need to proofread later code for bad maps
570  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
571  "(i.e. \"matching\" row and column maps)");
572 
573  // STEP 1: do unamalgamation
574  // The non-kokkos version uses member functions from the AmalgamationInfo
575  // container class to unamalgamate the data. In contrast, the kokkos
576  // version of TentativePFactory does the unamalgamation here and only uses
577  // the data of the AmalgamationInfo container class
578 
579  // Extract information for unamalgamation
580  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
581  GO indexBase;
582  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
583  GO globalOffset = amalgInfo->GlobalOffset();
584 
585  // Extract aggregation info (already in Kokkos host views)
586  auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
587  auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
588  const size_t numAggregates = aggregates->GetNumAggregates();
589 
590  int myPID = aggregates->GetMap()->getComm()->getRank();
591 
592  // Create Kokkos::View (on the device) to store the aggreate dof sizes
593  // Later used to get aggregate dof offsets
594  // NOTE: This zeros itself on construction
595  typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
596  AggSizeType aggDofSizes;
597 
598  if (stridedBlockSize == 1) {
599  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
600 
601  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
602  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
603 
604  auto sizesConst = aggregates->ComputeAggregateSizes();
605  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
606 
607  } else {
608  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
609 
610  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
611  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
612 
613  auto nodeMap = aggregates->GetMap()->getLocalMap();
614  auto dofMap = colMap->getLocalMap();
615 
616  Kokkos::parallel_for(
617  "MueLu:TentativePF:Build:compute_agg_sizes", range_type(0, numAggregates),
618  KOKKOS_LAMBDA(const LO agg) {
619  auto aggRowView = aggGraph.rowConst(agg);
620 
621  size_t size = 0;
622  for (LO colID = 0; colID < aggRowView.length; colID++) {
623  GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
624 
625  for (LO k = 0; k < stridedBlockSize; k++) {
626  GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
627 
628  if (dofMap.getLocalElement(dofGID) != INVALID)
629  size++;
630  }
631  }
632  aggDofSizes(agg + 1) = size;
633  });
634  }
635 
636  // Find maximum dof size for aggregates
637  // Later used to reserve enough scratch space for local QR decompositions
638  LO maxAggSize = 0;
639  ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
640  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
641 
642  // parallel_scan (exclusive)
643  // The aggDofSizes View then contains the aggregate dof offsets
644  Kokkos::parallel_scan(
645  "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
646  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
647  update += aggDofSizes(i);
648  if (final_pass)
649  aggDofSizes(i) = update;
650  });
651 
652  // Create Kokkos::View on the device to store mapping
653  // between (local) aggregate id and row map ids (LIDs)
654  Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
655  {
656  SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
657 
658  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
659  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
660 
661  Kokkos::parallel_for(
662  "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
663  KOKKOS_LAMBDA(const LO lnode) {
664  if (procWinner(lnode, 0) == myPID) {
665  // No need for atomics, it's one-to-one
666  auto aggID = vertex2AggId(lnode, 0);
667 
668  auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
669  // FIXME: I think this may be wrong
670  // We unconditionally add the whole block here. When we calculated
671  // aggDofSizes, we did the isLocalElement check. Something's fishy.
672  for (LO k = 0; k < stridedBlockSize; k++)
673  agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
674  }
675  });
676  }
677 
678  // STEP 2: prepare local QR decomposition
679  // Reserve memory for tentative prolongation operator
680  coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
681 
682  // Pull out the nullspace vectors so that we can have random access (on the device)
683  auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
684  auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
685 
686  size_t nnz = 0; // actual number of nnz
687 
688  typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
689  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
690  typedef typename local_matrix_type::index_type::non_const_type cols_type;
691  typedef typename local_matrix_type::values_type::non_const_type vals_type;
692 
693  // Device View for status (error messages...)
694  typedef Kokkos::View<int[10], DeviceType> status_type;
695  status_type status("status");
696 
699 
700  const ParameterList& pL = GetParameterList();
701  const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
702  if (!doQRStep) {
703  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
704  if (NSDim > 1)
705  GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
706  }
707 
708  size_t nnzEstimate = numRows * NSDim;
709  rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows + 1);
710  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
711  vals_type valsAux("Ptent_aux_vals", nnzEstimate);
712  rows_type rows("Ptent_rows", numRows + 1);
713  {
714  // Stage 0: fill in views.
715  SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
716 
717  // The main thing to notice is initialization of vals with INVALID. These
718  // values will later be used to compress the arrays
719  Kokkos::parallel_for(
720  "MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows + 1),
721  KOKKOS_LAMBDA(const LO row) {
722  rowsAux(row) = row * NSDim;
723  });
724  Kokkos::parallel_for(
725  "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
726  KOKKOS_LAMBDA(const LO j) {
727  colsAux(j) = INVALID;
728  });
729  }
730 
731  if (NSDim == 1) {
732  // 1D is special, as it is the easiest. We don't even need to the QR,
733  // just normalize an array. Plus, no worries abot small aggregates. In
734  // addition, we do not worry about compression. It is unlikely that
735  // nullspace will have zeros. If it does, a prolongator row would be
736  // zero and we'll get singularity anyway.
737  SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
738 
739  // Set up team policy with numAggregates teams and one thread per team.
740  // Each team handles a slice of the data associated with one aggregate
741  // and performs a local QR decomposition (in this case real QR is
742  // unnecessary).
743  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
744 
745  if (doQRStep) {
746  Kokkos::parallel_for(
747  "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
748  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
749  auto agg = thread.league_rank();
750 
751  // size of the aggregate (number of DOFs in aggregate)
752  LO aggSize = aggRows(agg + 1) - aggRows(agg);
753 
754  // Extract the piece of the nullspace corresponding to the aggregate, and
755  // put it in the flat array, "localQR" (in column major format) for the
756  // QR routine. Trivial in 1D.
757  auto norm = impl_ATS::magnitude(zero);
758 
759  // Calculate QR by hand
760  // FIXME: shouldn't there be stridedblock here?
761  // FIXME_KOKKOS: shouldn't there be stridedblock here?
762  for (decltype(aggSize) k = 0; k < aggSize; k++) {
763  auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
764  norm += dnorm * dnorm;
765  }
766  norm = sqrt(norm);
767 
768  if (norm == zero) {
769  // zero column; terminate the execution
770  statusAtomic(1) = true;
771  return;
772  }
773 
774  // R = norm
775  coarseNS(agg, 0) = norm;
776 
777  // Q = localQR(:,0)/norm
778  for (decltype(aggSize) k = 0; k < aggSize; k++) {
779  LO localRow = agg2RowMapLO(aggRows(agg) + k);
780  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
781 
782  rows(localRow + 1) = 1;
783  colsAux(localRow) = agg;
784  valsAux(localRow) = localVal;
785  }
786  });
787 
788  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
789  Kokkos::deep_copy(statusHost, status);
790  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
791  if (statusHost(i)) {
792  std::ostringstream oss;
793  oss << "MueLu::TentativePFactory::MakeTentative: ";
794  switch (i) {
795  case 0: oss << "!goodMap is not implemented"; break;
796  case 1: oss << "fine level NS part has a zero column"; break;
797  }
798  throw Exceptions::RuntimeError(oss.str());
799  }
800 
801  } else {
802  Kokkos::parallel_for(
803  "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
804  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
805  auto agg = thread.league_rank();
806 
807  // size of the aggregate (number of DOFs in aggregate)
808  LO aggSize = aggRows(agg + 1) - aggRows(agg);
809 
810  // R = norm
811  coarseNS(agg, 0) = one;
812 
813  // Q = localQR(:,0)/norm
814  for (decltype(aggSize) k = 0; k < aggSize; k++) {
815  LO localRow = agg2RowMapLO(aggRows(agg) + k);
816  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
817 
818  rows(localRow + 1) = 1;
819  colsAux(localRow) = agg;
820  valsAux(localRow) = localVal;
821  }
822  });
823  }
824 
825  Kokkos::parallel_reduce(
826  "MueLu:TentativeP:CountNNZ", range_type(0, numRows + 1),
827  KOKKOS_LAMBDA(const LO i, size_t& nnz_count) {
828  nnz_count += rows(i);
829  },
830  nnz);
831 
832  } else { // NSdim > 1
833  // FIXME_KOKKOS: This code branch is completely unoptimized.
834  // Work to do:
835  // - Optimize QR decomposition
836  // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
837  // packing new values in the beginning of each row
838  // We do use auxilary view in this case, so keep a second rows view for
839  // counting nonzeros in rows
840 
841  {
842  SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
843  // Set up team policy with numAggregates teams and one thread per team.
844  // Each team handles a slice of the data associated with one aggregate
845  // and performs a local QR decomposition
846  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1); // numAggregates teams a 1 thread
847  LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
848  decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
849  decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
850  decltype(valsAux)>
851  localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
852  rows, rowsAux, colsAux, valsAux, doQRStep);
853  Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
854  }
855 
856  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
857  Kokkos::deep_copy(statusHost, status);
858  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
859  if (statusHost(i)) {
860  std::ostringstream oss;
861  oss << "MueLu::TentativePFactory::MakeTentative: ";
862  switch (i) {
863  case 0: oss << "!goodMap is not implemented"; break;
864  case 1: oss << "fine level NS part has a zero column"; break;
865  }
866  throw Exceptions::RuntimeError(oss.str());
867  }
868  }
869 
870  // Compress the cols and vals by ignoring INVALID column entries that correspond
871  // to 0 in QR.
872 
873  // The real cols and vals are constructed using calculated (not estimated) nnz
874  cols_type cols;
875  vals_type vals;
876 
877  if (nnz != nnzEstimate) {
878  {
879  // Stage 2: compress the arrays
880  SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
881 
882  Kokkos::parallel_scan(
883  "MueLu:TentativePF:Build:compress_rows", range_type(0, numRows + 1),
884  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
885  upd += rows(i);
886  if (final)
887  rows(i) = upd;
888  });
889  }
890 
891  {
892  SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
893 
894  cols = cols_type("Ptent_cols", nnz);
895  vals = vals_type("Ptent_vals", nnz);
896 
897  // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
898  // to the beginning of rows. See CoalesceDropFactory_kokkos for
899  // example.
900  Kokkos::parallel_for(
901  "MueLu:TentativePF:Build:compress_cols_vals", range_type(0, numRows),
902  KOKKOS_LAMBDA(const LO i) {
903  LO rowStart = rows(i);
904 
905  size_t lnnz = 0;
906  for (auto j = rowsAux(i); j < rowsAux(i + 1); j++)
907  if (colsAux(j) != INVALID) {
908  cols(rowStart + lnnz) = colsAux(j);
909  vals(rowStart + lnnz) = valsAux(j);
910  lnnz++;
911  }
912  });
913  }
914 
915  } else {
916  rows = rowsAux;
917  cols = colsAux;
918  vals = valsAux;
919  }
920 
921  GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
922 
923  {
924  // Stage 3: construct Xpetra::Matrix
925  SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
926 
927  local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
928 
929  // Managing labels & constants for ESFC
930  RCP<ParameterList> FCparams;
931  if (pL.isSublist("matrixmatrix: kernel params"))
932  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
933  else
934  FCparams = rcp(new ParameterList);
935 
936  // By default, we don't need global constants for TentativeP
937  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
938  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
939 
940  auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
941  Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
942  }
943 }
944 
945 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
948  RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
949  RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
950  RCP<MultiVector>& coarseNullspace, const int levelID) const {
951  /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
952  be generalized later, if we ever need to do so:
953  1) Null space dimension === block size of matrix: So no elasticity right now
954  2) QR is not supported: Under assumption #1, this shouldn't cause problems.
955  3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
956 
957  These assumptions keep our code way simpler and still support the use cases we actually care about.
958  */
959 
960  RCP<const Map> rowMap = A->getRowMap();
961  RCP<const Map> rangeMap = A->getRangeMap();
962  RCP<const Map> colMap = A->getColMap();
963  // const size_t numFinePointRows = rangeMap->getLocalNumElements();
964  const size_t numFineBlockRows = rowMap->getLocalNumElements();
965 
966  // typedef Teuchos::ScalarTraits<SC> STS;
967  // typedef typename STS::magnitudeType Magnitude;
968  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
969 
970  typedef Kokkos::ArithTraits<SC> ATS;
971  using impl_SC = typename ATS::val_type;
972  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
973  const impl_SC one = impl_ATS::one();
974 
975  // const GO numAggs = aggregates->GetNumAggregates();
976  const size_t NSDim = fineNullspace->getNumVectors();
977  auto aggSizes = aggregates->ComputeAggregateSizes();
978 
979  typename Aggregates::local_graph_type aggGraph;
980  {
981  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
982  aggGraph = aggregates->GetGraph();
983  }
984  auto aggRows = aggGraph.row_map;
985  auto aggCols = aggGraph.entries;
986 
987  // Need to generate the coarse block map
988  // NOTE: We assume NSDim == block size here
989  // NOTE: We also assume that coarseMap has contiguous GIDs
990  // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
991  const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
992  RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
994  numCoarseBlockRows,
995  coarsePointMap->getIndexBase(),
996  coarsePointMap->getComm());
997  // Sanity checking
998  const ParameterList& pL = GetParameterList();
999  // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1000 
1001  // The aggregates use the amalgamated column map, which in this case is what we want
1002 
1003  // Aggregates map is based on the amalgamated column map
1004  // We can skip global-to-local conversion if LIDs in row map are
1005  // same as LIDs in column map
1006  bool goodMap = MueLu::Utilities<SC, LO, GO, NO>::MapsAreNested(*rowMap, *colMap);
1008  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1009  "(i.e. \"matching\" row and column maps)");
1010 
1011  // STEP 1: do unamalgamation
1012  // The non-kokkos version uses member functions from the AmalgamationInfo
1013  // container class to unamalgamate the data. In contrast, the kokkos
1014  // version of TentativePFactory does the unamalgamation here and only uses
1015  // the data of the AmalgamationInfo container class
1016 
1017  // Extract information for unamalgamation
1018  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1019  GO indexBase;
1020  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1021  // GO globalOffset = amalgInfo->GlobalOffset();
1022 
1023  // Extract aggregation info (already in Kokkos host views)
1024  auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1025  auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1026  const size_t numAggregates = aggregates->GetNumAggregates();
1027 
1028  int myPID = aggregates->GetMap()->getComm()->getRank();
1029 
1030  // Create Kokkos::View (on the device) to store the aggreate dof sizes
1031  // Later used to get aggregate dof offsets
1032  // NOTE: This zeros itself on construction
1033  typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1034  AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1035 
1036  {
1037  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1038 
1039  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1040  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
1041 
1042  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1043  }
1044 
1045  // Find maximum dof size for aggregates
1046  // Later used to reserve enough scratch space for local QR decompositions
1047  LO maxAggSize = 0;
1048  ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1049  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1050 
1051  // parallel_scan (exclusive)
1052  // The aggDofSizes View then contains the aggregate dof offsets
1053  Kokkos::parallel_scan(
1054  "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
1055  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1056  update += aggDofSizes(i);
1057  if (final_pass)
1058  aggDofSizes(i) = update;
1059  });
1060 
1061  // Create Kokkos::View on the device to store mapping
1062  // between (local) aggregate id and row map ids (LIDs)
1063  Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1064  {
1065  SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1066 
1067  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1068  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1069 
1070  Kokkos::parallel_for(
1071  "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1072  KOKKOS_LAMBDA(const LO lnode) {
1073  if (procWinner(lnode, 0) == myPID) {
1074  // No need for atomics, it's one-to-one
1075  auto aggID = vertex2AggId(lnode, 0);
1076 
1077  auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1078  // FIXME: I think this may be wrong
1079  // We unconditionally add the whole block here. When we calculated
1080  // aggDofSizes, we did the isLocalElement check. Something's fishy.
1081  for (LO k = 0; k < stridedBlockSize; k++)
1082  aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1083  }
1084  });
1085  }
1086 
1087  // STEP 2: prepare local QR decomposition
1088  // Reserve memory for tentative prolongation operator
1089  coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1090 
1091  // Pull out the nullspace vectors so that we can have random access (on the device)
1092  auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
1093  auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1094 
1095  typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
1096  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1097  typedef typename local_matrix_type::index_type::non_const_type cols_type;
1098  // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1099 
1100  // Device View for status (error messages...)
1101  typedef Kokkos::View<int[10], DeviceType> status_type;
1102  status_type status("status");
1103 
1106 
1107  // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1108  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1109 
1110  // BlockCrs requires that we build the (block) graph first, so let's do that...
1111 
1112  // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1113  // block non-zero per row in the matrix;
1114  rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1115  cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1116 
1117  Kokkos::parallel_for(
1118  "MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1119  KOKKOS_LAMBDA(const LO j) {
1120  ia[j] = j;
1121  ja[j] = INVALID;
1122 
1123  if (j == (LO)numFineBlockRows - 1)
1124  ia[numFineBlockRows] = numFineBlockRows;
1125  });
1126 
1127  // Fill Graph
1128  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1129  Kokkos::parallel_for(
1130  "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1131  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1132  auto agg = thread.league_rank();
1133  Xpetra::global_size_t offset = agg;
1134 
1135  // size of the aggregate (number of DOFs in aggregate)
1136  LO aggSize = aggRows(agg + 1) - aggRows(agg);
1137 
1138  for (LO j = 0; j < aggSize; j++) {
1139  // FIXME: Allow for bad maps
1140  const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1141  const size_t rowStart = ia[localRow];
1142  ja[rowStart] = offset;
1143  }
1144  });
1145 
1146  // Compress storage (remove all INVALID, which happen when we skip zeros)
1147  // We do that in-place
1148  {
1149  // Stage 2: compress the arrays
1150  SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1151  // Fill i_temp with the correct row starts
1152  rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1153  LO nnz = 0;
1154  Kokkos::parallel_scan(
1155  "MueLu:TentativePF:BlockCrs:compress_rows", range_type(0, numFineBlockRows),
1156  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1157  if (final)
1158  i_temp[i] = upd;
1159  for (auto j = ia[i]; j < ia[i + 1]; j++)
1160  if (ja[j] != INVALID)
1161  upd++;
1162  if (final && i == (LO)numFineBlockRows - 1)
1163  i_temp[numFineBlockRows] = upd;
1164  },
1165  nnz);
1166 
1167  cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1168 
1169  Kokkos::parallel_for(
1170  "MueLu:TentativePF:BlockCrs:compress_cols", range_type(0, numFineBlockRows),
1171  KOKKOS_LAMBDA(const LO i) {
1172  size_t rowStart = i_temp[i];
1173  size_t lnnz = 0;
1174  for (auto j = ia[i]; j < ia[i + 1]; j++)
1175  if (ja[j] != INVALID) {
1176  j_temp[rowStart + lnnz] = ja[j];
1177  lnnz++;
1178  }
1179  });
1180 
1181  ia = i_temp;
1182  ja = j_temp;
1183  }
1184 
1185  RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1186 
1187  // Managing labels & constants for ESFC
1188  {
1189  RCP<ParameterList> FCparams;
1190  if (pL.isSublist("matrixmatrix: kernel params"))
1191  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1192  else
1193  FCparams = rcp(new ParameterList);
1194  // By default, we don't need global constants for TentativeP
1195  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
1196  std::string levelIDs = toString(levelID);
1197  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs);
1198  RCP<const Export> dummy_e;
1199  RCP<const Import> dummy_i;
1200  BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1201  }
1202 
1203  // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1204  // we clear them here
1205  ia = rows_type();
1206  ja = cols_type();
1207 
1208  // Now let's make a BlockCrs Matrix
1209  // NOTE: Assumes block size== NSDim
1210  RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1212  if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1213  RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1214 
1215  auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1216  const LO stride = NSDim * NSDim;
1217 
1218  Kokkos::parallel_for(
1219  "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1220  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1221  auto agg = thread.league_rank();
1222 
1223  // size of the aggregate (number of DOFs in aggregate)
1224  LO aggSize = aggRows(agg + 1) - aggRows(agg);
1225  Xpetra::global_size_t offset = agg * NSDim;
1226 
1227  // Q = localQR(:,0)/norm
1228  for (LO j = 0; j < aggSize; j++) {
1229  LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1230  LO rowStart = localBlockRow * stride;
1231  for (LO r = 0; r < (LO)NSDim; r++) {
1232  LO localPointRow = localBlockRow * NSDim + r;
1233  for (LO c = 0; c < (LO)NSDim; c++) {
1234  values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1235  }
1236  }
1237  }
1238 
1239  // R = norm
1240  for (LO j = 0; j < (LO)NSDim; j++)
1241  coarseNS(offset + j, j) = one;
1242  });
1243 
1244  Ptentative = P_wrap;
1245 }
1246 
1247 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1249  BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1250  RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1251  RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1252  RCP<MultiVector>& /* coarseNullspace */) const {
1253  throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1254 }
1255 
1256 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1258  isGoodMap(const Map& rowMap, const Map& colMap) const {
1259  auto rowLocalMap = rowMap.getLocalMap();
1260  auto colLocalMap = colMap.getLocalMap();
1261 
1262  const size_t numRows = rowLocalMap.getLocalNumElements();
1263  const size_t numCols = colLocalMap.getLocalNumElements();
1264 
1265  if (numCols < numRows)
1266  return false;
1267 
1268  size_t numDiff = 0;
1269  Kokkos::parallel_reduce(
1270  "MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1271  KOKKOS_LAMBDA(const LO i, size_t& diff) {
1272  diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1273  },
1274  numDiff);
1275 
1276  return (numDiff == 0);
1277 }
1278 
1279 } // namespace MueLu
1280 
1281 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1282 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
MueLu::DefaultLocalOrdinal LocalOrdinal
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
const RCP< LOVector > & GetProcWinner() const
Returns constant vector that maps local node IDs to owning processor IDs.
KOKKOS_INLINE_FUNCTION LO GetNumAggregates() const
void BuildP(Level &fineLevel, Level &coarseLevel) const
Abstract Build method.
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
const RCP< const Map > GetMap() const
returns (overlapping) map of aggregate/node distribution
LocalOrdinal LO
ParameterList & set(std::string const &name, T &&value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
static const NoFactory * get()
Print even more statistics.
#define SET_VALID_ENTRY(name)
bool isParameter(const std::string &name) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:63
bool isSublist(const std::string &name) const
void GetStridingInformation(LO &fullBlockSize, LO &blockID, LO &stridingOffset, LO &stridedBlockSize, GO &indexBase)
returns striding information
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
typename LWGraph_kokkos::local_graph_type local_graph_type
void Build(Level &fineLevel, Level &coarseLevel) const
Build an object with this factory.
static void AmalgamateMap(const Map &sourceMap, const Matrix &A, RCP< const Map > &amalgamatedMap, Array< LO > &translation)
Method to create merged map for systems of PDEs.
void DeclareInput(Level &fineLevel, Level &coarseLevel) const
Input.
const RCP< LOMultiVector > & GetVertex2AggId() const
Returns constant vector that maps local node IDs to local aggregates IDs.
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
size_t global_size_t
GO GlobalOffset()
returns offset of global dof ids
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
Scalar SC
ParameterList & sublist(const std::string &name, bool mustAlreadyExist=false, const std::string &docString="")
int GetLevelID() const
Return level number.
Definition: MueLu_Level.cpp:51
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)
bool isGoodMap(const Map &rowMap, const Map &colMap) const
maxAggDofSizeType maxAggDofSize
local_graph_type GetGraph() const
agg2RowMapLOType agg2RowMapLO
aggregates_sizes_type::const_type ComputeAggregateSizes(bool forceRecompute=false) const
Compute sizes of aggregates.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need&#39;s value has been saved.
bool is_null() const