MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
11 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
12 
13 #include "Kokkos_UnorderedMap.hpp"
15 
17 
18 #include "MueLu_Aggregates.hpp"
19 #include "MueLu_AmalgamationInfo.hpp"
20 #include "MueLu_AmalgamationFactory.hpp"
21 
22 #include "MueLu_MasterList.hpp"
23 #include "MueLu_PerfUtils.hpp"
24 #include "MueLu_Monitor.hpp"
25 
26 #include "Xpetra_IO.hpp"
27 
28 namespace MueLu {
29 
30 namespace { // anonymous
31 
32 template <class LocalOrdinal, class View>
33 class ReduceMaxFunctor {
34  public:
35  ReduceMaxFunctor(View view)
36  : view_(view) {}
37 
38  KOKKOS_INLINE_FUNCTION
39  void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const {
40  if (vmax < view_(i))
41  vmax = view_(i);
42  }
43 
44  KOKKOS_INLINE_FUNCTION
45  void join(LocalOrdinal& dst, const LocalOrdinal& src) const {
46  if (dst < src) {
47  dst = src;
48  }
49  }
50 
51  KOKKOS_INLINE_FUNCTION
52  void init(LocalOrdinal& dst) const {
53  dst = 0;
54  }
55 
56  private:
58 };
59 
60 // local QR decomposition
61 template <class LOType, class GOType, class SCType, class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
62 class LocalQRDecompFunctor {
63  private:
64  typedef LOType LO;
65  typedef GOType GO;
66  typedef SCType SC;
67 
68  typedef typename DeviceType::execution_space execution_space;
69 #if KOKKOS_VERSION >= 40799
70  typedef typename KokkosKernels::ArithTraits<SC>::val_type impl_SC;
71 #else
72  typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
73 #endif
74 #if KOKKOS_VERSION >= 40799
75  typedef KokkosKernels::ArithTraits<impl_SC> impl_ATS;
76 #else
77  typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
78 #endif
79  typedef typename impl_ATS::magnitudeType Magnitude;
80 
81  typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
82  typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
83 
84  private:
85  NspType fineNS;
86  NspType coarseNS;
87  aggRowsType aggRows;
88  maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
89  agg2RowMapLOType agg2RowMapLO;
90  statusType statusAtomic;
91  rowsType rows;
92  rowsAuxType rowsAux;
93  colsAuxType colsAux;
94  valsAuxType valsAux;
95  bool doQRStep;
96 
97  public:
98  LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_)
99  : fineNS(fineNS_)
100  , coarseNS(coarseNS_)
101  , aggRows(aggRows_)
102  , maxAggDofSize(maxAggDofSize_)
103  , agg2RowMapLO(agg2RowMapLO_)
104  , statusAtomic(statusAtomic_)
105  , rows(rows_)
106  , rowsAux(rowsAux_)
107  , colsAux(colsAux_)
108  , valsAux(valsAux_)
109  , doQRStep(doQRStep_) {}
110 
111  KOKKOS_INLINE_FUNCTION
112  void operator()(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread, size_t& nnz) const {
113  auto agg = thread.league_rank();
114 
115  // size of aggregate: number of DOFs in aggregate
116  auto aggSize = aggRows(agg + 1) - aggRows(agg);
117 
118  const impl_SC one = impl_ATS::one();
119  const impl_SC two = one + one;
120  const impl_SC zero = impl_ATS::zero();
121  const auto zeroM = impl_ATS::magnitude(zero);
122 
123  int m = aggSize;
124  int n = fineNS.extent(1);
125 
126  // calculate row offset for coarse nullspace
127  Xpetra::global_size_t offset = agg * n;
128 
129  if (doQRStep) {
130  // Extract the piece of the nullspace corresponding to the aggregate
131  shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
132  for (int j = 0; j < n; j++)
133  for (int k = 0; k < m; k++)
134  r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j);
135 #if 0
136  printf("A\n");
137  for (int i = 0; i < m; i++) {
138  for (int j = 0; j < n; j++)
139  printf(" %5.3lf ", r(i,j));
140  printf("\n");
141  }
142 #endif
143 
144  // Calculate QR decomposition (standard)
145  shared_matrix q(thread.team_shmem(), m, m); // Q
146  if (m >= n) {
147  bool isSingular = false;
148 
149  // Initialize Q^T
150  auto qt = q;
151  for (int i = 0; i < m; i++) {
152  for (int j = 0; j < m; j++)
153  qt(i, j) = zero;
154  qt(i, i) = one;
155  }
156 
157  for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
158  // FIXME_KOKKOS: use team
159  Magnitude s = zeroM, norm, norm_x;
160  for (int i = k + 1; i < m; i++)
161  s += pow(impl_ATS::magnitude(r(i, k)), 2);
162  norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
163 
164  if (norm == zero) {
165  isSingular = true;
166  break;
167  }
168 
169  r(k, k) -= norm * one;
170 
171  norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
172  if (norm_x == zeroM) {
173  // We have a single diagonal element in the column.
174  // No reflections required. Just need to restor r(k,k).
175  r(k, k) = norm * one;
176  continue;
177  }
178 
179  // FIXME_KOKKOS: use team
180  for (int i = k; i < m; i++)
181  r(i, k) /= norm_x;
182 
183  // Update R(k:m,k+1:n)
184  for (int j = k + 1; j < n; j++) {
185  // FIXME_KOKKOS: use team in the loops
186  impl_SC si = zero;
187  for (int i = k; i < m; i++)
188  si += r(i, k) * r(i, j);
189  for (int i = k; i < m; i++)
190  r(i, j) -= two * si * r(i, k);
191  }
192 
193  // Update Q^T (k:m,k:m)
194  for (int j = k; j < m; j++) {
195  // FIXME_KOKKOS: use team in the loops
196  impl_SC si = zero;
197  for (int i = k; i < m; i++)
198  si += r(i, k) * qt(i, j);
199  for (int i = k; i < m; i++)
200  qt(i, j) -= two * si * r(i, k);
201  }
202 
203  // Fix R(k:m,k)
204  r(k, k) = norm * one;
205  for (int i = k + 1; i < m; i++)
206  r(i, k) = zero;
207  }
208 
209 #if 0
210  // Q = (Q^T)^T
211  for (int i = 0; i < m; i++)
212  for (int j = 0; j < i; j++) {
213  impl_SC tmp = qt(i,j);
214  qt(i,j) = qt(j,i);
215  qt(j,i) = tmp;
216  }
217 #endif
218 
219  // Build coarse nullspace using the upper triangular part of R
220  for (int j = 0; j < n; j++)
221  for (int k = 0; k <= j; k++)
222  coarseNS(offset + k, j) = r(k, j);
223 
224  if (isSingular) {
225  statusAtomic(1) = true;
226  return;
227  }
228 
229  } else {
230  // Special handling for m < n (i.e. single node aggregates in structural mechanics)
231 
232  // The local QR decomposition is not possible in the "overconstrained"
233  // case (i.e. number of columns in qr > number of rowsAux), which
234  // corresponds to #DOFs in Aggregate < n. For usual problems this
235  // is only possible for single node aggregates in structural mechanics.
236  // (Similar problems may arise in discontinuous Galerkin problems...)
237  // We bypass the QR decomposition and use an identity block in the
238  // tentative prolongator for the single node aggregate and transfer the
239  // corresponding fine level null space information 1-to-1 to the coarse
240  // level null space part.
241 
242  // NOTE: The resulting tentative prolongation operator has
243  // (m*DofsPerNode-n) zero columns leading to a singular
244  // coarse level operator A. To deal with that one has the following
245  // options:
246  // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
247  // false) to add some identity block to the diagonal of the zero rowsAux
248  // in the coarse level operator A, such that standard level smoothers
249  // can be used again.
250  // - Use special (projection-based) level smoothers, which can deal
251  // with singular matrices (very application specific)
252  // - Adapt the code below to avoid zero columns. However, we do not
253  // support a variable number of DOFs per node in MueLu/Xpetra which
254  // makes the implementation really hard.
255  //
256  // FIXME: do we need to check for singularity here somehow? Zero
257  // columns would be easy but linear dependency would require proper QR.
258 
259  // R = extended (by adding identity rowsAux) qr
260  for (int j = 0; j < n; j++)
261  for (int k = 0; k < n; k++)
262  if (k < m)
263  coarseNS(offset + k, j) = r(k, j);
264  else
265  coarseNS(offset + k, j) = (k == j ? one : zero);
266 
267  // Q = I (rectangular)
268  for (int i = 0; i < m; i++)
269  for (int j = 0; j < n; j++)
270  q(i, j) = (j == i ? one : zero);
271  }
272 
273  // Process each row in the local Q factor and fill helper arrays to assemble P
274  for (int j = 0; j < m; j++) {
275  LO localRow = agg2RowMapLO(aggRows(agg) + j);
276  size_t rowStart = rowsAux(localRow);
277  size_t lnnz = 0;
278  for (int k = 0; k < n; k++) {
279  // skip zeros
280  if (q(j, k) != zero) {
281  colsAux(rowStart + lnnz) = offset + k;
282  valsAux(rowStart + lnnz) = q(j, k);
283  lnnz++;
284  }
285  }
286  rows(localRow + 1) = lnnz;
287  nnz += lnnz;
288  }
289 
290 #if 0
291  printf("R\n");
292  for (int i = 0; i < m; i++) {
293  for (int j = 0; j < n; j++)
294  printf(" %5.3lf ", coarseNS(i,j));
295  printf("\n");
296  }
297 
298  printf("Q\n");
299  for (int i = 0; i < aggSize; i++) {
300  for (int j = 0; j < aggSize; j++)
301  printf(" %5.3lf ", q(i,j));
302  printf("\n");
303  }
304 #endif
305  } else {
307  // "no-QR" option //
309  // Local Q factor is just the fine nullspace support over the current aggregate.
310  // Local R factor is the identity.
311  // TODO I have not implemented any special handling for aggregates that are too
312  // TODO small to locally support the nullspace, as is done in the standard QR
313  // TODO case above.
314 
315  for (int j = 0; j < m; j++) {
316  LO localRow = agg2RowMapLO(aggRows(agg) + j);
317  size_t rowStart = rowsAux(localRow);
318  size_t lnnz = 0;
319  for (int k = 0; k < n; k++) {
320  const impl_SC qr_jk = fineNS(localRow, k);
321  // skip zeros
322  if (qr_jk != zero) {
323  colsAux(rowStart + lnnz) = offset + k;
324  valsAux(rowStart + lnnz) = qr_jk;
325  lnnz++;
326  }
327  }
328  rows(localRow + 1) = lnnz;
329  nnz += lnnz;
330  }
331 
332  for (int j = 0; j < n; j++)
333  coarseNS(offset + j, j) = one;
334  }
335  }
336 
337  // amount of shared memory
338  size_t team_shmem_size(int /* team_size */) const {
339  if (doQRStep) {
340  int m = maxAggDofSize;
341  int n = fineNS.extent(1);
342  return shared_matrix::shmem_size(m, n) + // r
343  shared_matrix::shmem_size(m, m); // q
344  } else
345  return 0;
346  }
347 };
348 
349 } // namespace
350 
351 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
353  RCP<ParameterList> validParamList = rcp(new ParameterList());
354 
355 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
356  SET_VALID_ENTRY("tentative: calculate qr");
357  SET_VALID_ENTRY("tentative: build coarse coordinates");
358 #undef SET_VALID_ENTRY
359 
360  validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
361  validParamList->set<RCP<const FactoryBase>>("Aggregates", Teuchos::null, "Generating factory of the aggregates");
362  validParamList->set<RCP<const FactoryBase>>("Nullspace", Teuchos::null, "Generating factory of the nullspace");
363  validParamList->set<RCP<const FactoryBase>>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
364  validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
365  validParamList->set<RCP<const FactoryBase>>("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
366  validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory of the coordinates");
367  validParamList->set<RCP<const FactoryBase>>("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
368 
369  // Make sure we don't recursively validate options for the matrixmatrix kernels
370  ParameterList norecurse;
371  norecurse.disableRecursiveValidation();
372  validParamList->set<ParameterList>("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
373 
374  return validParamList;
375 }
376 
377 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
379  const ParameterList& pL = GetParameterList();
380  // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
381  std::string nspName = "Nullspace";
382  if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
383 
384  Input(fineLevel, "A");
385  Input(fineLevel, "Aggregates");
386  Input(fineLevel, nspName);
387  Input(fineLevel, "UnAmalgamationInfo");
388  Input(fineLevel, "CoarseMap");
389  if (fineLevel.GetLevelID() == 0 &&
390  fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
391  pL.get<bool>("tentative: build coarse coordinates")) { // and we want coordinates on other levels
392  bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
393  Input(fineLevel, "Coordinates");
394  } else if (bTransferCoordinates_) {
395  Input(fineLevel, "Coordinates");
396  }
397 }
398 
399 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
401  return BuildP(fineLevel, coarseLevel);
402 }
403 
404 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
406  FactoryMonitor m(*this, "Build", coarseLevel);
407 
408  typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
409  typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
410  const ParameterList& pL = GetParameterList();
411  std::string nspName = "Nullspace";
412  if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
413 
414  auto A = Get<RCP<Matrix>>(fineLevel, "A");
415  auto aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
416  auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel, "UnAmalgamationInfo");
417  auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
418  auto coarseMap = Get<RCP<const Map>>(fineLevel, "CoarseMap");
419  RCP<RealValuedMultiVector> fineCoords;
420  if (bTransferCoordinates_) {
421  fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel, "Coordinates");
422  }
423 
424  RCP<Matrix> Ptentative;
425  // No coarse DoFs so we need to bail by setting Ptentattive to null and returning
426  // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
427  if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
428  Ptentative = Teuchos::null;
429  Set(coarseLevel, "P", Ptentative);
430  return;
431  }
432  RCP<MultiVector> coarseNullspace;
433  RCP<RealValuedMultiVector> coarseCoords;
434 
435  if (bTransferCoordinates_) {
436  RCP<const Map> coarseCoordMap;
437 
438  LO blkSize = 1;
439  if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
440  blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
441 
442  if (blkSize == 1) {
443  // Scalar system
444  // No amalgamation required, we can use the coarseMap
445  coarseCoordMap = coarseMap;
446  } else {
447  // Vector system
448  AmalgamationFactory<SC, LO, GO, NO>::AmalgamateMap(rcp_dynamic_cast<const StridedMap>(coarseMap), coarseCoordMap);
449  }
450 
451  coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
452 
453  // Create overlapped fine coordinates to reduce global communication
454  auto uniqueMap = fineCoords->getMap();
455  RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
456  if (aggregates->AggregatesCrossProcessors()) {
457  auto nonUniqueMap = aggregates->GetMap();
458  auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
459 
460  ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
461  ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
462  }
463 
464  // The good new is that his graph has already been constructed for the
465  // TentativePFactory and was cached in Aggregates. So this is a no-op.
466  auto aggGraph = aggregates->GetGraph();
467  auto numAggs = aggGraph.numRows();
468 
469  auto fineCoordsView = fineCoords->getLocalViewDevice(Xpetra::Access::ReadOnly);
470  auto coarseCoordsView = coarseCoords->getLocalViewDevice(Xpetra::Access::OverwriteAll);
471 
472  // Fill in coarse coordinates
473  {
474  SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
475 
476  const auto dim = fineCoords->getNumVectors();
477 
478  typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
479  for (size_t j = 0; j < dim; j++) {
480  Kokkos::parallel_for(
481  "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
482  KOKKOS_LAMBDA(const LO i) {
483  // A row in this graph represents all node ids in the aggregate
484  // Therefore, averaging is very easy
485 
486  auto aggregate = aggGraph.rowConst(i);
487 
488  coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
489  for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
490  sum += fineCoordsRandomView(aggregate(colID), j);
491 
492  coarseCoordsView(i, j) = sum / aggregate.length;
493  });
494  }
495  }
496  }
497 
498  if (!aggregates->AggregatesCrossProcessors()) {
500  BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
501  coarseLevel.GetLevelID());
502  } else {
503  BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
504  }
505  } else
506  BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
507 
508  // If available, use striding information of fine level matrix A for range
509  // map and coarseMap as domain map; otherwise use plain range map of
510  // Ptent = plain range map of A for range map and coarseMap as domain map.
511  // NOTE:
512  // The latter is not really safe, since there is no striding information
513  // for the range map. This is not really a problem, since striding
514  // information is always available on the intermedium levels and the
515  // coarsest levels.
516  if (A->IsView("stridedMaps") == true)
517  Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
518  else
519  Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
520 
521  if (bTransferCoordinates_) {
522  Set(coarseLevel, "Coordinates", coarseCoords);
523  }
524 
525  // FIXME: We should remove the NodeComm on levels past the threshold
526  if (fineLevel.IsAvailable("Node Comm")) {
527  RCP<const Teuchos::Comm<int>> nodeComm = Get<RCP<const Teuchos::Comm<int>>>(fineLevel, "Node Comm");
528  Set<RCP<const Teuchos::Comm<int>>>(coarseLevel, "Node Comm", nodeComm);
529  }
530 
531  Set(coarseLevel, "Nullspace", coarseNullspace);
532  Set(coarseLevel, "P", Ptentative);
533 
534  if (IsPrint(Statistics2)) {
535  RCP<ParameterList> params = rcp(new ParameterList());
536  params->set("printLoadBalancingInfo", true);
537  GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
538  }
539 }
540 
541 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
543  BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
544  RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
545  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
546  RCP<MultiVector>& coarseNullspace, const int levelID) const {
547  auto rowMap = A->getRowMap();
548  auto colMap = A->getColMap();
549 
550  const size_t numRows = rowMap->getLocalNumElements();
551  const size_t NSDim = fineNullspace->getNumVectors();
552 
553 #if KOKKOS_VERSION >= 40799
554  typedef KokkosKernels::ArithTraits<SC> ATS;
555 #else
556  typedef Kokkos::ArithTraits<SC> ATS;
557 #endif
558  using impl_SC = typename ATS::val_type;
559 #if KOKKOS_VERSION >= 40799
560  using impl_ATS = KokkosKernels::ArithTraits<impl_SC>;
561 #else
562  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
563 #endif
564  const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
565 
566  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
567 
568  typename Aggregates::local_graph_type aggGraph;
569  {
570  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
571  aggGraph = aggregates->GetGraph();
572  }
573  auto aggRows = aggGraph.row_map;
574  auto aggCols = aggGraph.entries;
575 
576  // Aggregates map is based on the amalgamated column map
577  // We can skip global-to-local conversion if LIDs in row map are
578  // same as LIDs in column map
579  bool goodMap;
580  {
581  SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
582  goodMap = isGoodMap(*rowMap, *colMap);
583  }
584  // FIXME_KOKKOS: need to proofread later code for bad maps
586  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
587  "(i.e. \"matching\" row and column maps)");
588 
589  // STEP 1: do unamalgamation
590  // The non-kokkos version uses member functions from the AmalgamationInfo
591  // container class to unamalgamate the data. In contrast, the kokkos
592  // version of TentativePFactory does the unamalgamation here and only uses
593  // the data of the AmalgamationInfo container class
594 
595  // Extract information for unamalgamation
596  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
597  GO indexBase;
598  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
599  GO globalOffset = amalgInfo->GlobalOffset();
600 
601  // Extract aggregation info (already in Kokkos host views)
602  auto procWinner = aggregates->GetProcWinner()->getLocalViewDevice(Xpetra::Access::ReadOnly);
603  auto vertex2AggId = aggregates->GetVertex2AggId()->getLocalViewDevice(Xpetra::Access::ReadOnly);
604  const size_t numAggregates = aggregates->GetNumAggregates();
605 
606  int myPID = aggregates->GetMap()->getComm()->getRank();
607 
608  // Create Kokkos::View (on the device) to store the aggreate dof sizes
609  // Later used to get aggregate dof offsets
610  // NOTE: This zeros itself on construction
611  typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
612  AggSizeType aggDofSizes;
613 
614  if (stridedBlockSize == 1) {
615  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
616 
617  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
618  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
619 
620  auto sizesConst = aggregates->ComputeAggregateSizes();
621  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
622 
623  } else {
624  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
625 
626  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
627  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
628 
629  auto nodeMap = aggregates->GetMap()->getLocalMap();
630  auto dofMap = colMap->getLocalMap();
631 
632  Kokkos::parallel_for(
633  "MueLu:TentativePF:Build:compute_agg_sizes", range_type(0, numAggregates),
634  KOKKOS_LAMBDA(const LO agg) {
635  auto aggRowView = aggGraph.rowConst(agg);
636 
637  size_t size = 0;
638  for (LO colID = 0; colID < aggRowView.length; colID++) {
639  GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
640 
641  for (LO k = 0; k < stridedBlockSize; k++) {
642  GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
643 
644  if (dofMap.getLocalElement(dofGID) != INVALID)
645  size++;
646  }
647  }
648  aggDofSizes(agg + 1) = size;
649  });
650  }
651 
652  // Find maximum dof size for aggregates
653  // Later used to reserve enough scratch space for local QR decompositions
654  LO maxAggSize = 0;
655  ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
656  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
657 
658  // parallel_scan (exclusive)
659  // The aggDofSizes View then contains the aggregate dof offsets
660  Kokkos::parallel_scan(
661  "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
662  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
663  update += aggDofSizes(i);
664  if (final_pass)
665  aggDofSizes(i) = update;
666  });
667 
668  // Create Kokkos::View on the device to store mapping
669  // between (local) aggregate id and row map ids (LIDs)
670  Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
671  {
672  SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
673 
674  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
675  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
676 
677  Kokkos::parallel_for(
678  "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
679  KOKKOS_LAMBDA(const LO lnode) {
680  if (procWinner(lnode, 0) == myPID) {
681  // No need for atomics, it's one-to-one
682  auto aggID = vertex2AggId(lnode, 0);
683 
684  auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
685  // FIXME: I think this may be wrong
686  // We unconditionally add the whole block here. When we calculated
687  // aggDofSizes, we did the isLocalElement check. Something's fishy.
688  for (LO k = 0; k < stridedBlockSize; k++)
689  agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
690  }
691  });
692  }
693 
694  // STEP 2: prepare local QR decomposition
695  // Reserve memory for tentative prolongation operator
696  coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
697 
698  // Pull out the nullspace vectors so that we can have random access (on the device)
699  auto fineNS = fineNullspace->getLocalViewDevice(Xpetra::Access::ReadWrite);
700  auto coarseNS = coarseNullspace->getLocalViewDevice(Xpetra::Access::OverwriteAll);
701 
702  size_t nnz = 0; // actual number of nnz
703 
704  typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
705  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
706  typedef typename local_matrix_type::index_type::non_const_type cols_type;
707  typedef typename local_matrix_type::values_type::non_const_type vals_type;
708 
709  // Device View for status (error messages...)
710  typedef Kokkos::View<int[10], DeviceType> status_type;
711  status_type status("status");
712 
715 
716  const ParameterList& pL = GetParameterList();
717  const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
718  if (!doQRStep) {
719  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
720  if (NSDim > 1)
721  GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
722  }
723 
724  size_t nnzEstimate = numRows * NSDim;
725  rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows + 1);
726  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
727  vals_type valsAux("Ptent_aux_vals", nnzEstimate);
728  rows_type rows("Ptent_rows", numRows + 1);
729  {
730  // Stage 0: fill in views.
731  SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
732 
733  // The main thing to notice is initialization of vals with INVALID. These
734  // values will later be used to compress the arrays
735  Kokkos::parallel_for(
736  "MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows + 1),
737  KOKKOS_LAMBDA(const LO row) {
738  rowsAux(row) = row * NSDim;
739  });
740  Kokkos::parallel_for(
741  "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
742  KOKKOS_LAMBDA(const LO j) {
743  colsAux(j) = INVALID;
744  });
745  }
746 
747  if (NSDim == 1) {
748  // 1D is special, as it is the easiest. We don't even need to the QR,
749  // just normalize an array. Plus, no worries abot small aggregates. In
750  // addition, we do not worry about compression. It is unlikely that
751  // nullspace will have zeros. If it does, a prolongator row would be
752  // zero and we'll get singularity anyway.
753  SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
754 
755  // Set up team policy with numAggregates teams and one thread per team.
756  // Each team handles a slice of the data associated with one aggregate
757  // and performs a local QR decomposition (in this case real QR is
758  // unnecessary).
759  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
760 
761  if (doQRStep) {
762  Kokkos::parallel_for(
763  "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
764  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
765  auto agg = thread.league_rank();
766 
767  // size of the aggregate (number of DOFs in aggregate)
768  LO aggSize = aggRows(agg + 1) - aggRows(agg);
769 
770  // Extract the piece of the nullspace corresponding to the aggregate, and
771  // put it in the flat array, "localQR" (in column major format) for the
772  // QR routine. Trivial in 1D.
773  auto norm = impl_ATS::magnitude(zero);
774 
775  // Calculate QR by hand
776  // FIXME: shouldn't there be stridedblock here?
777  // FIXME_KOKKOS: shouldn't there be stridedblock here?
778  for (decltype(aggSize) k = 0; k < aggSize; k++) {
779  auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
780  norm += dnorm * dnorm;
781  }
782  norm = sqrt(norm);
783 
784  if (norm == zero) {
785  // zero column; terminate the execution
786  statusAtomic(1) = true;
787  return;
788  }
789 
790  // R = norm
791  coarseNS(agg, 0) = norm;
792 
793  // Q = localQR(:,0)/norm
794  for (decltype(aggSize) k = 0; k < aggSize; k++) {
795  LO localRow = agg2RowMapLO(aggRows(agg) + k);
796  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
797 
798  rows(localRow + 1) = 1;
799  colsAux(localRow) = agg;
800  valsAux(localRow) = localVal;
801  }
802  });
803 
804  typename status_type::host_mirror_type statusHost = Kokkos::create_mirror_view(status);
805  Kokkos::deep_copy(statusHost, status);
806  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
807  if (statusHost(i)) {
808  std::ostringstream oss;
809  oss << "MueLu::TentativePFactory::MakeTentative: ";
810  switch (i) {
811  case 0: oss << "!goodMap is not implemented"; break;
812  case 1: oss << "fine level NS part has a zero column"; break;
813  }
814  throw Exceptions::RuntimeError(oss.str());
815  }
816 
817  } else {
818  Kokkos::parallel_for(
819  "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
820  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
821  auto agg = thread.league_rank();
822 
823  // size of the aggregate (number of DOFs in aggregate)
824  LO aggSize = aggRows(agg + 1) - aggRows(agg);
825 
826  // R = norm
827  coarseNS(agg, 0) = one;
828 
829  // Q = localQR(:,0)/norm
830  for (decltype(aggSize) k = 0; k < aggSize; k++) {
831  LO localRow = agg2RowMapLO(aggRows(agg) + k);
832  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
833 
834  rows(localRow + 1) = 1;
835  colsAux(localRow) = agg;
836  valsAux(localRow) = localVal;
837  }
838  });
839  }
840 
841  Kokkos::parallel_reduce(
842  "MueLu:TentativeP:CountNNZ", range_type(0, numRows + 1),
843  KOKKOS_LAMBDA(const LO i, size_t& nnz_count) {
844  nnz_count += rows(i);
845  },
846  nnz);
847 
848  } else { // NSdim > 1
849  // FIXME_KOKKOS: This code branch is completely unoptimized.
850  // Work to do:
851  // - Optimize QR decomposition
852  // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
853  // packing new values in the beginning of each row
854  // We do use auxilary view in this case, so keep a second rows view for
855  // counting nonzeros in rows
856 
857  {
858  SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
859  // Set up team policy with numAggregates teams and one thread per team.
860  // Each team handles a slice of the data associated with one aggregate
861  // and performs a local QR decomposition
862  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1); // numAggregates teams a 1 thread
863  LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
864  decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
865  decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
866  decltype(valsAux)>
867  localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
868  rows, rowsAux, colsAux, valsAux, doQRStep);
869  Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
870  }
871 
872  typename status_type::host_mirror_type statusHost = Kokkos::create_mirror_view(status);
873  Kokkos::deep_copy(statusHost, status);
874  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
875  if (statusHost(i)) {
876  std::ostringstream oss;
877  oss << "MueLu::TentativePFactory::MakeTentative: ";
878  switch (i) {
879  case 0: oss << "!goodMap is not implemented"; break;
880  case 1: oss << "fine level NS part has a zero column"; break;
881  }
882  throw Exceptions::RuntimeError(oss.str());
883  }
884  }
885 
886  // Compress the cols and vals by ignoring INVALID column entries that correspond
887  // to 0 in QR.
888 
889  // The real cols and vals are constructed using calculated (not estimated) nnz
890  cols_type cols;
891  vals_type vals;
892 
893  if (nnz != nnzEstimate) {
894  {
895  // Stage 2: compress the arrays
896  SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
897 
898  Kokkos::parallel_scan(
899  "MueLu:TentativePF:Build:compress_rows", range_type(0, numRows + 1),
900  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
901  upd += rows(i);
902  if (final)
903  rows(i) = upd;
904  });
905  }
906 
907  {
908  SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
909 
910  cols = cols_type("Ptent_cols", nnz);
911  vals = vals_type("Ptent_vals", nnz);
912 
913  // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
914  // to the beginning of rows. See CoalesceDropFactory_kokkos for
915  // example.
916  Kokkos::parallel_for(
917  "MueLu:TentativePF:Build:compress_cols_vals", range_type(0, numRows),
918  KOKKOS_LAMBDA(const LO i) {
919  LO rowStart = rows(i);
920 
921  size_t lnnz = 0;
922  for (auto j = rowsAux(i); j < rowsAux(i + 1); j++)
923  if (colsAux(j) != INVALID) {
924  cols(rowStart + lnnz) = colsAux(j);
925  vals(rowStart + lnnz) = valsAux(j);
926  lnnz++;
927  }
928  });
929  }
930 
931  } else {
932  rows = rowsAux;
933  cols = colsAux;
934  vals = valsAux;
935  }
936 
937  GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
938 
939  {
940  // Stage 3: construct Xpetra::Matrix
941  SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
942 
943  local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
944 
945  // Managing labels & constants for ESFC
946  RCP<ParameterList> FCparams;
947  if (pL.isSublist("matrixmatrix: kernel params"))
948  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
949  else
950  FCparams = rcp(new ParameterList);
951 
952  // By default, we don't need global constants for TentativeP
953  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
954  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
955 
956  auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
957  Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
958  }
959 }
960 
961 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
964  RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
965  RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
966  RCP<MultiVector>& coarseNullspace, const int levelID) const {
967  /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
968  be generalized later, if we ever need to do so:
969  1) Null space dimension === block size of matrix: So no elasticity right now
970  2) QR is not supported: Under assumption #1, this shouldn't cause problems.
971  3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
972 
973  These assumptions keep our code way simpler and still support the use cases we actually care about.
974  */
975 
976  RCP<const Map> rowMap = A->getRowMap();
977  RCP<const Map> rangeMap = A->getRangeMap();
978  RCP<const Map> colMap = A->getColMap();
979  // const size_t numFinePointRows = rangeMap->getLocalNumElements();
980  const size_t numFineBlockRows = rowMap->getLocalNumElements();
981 
982  // typedef Teuchos::ScalarTraits<SC> STS;
983  // typedef typename STS::magnitudeType Magnitude;
984  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
985 
986 #if KOKKOS_VERSION >= 40799
987  typedef KokkosKernels::ArithTraits<SC> ATS;
988 #else
989  typedef Kokkos::ArithTraits<SC> ATS;
990 #endif
991  using impl_SC = typename ATS::val_type;
992 #if KOKKOS_VERSION >= 40799
993  using impl_ATS = KokkosKernels::ArithTraits<impl_SC>;
994 #else
995  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
996 #endif
997  const impl_SC one = impl_ATS::one();
998 
999  // const GO numAggs = aggregates->GetNumAggregates();
1000  const size_t NSDim = fineNullspace->getNumVectors();
1001  auto aggSizes = aggregates->ComputeAggregateSizes();
1002 
1003  typename Aggregates::local_graph_type aggGraph;
1004  {
1005  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
1006  aggGraph = aggregates->GetGraph();
1007  }
1008  auto aggRows = aggGraph.row_map;
1009  auto aggCols = aggGraph.entries;
1010 
1011  // Need to generate the coarse block map
1012  // NOTE: We assume NSDim == block size here
1013  // NOTE: We also assume that coarseMap has contiguous GIDs
1014  // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1015  const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1016  RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1018  numCoarseBlockRows,
1019  coarsePointMap->getIndexBase(),
1020  coarsePointMap->getComm());
1021  // Sanity checking
1022  const ParameterList& pL = GetParameterList();
1023  // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1024 
1025  // The aggregates use the amalgamated column map, which in this case is what we want
1026 
1027  // Aggregates map is based on the amalgamated column map
1028  // We can skip global-to-local conversion if LIDs in row map are
1029  // same as LIDs in column map
1030  bool goodMap = MueLu::Utilities<SC, LO, GO, NO>::MapsAreNested(*rowMap, *colMap);
1032  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1033  "(i.e. \"matching\" row and column maps)");
1034 
1035  // STEP 1: do unamalgamation
1036  // The non-kokkos version uses member functions from the AmalgamationInfo
1037  // container class to unamalgamate the data. In contrast, the kokkos
1038  // version of TentativePFactory does the unamalgamation here and only uses
1039  // the data of the AmalgamationInfo container class
1040 
1041  // Extract information for unamalgamation
1042  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1043  GO indexBase;
1044  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1045  // GO globalOffset = amalgInfo->GlobalOffset();
1046 
1047  // Extract aggregation info (already in Kokkos host views)
1048  auto procWinner = aggregates->GetProcWinner()->getLocalViewDevice(Xpetra::Access::ReadOnly);
1049  auto vertex2AggId = aggregates->GetVertex2AggId()->getLocalViewDevice(Xpetra::Access::ReadOnly);
1050  const size_t numAggregates = aggregates->GetNumAggregates();
1051 
1052  int myPID = aggregates->GetMap()->getComm()->getRank();
1053 
1054  // Create Kokkos::View (on the device) to store the aggreate dof sizes
1055  // Later used to get aggregate dof offsets
1056  // NOTE: This zeros itself on construction
1057  typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1058  AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1059 
1060  {
1061  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1062 
1063  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1064  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
1065 
1066  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1067  }
1068 
1069  // Find maximum dof size for aggregates
1070  // Later used to reserve enough scratch space for local QR decompositions
1071  LO maxAggSize = 0;
1072  ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1073  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1074 
1075  // parallel_scan (exclusive)
1076  // The aggDofSizes View then contains the aggregate dof offsets
1077  Kokkos::parallel_scan(
1078  "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
1079  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1080  update += aggDofSizes(i);
1081  if (final_pass)
1082  aggDofSizes(i) = update;
1083  });
1084 
1085  // Create Kokkos::View on the device to store mapping
1086  // between (local) aggregate id and row map ids (LIDs)
1087  Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1088  {
1089  SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1090 
1091  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1092  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1093 
1094  Kokkos::parallel_for(
1095  "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1096  KOKKOS_LAMBDA(const LO lnode) {
1097  if (procWinner(lnode, 0) == myPID) {
1098  // No need for atomics, it's one-to-one
1099  auto aggID = vertex2AggId(lnode, 0);
1100 
1101  auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1102  // FIXME: I think this may be wrong
1103  // We unconditionally add the whole block here. When we calculated
1104  // aggDofSizes, we did the isLocalElement check. Something's fishy.
1105  for (LO k = 0; k < stridedBlockSize; k++)
1106  aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1107  }
1108  });
1109  }
1110 
1111  // STEP 2: prepare local QR decomposition
1112  // Reserve memory for tentative prolongation operator
1113  coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1114 
1115  // Pull out the nullspace vectors so that we can have random access (on the device)
1116  auto fineNS = fineNullspace->getLocalViewDevice(Xpetra::Access::ReadWrite);
1117  auto coarseNS = coarseNullspace->getLocalViewDevice(Xpetra::Access::OverwriteAll);
1118 
1119  typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
1120  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1121  typedef typename local_matrix_type::index_type::non_const_type cols_type;
1122  // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1123 
1124  // Device View for status (error messages...)
1125  typedef Kokkos::View<int[10], DeviceType> status_type;
1126  status_type status("status");
1127 
1130 
1131  // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1132  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1133 
1134  // BlockCrs requires that we build the (block) graph first, so let's do that...
1135 
1136  // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1137  // block non-zero per row in the matrix;
1138  rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1139  cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1140 
1141  Kokkos::parallel_for(
1142  "MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1143  KOKKOS_LAMBDA(const LO j) {
1144  ia[j] = j;
1145  ja[j] = INVALID;
1146 
1147  if (j == (LO)numFineBlockRows - 1)
1148  ia[numFineBlockRows] = numFineBlockRows;
1149  });
1150 
1151  // Fill Graph
1152  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1153  Kokkos::parallel_for(
1154  "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1155  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1156  auto agg = thread.league_rank();
1157  Xpetra::global_size_t offset = agg;
1158 
1159  // size of the aggregate (number of DOFs in aggregate)
1160  LO aggSize = aggRows(agg + 1) - aggRows(agg);
1161 
1162  for (LO j = 0; j < aggSize; j++) {
1163  // FIXME: Allow for bad maps
1164  const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1165  const size_t rowStart = ia[localRow];
1166  ja[rowStart] = offset;
1167  }
1168  });
1169 
1170  // Compress storage (remove all INVALID, which happen when we skip zeros)
1171  // We do that in-place
1172  {
1173  // Stage 2: compress the arrays
1174  SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1175  // Fill i_temp with the correct row starts
1176  rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1177  LO nnz = 0;
1178  Kokkos::parallel_scan(
1179  "MueLu:TentativePF:BlockCrs:compress_rows", range_type(0, numFineBlockRows),
1180  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1181  if (final)
1182  i_temp[i] = upd;
1183  for (auto j = ia[i]; j < ia[i + 1]; j++)
1184  if (ja[j] != INVALID)
1185  upd++;
1186  if (final && i == (LO)numFineBlockRows - 1)
1187  i_temp[numFineBlockRows] = upd;
1188  },
1189  nnz);
1190 
1191  cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1192 
1193  Kokkos::parallel_for(
1194  "MueLu:TentativePF:BlockCrs:compress_cols", range_type(0, numFineBlockRows),
1195  KOKKOS_LAMBDA(const LO i) {
1196  size_t rowStart = i_temp[i];
1197  size_t lnnz = 0;
1198  for (auto j = ia[i]; j < ia[i + 1]; j++)
1199  if (ja[j] != INVALID) {
1200  j_temp[rowStart + lnnz] = ja[j];
1201  lnnz++;
1202  }
1203  });
1204 
1205  ia = i_temp;
1206  ja = j_temp;
1207  }
1208 
1209  RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1210 
1211  // Managing labels & constants for ESFC
1212  {
1213  RCP<ParameterList> FCparams;
1214  if (pL.isSublist("matrixmatrix: kernel params"))
1215  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1216  else
1217  FCparams = rcp(new ParameterList);
1218  // By default, we don't need global constants for TentativeP
1219  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
1220  std::string levelIDs = toString(levelID);
1221  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs);
1222  RCP<const Export> dummy_e;
1223  RCP<const Import> dummy_i;
1224  BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1225  }
1226 
1227  // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1228  // we clear them here
1229  ia = rows_type();
1230  ja = cols_type();
1231 
1232  // Now let's make a BlockCrs Matrix
1233  // NOTE: Assumes block size== NSDim
1234  RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1236  if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1237  RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1238 
1239  auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1240  const LO stride = NSDim * NSDim;
1241 
1242  Kokkos::parallel_for(
1243  "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1244  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1245  auto agg = thread.league_rank();
1246 
1247  // size of the aggregate (number of DOFs in aggregate)
1248  LO aggSize = aggRows(agg + 1) - aggRows(agg);
1249  Xpetra::global_size_t offset = agg * NSDim;
1250 
1251  // Q = localQR(:,0)/norm
1252  for (LO j = 0; j < aggSize; j++) {
1253  LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1254  LO rowStart = localBlockRow * stride;
1255  for (LO r = 0; r < (LO)NSDim; r++) {
1256  LO localPointRow = localBlockRow * NSDim + r;
1257  for (LO c = 0; c < (LO)NSDim; c++) {
1258  values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1259  }
1260  }
1261  }
1262 
1263  // R = norm
1264  for (LO j = 0; j < (LO)NSDim; j++)
1265  coarseNS(offset + j, j) = one;
1266  });
1267 
1268  Ptentative = P_wrap;
1269 }
1270 
1271 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1273  BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1274  RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1275  RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1276  RCP<MultiVector>& /* coarseNullspace */) const {
1277  throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1278 }
1279 
1280 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1282  isGoodMap(const Map& rowMap, const Map& colMap) const {
1283  auto rowLocalMap = rowMap.getLocalMap();
1284  auto colLocalMap = colMap.getLocalMap();
1285 
1286  const size_t numRows = rowLocalMap.getLocalNumElements();
1287  const size_t numCols = colLocalMap.getLocalNumElements();
1288 
1289  if (numCols < numRows)
1290  return false;
1291 
1292  size_t numDiff = 0;
1293  Kokkos::parallel_reduce(
1294  "MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1295  KOKKOS_LAMBDA(const LO i, size_t& diff) {
1296  diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1297  },
1298  numDiff);
1299 
1300  return (numDiff == 0);
1301 }
1302 
1303 } // namespace MueLu
1304 
1305 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1306 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
MueLu::DefaultLocalOrdinal LocalOrdinal
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
const RCP< LOVector > & GetProcWinner() const
Returns constant vector that maps local node IDs to owning processor IDs.
KOKKOS_INLINE_FUNCTION LO GetNumAggregates() const
void BuildP(Level &fineLevel, Level &coarseLevel) const
Abstract Build method.
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
const RCP< const Map > GetMap() const
returns (overlapping) map of aggregate/node distribution
LocalOrdinal LO
ParameterList & set(std::string const &name, T &&value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
static const NoFactory * get()
Print even more statistics.
#define SET_VALID_ENTRY(name)
bool isParameter(const std::string &name) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:63
bool isSublist(const std::string &name) const
void GetStridingInformation(LO &fullBlockSize, LO &blockID, LO &stridingOffset, LO &stridedBlockSize, GO &indexBase)
returns striding information
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
typename LWGraph_kokkos::local_graph_type local_graph_type
void Build(Level &fineLevel, Level &coarseLevel) const
Build an object with this factory.
static void AmalgamateMap(const Map &sourceMap, const Matrix &A, RCP< const Map > &amalgamatedMap, Array< LO > &translation)
Method to create merged map for systems of PDEs.
void DeclareInput(Level &fineLevel, Level &coarseLevel) const
Input.
const RCP< LOMultiVector > & GetVertex2AggId() const
Returns constant vector that maps local node IDs to local aggregates IDs.
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
size_t global_size_t
GO GlobalOffset()
returns offset of global dof ids
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
Scalar SC
ParameterList & sublist(const std::string &name, bool mustAlreadyExist=false, const std::string &docString="")
int GetLevelID() const
Return level number.
Definition: MueLu_Level.cpp:51
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)
bool isGoodMap(const Map &rowMap, const Map &colMap) const
maxAggDofSizeType maxAggDofSize
local_graph_type GetGraph() const
agg2RowMapLOType agg2RowMapLO
aggregates_sizes_type::const_type ComputeAggregateSizes(bool forceRecompute=false) const
Compute sizes of aggregates.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need&#39;s value has been saved.
bool is_null() const