MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
11 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
12 
13 #include "Kokkos_UnorderedMap.hpp"
15 
17 
18 #include "MueLu_Aggregates.hpp"
19 #include "MueLu_AmalgamationInfo.hpp"
20 
21 #include "MueLu_MasterList.hpp"
22 #include "MueLu_PerfUtils.hpp"
23 #include "MueLu_Monitor.hpp"
24 
25 #include "Xpetra_IO.hpp"
26 
27 namespace MueLu {
28 
29 namespace { // anonymous
30 
31 template <class LocalOrdinal, class View>
32 class ReduceMaxFunctor {
33  public:
34  ReduceMaxFunctor(View view)
35  : view_(view) {}
36 
37  KOKKOS_INLINE_FUNCTION
38  void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const {
39  if (vmax < view_(i))
40  vmax = view_(i);
41  }
42 
43  KOKKOS_INLINE_FUNCTION
44  void join(LocalOrdinal& dst, const LocalOrdinal& src) const {
45  if (dst < src) {
46  dst = src;
47  }
48  }
49 
50  KOKKOS_INLINE_FUNCTION
51  void init(LocalOrdinal& dst) const {
52  dst = 0;
53  }
54 
55  private:
57 };
58 
59 // local QR decomposition
60 template <class LOType, class GOType, class SCType, class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
61 class LocalQRDecompFunctor {
62  private:
63  typedef LOType LO;
64  typedef GOType GO;
65  typedef SCType SC;
66 
67  typedef typename DeviceType::execution_space execution_space;
68  typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
69  typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
70  typedef typename impl_ATS::magnitudeType Magnitude;
71 
72  typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
73  typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
74 
75  private:
76  NspType fineNS;
77  NspType coarseNS;
78  aggRowsType aggRows;
79  maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
80  agg2RowMapLOType agg2RowMapLO;
81  statusType statusAtomic;
82  rowsType rows;
83  rowsAuxType rowsAux;
84  colsAuxType colsAux;
85  valsAuxType valsAux;
86  bool doQRStep;
87 
88  public:
89  LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_)
90  : fineNS(fineNS_)
91  , coarseNS(coarseNS_)
92  , aggRows(aggRows_)
93  , maxAggDofSize(maxAggDofSize_)
94  , agg2RowMapLO(agg2RowMapLO_)
95  , statusAtomic(statusAtomic_)
96  , rows(rows_)
97  , rowsAux(rowsAux_)
98  , colsAux(colsAux_)
99  , valsAux(valsAux_)
100  , doQRStep(doQRStep_) {}
101 
102  KOKKOS_INLINE_FUNCTION
103  void operator()(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread, size_t& nnz) const {
104  auto agg = thread.league_rank();
105 
106  // size of aggregate: number of DOFs in aggregate
107  auto aggSize = aggRows(agg + 1) - aggRows(agg);
108 
109  const impl_SC one = impl_ATS::one();
110  const impl_SC two = one + one;
111  const impl_SC zero = impl_ATS::zero();
112  const auto zeroM = impl_ATS::magnitude(zero);
113 
114  int m = aggSize;
115  int n = fineNS.extent(1);
116 
117  // calculate row offset for coarse nullspace
118  Xpetra::global_size_t offset = agg * n;
119 
120  if (doQRStep) {
121  // Extract the piece of the nullspace corresponding to the aggregate
122  shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
123  for (int j = 0; j < n; j++)
124  for (int k = 0; k < m; k++)
125  r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j);
126 #if 0
127  printf("A\n");
128  for (int i = 0; i < m; i++) {
129  for (int j = 0; j < n; j++)
130  printf(" %5.3lf ", r(i,j));
131  printf("\n");
132  }
133 #endif
134 
135  // Calculate QR decomposition (standard)
136  shared_matrix q(thread.team_shmem(), m, m); // Q
137  if (m >= n) {
138  bool isSingular = false;
139 
140  // Initialize Q^T
141  auto qt = q;
142  for (int i = 0; i < m; i++) {
143  for (int j = 0; j < m; j++)
144  qt(i, j) = zero;
145  qt(i, i) = one;
146  }
147 
148  for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
149  // FIXME_KOKKOS: use team
150  Magnitude s = zeroM, norm, norm_x;
151  for (int i = k + 1; i < m; i++)
152  s += pow(impl_ATS::magnitude(r(i, k)), 2);
153  norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
154 
155  if (norm == zero) {
156  isSingular = true;
157  break;
158  }
159 
160  r(k, k) -= norm * one;
161 
162  norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
163  if (norm_x == zeroM) {
164  // We have a single diagonal element in the column.
165  // No reflections required. Just need to restor r(k,k).
166  r(k, k) = norm * one;
167  continue;
168  }
169 
170  // FIXME_KOKKOS: use team
171  for (int i = k; i < m; i++)
172  r(i, k) /= norm_x;
173 
174  // Update R(k:m,k+1:n)
175  for (int j = k + 1; j < n; j++) {
176  // FIXME_KOKKOS: use team in the loops
177  impl_SC si = zero;
178  for (int i = k; i < m; i++)
179  si += r(i, k) * r(i, j);
180  for (int i = k; i < m; i++)
181  r(i, j) -= two * si * r(i, k);
182  }
183 
184  // Update Q^T (k:m,k:m)
185  for (int j = k; j < m; j++) {
186  // FIXME_KOKKOS: use team in the loops
187  impl_SC si = zero;
188  for (int i = k; i < m; i++)
189  si += r(i, k) * qt(i, j);
190  for (int i = k; i < m; i++)
191  qt(i, j) -= two * si * r(i, k);
192  }
193 
194  // Fix R(k:m,k)
195  r(k, k) = norm * one;
196  for (int i = k + 1; i < m; i++)
197  r(i, k) = zero;
198  }
199 
200 #if 0
201  // Q = (Q^T)^T
202  for (int i = 0; i < m; i++)
203  for (int j = 0; j < i; j++) {
204  impl_SC tmp = qt(i,j);
205  qt(i,j) = qt(j,i);
206  qt(j,i) = tmp;
207  }
208 #endif
209 
210  // Build coarse nullspace using the upper triangular part of R
211  for (int j = 0; j < n; j++)
212  for (int k = 0; k <= j; k++)
213  coarseNS(offset + k, j) = r(k, j);
214 
215  if (isSingular) {
216  statusAtomic(1) = true;
217  return;
218  }
219 
220  } else {
221  // Special handling for m < n (i.e. single node aggregates in structural mechanics)
222 
223  // The local QR decomposition is not possible in the "overconstrained"
224  // case (i.e. number of columns in qr > number of rowsAux), which
225  // corresponds to #DOFs in Aggregate < n. For usual problems this
226  // is only possible for single node aggregates in structural mechanics.
227  // (Similar problems may arise in discontinuous Galerkin problems...)
228  // We bypass the QR decomposition and use an identity block in the
229  // tentative prolongator for the single node aggregate and transfer the
230  // corresponding fine level null space information 1-to-1 to the coarse
231  // level null space part.
232 
233  // NOTE: The resulting tentative prolongation operator has
234  // (m*DofsPerNode-n) zero columns leading to a singular
235  // coarse level operator A. To deal with that one has the following
236  // options:
237  // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
238  // false) to add some identity block to the diagonal of the zero rowsAux
239  // in the coarse level operator A, such that standard level smoothers
240  // can be used again.
241  // - Use special (projection-based) level smoothers, which can deal
242  // with singular matrices (very application specific)
243  // - Adapt the code below to avoid zero columns. However, we do not
244  // support a variable number of DOFs per node in MueLu/Xpetra which
245  // makes the implementation really hard.
246  //
247  // FIXME: do we need to check for singularity here somehow? Zero
248  // columns would be easy but linear dependency would require proper QR.
249 
250  // R = extended (by adding identity rowsAux) qr
251  for (int j = 0; j < n; j++)
252  for (int k = 0; k < n; k++)
253  if (k < m)
254  coarseNS(offset + k, j) = r(k, j);
255  else
256  coarseNS(offset + k, j) = (k == j ? one : zero);
257 
258  // Q = I (rectangular)
259  for (int i = 0; i < m; i++)
260  for (int j = 0; j < n; j++)
261  q(i, j) = (j == i ? one : zero);
262  }
263 
264  // Process each row in the local Q factor and fill helper arrays to assemble P
265  for (int j = 0; j < m; j++) {
266  LO localRow = agg2RowMapLO(aggRows(agg) + j);
267  size_t rowStart = rowsAux(localRow);
268  size_t lnnz = 0;
269  for (int k = 0; k < n; k++) {
270  // skip zeros
271  if (q(j, k) != zero) {
272  colsAux(rowStart + lnnz) = offset + k;
273  valsAux(rowStart + lnnz) = q(j, k);
274  lnnz++;
275  }
276  }
277  rows(localRow + 1) = lnnz;
278  nnz += lnnz;
279  }
280 
281 #if 0
282  printf("R\n");
283  for (int i = 0; i < m; i++) {
284  for (int j = 0; j < n; j++)
285  printf(" %5.3lf ", coarseNS(i,j));
286  printf("\n");
287  }
288 
289  printf("Q\n");
290  for (int i = 0; i < aggSize; i++) {
291  for (int j = 0; j < aggSize; j++)
292  printf(" %5.3lf ", q(i,j));
293  printf("\n");
294  }
295 #endif
296  } else {
298  // "no-QR" option //
300  // Local Q factor is just the fine nullspace support over the current aggregate.
301  // Local R factor is the identity.
302  // TODO I have not implemented any special handling for aggregates that are too
303  // TODO small to locally support the nullspace, as is done in the standard QR
304  // TODO case above.
305 
306  for (int j = 0; j < m; j++) {
307  LO localRow = agg2RowMapLO(aggRows(agg) + j);
308  size_t rowStart = rowsAux(localRow);
309  size_t lnnz = 0;
310  for (int k = 0; k < n; k++) {
311  const impl_SC qr_jk = fineNS(localRow, k);
312  // skip zeros
313  if (qr_jk != zero) {
314  colsAux(rowStart + lnnz) = offset + k;
315  valsAux(rowStart + lnnz) = qr_jk;
316  lnnz++;
317  }
318  }
319  rows(localRow + 1) = lnnz;
320  nnz += lnnz;
321  }
322 
323  for (int j = 0; j < n; j++)
324  coarseNS(offset + j, j) = one;
325  }
326  }
327 
328  // amount of shared memory
329  size_t team_shmem_size(int /* team_size */) const {
330  if (doQRStep) {
331  int m = maxAggDofSize;
332  int n = fineNS.extent(1);
333  return shared_matrix::shmem_size(m, n) + // r
334  shared_matrix::shmem_size(m, m); // q
335  } else
336  return 0;
337  }
338 };
339 
340 } // namespace
341 
342 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
344  RCP<ParameterList> validParamList = rcp(new ParameterList());
345 
346 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
347  SET_VALID_ENTRY("tentative: calculate qr");
348  SET_VALID_ENTRY("tentative: build coarse coordinates");
349 #undef SET_VALID_ENTRY
350 
351  validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
352  validParamList->set<RCP<const FactoryBase>>("Aggregates", Teuchos::null, "Generating factory of the aggregates");
353  validParamList->set<RCP<const FactoryBase>>("Nullspace", Teuchos::null, "Generating factory of the nullspace");
354  validParamList->set<RCP<const FactoryBase>>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
355  validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
356  validParamList->set<RCP<const FactoryBase>>("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
357  validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory of the coordinates");
358  validParamList->set<RCP<const FactoryBase>>("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
359 
360  // Make sure we don't recursively validate options for the matrixmatrix kernels
361  ParameterList norecurse;
362  norecurse.disableRecursiveValidation();
363  validParamList->set<ParameterList>("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
364 
365  return validParamList;
366 }
367 
368 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
370  const ParameterList& pL = GetParameterList();
371  // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
372  std::string nspName = "Nullspace";
373  if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
374 
375  Input(fineLevel, "A");
376  Input(fineLevel, "Aggregates");
377  Input(fineLevel, nspName);
378  Input(fineLevel, "UnAmalgamationInfo");
379  Input(fineLevel, "CoarseMap");
380  if (fineLevel.GetLevelID() == 0 &&
381  fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
382  pL.get<bool>("tentative: build coarse coordinates")) { // and we want coordinates on other levels
383  bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
384  Input(fineLevel, "Coordinates");
385  } else if (bTransferCoordinates_) {
386  Input(fineLevel, "Coordinates");
387  }
388 }
389 
390 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
392  return BuildP(fineLevel, coarseLevel);
393 }
394 
395 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
397  FactoryMonitor m(*this, "Build", coarseLevel);
398 
399  typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
400  typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
401  const ParameterList& pL = GetParameterList();
402  std::string nspName = "Nullspace";
403  if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
404 
405  auto A = Get<RCP<Matrix>>(fineLevel, "A");
406  auto aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
407  auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel, "UnAmalgamationInfo");
408  auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
409  auto coarseMap = Get<RCP<const Map>>(fineLevel, "CoarseMap");
410  RCP<RealValuedMultiVector> fineCoords;
411  if (bTransferCoordinates_) {
412  fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel, "Coordinates");
413  }
414 
415  RCP<Matrix> Ptentative;
416  // No coarse DoFs so we need to bail by setting Ptentattive to null and returning
417  // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
418  if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
419  Ptentative = Teuchos::null;
420  Set(coarseLevel, "P", Ptentative);
421  return;
422  }
423  RCP<MultiVector> coarseNullspace;
424  RCP<RealValuedMultiVector> coarseCoords;
425 
426  if (bTransferCoordinates_) {
427  RCP<const Map> coarseCoordMap;
428  using array_type = typename Map::global_indices_array_device_type;
429 
430  LO blkSize = 1;
431  if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
432  blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
433 
434  if (blkSize == 1) {
435  // Scalar system
436  // No amalgamation required, we can use the coarseMap
437  coarseCoordMap = coarseMap;
438  } else {
439  // Vector system
440  // NOTE: There could be further optimizations here where we detect contiguous maps and then
441  // create a contiguous amalgamated maps, which bypasses the expense of the getMyGlobalIndicesDevice()
442  // call (which is free for non-contiguous maps, but costs something if the map is contiguous).
443  using range_policy = Kokkos::RangePolicy<typename Node::execution_space>;
444  array_type elementAList = coarseMap->getMyGlobalIndicesDevice();
445  GO indexBase = coarseMap->getIndexBase();
446  auto numElements = elementAList.size() / blkSize;
447  typename array_type::non_const_type elementList_nc("elementList", numElements);
448 
449  // Amalgamate the map
450  Kokkos::parallel_for(
451  "Amalgamate Element List", range_policy(0, numElements), KOKKOS_LAMBDA(LO i) {
452  elementList_nc[i] = (elementAList[i * blkSize] - indexBase) / blkSize + indexBase;
453  });
454  array_type elementList = elementList_nc;
455  coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
456  elementList, indexBase, coarseMap->getComm());
457  }
458 
459  coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
460 
461  // Create overlapped fine coordinates to reduce global communication
462  auto uniqueMap = fineCoords->getMap();
463  RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
464  if (aggregates->AggregatesCrossProcessors()) {
465  auto nonUniqueMap = aggregates->GetMap();
466  auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
467 
468  ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
469  ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
470  }
471 
472  // The good new is that his graph has already been constructed for the
473  // TentativePFactory and was cached in Aggregates. So this is a no-op.
474  auto aggGraph = aggregates->GetGraph();
475  auto numAggs = aggGraph.numRows();
476 
477  auto fineCoordsView = fineCoords->getDeviceLocalView(Xpetra::Access::ReadOnly);
478  auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
479 
480  // Fill in coarse coordinates
481  {
482  SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
483 
484  const auto dim = fineCoords->getNumVectors();
485 
486  typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
487  for (size_t j = 0; j < dim; j++) {
488  Kokkos::parallel_for(
489  "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
490  KOKKOS_LAMBDA(const LO i) {
491  // A row in this graph represents all node ids in the aggregate
492  // Therefore, averaging is very easy
493 
494  auto aggregate = aggGraph.rowConst(i);
495 
496  coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
497  for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
498  sum += fineCoordsRandomView(aggregate(colID), j);
499 
500  coarseCoordsView(i, j) = sum / aggregate.length;
501  });
502  }
503  }
504  }
505 
506  if (!aggregates->AggregatesCrossProcessors()) {
508  BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
509  coarseLevel.GetLevelID());
510  } else {
511  BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
512  }
513  } else
514  BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
515 
516  // If available, use striding information of fine level matrix A for range
517  // map and coarseMap as domain map; otherwise use plain range map of
518  // Ptent = plain range map of A for range map and coarseMap as domain map.
519  // NOTE:
520  // The latter is not really safe, since there is no striding information
521  // for the range map. This is not really a problem, since striding
522  // information is always available on the intermedium levels and the
523  // coarsest levels.
524  if (A->IsView("stridedMaps") == true)
525  Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
526  else
527  Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
528 
529  if (bTransferCoordinates_) {
530  Set(coarseLevel, "Coordinates", coarseCoords);
531  }
532 
533  // FIXME: We should remove the NodeComm on levels past the threshold
534  if (fineLevel.IsAvailable("Node Comm")) {
535  RCP<const Teuchos::Comm<int>> nodeComm = Get<RCP<const Teuchos::Comm<int>>>(fineLevel, "Node Comm");
536  Set<RCP<const Teuchos::Comm<int>>>(coarseLevel, "Node Comm", nodeComm);
537  }
538 
539  Set(coarseLevel, "Nullspace", coarseNullspace);
540  Set(coarseLevel, "P", Ptentative);
541 
542  if (IsPrint(Statistics2)) {
543  RCP<ParameterList> params = rcp(new ParameterList());
544  params->set("printLoadBalancingInfo", true);
545  GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
546  }
547 }
548 
549 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
551  BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
552  RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
553  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
554  RCP<MultiVector>& coarseNullspace, const int levelID) const {
555  auto rowMap = A->getRowMap();
556  auto colMap = A->getColMap();
557 
558  const size_t numRows = rowMap->getLocalNumElements();
559  const size_t NSDim = fineNullspace->getNumVectors();
560 
561  typedef Kokkos::ArithTraits<SC> ATS;
562  using impl_SC = typename ATS::val_type;
563  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
564  const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
565 
566  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
567 
568  typename Aggregates::local_graph_type aggGraph;
569  {
570  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
571  aggGraph = aggregates->GetGraph();
572  }
573  auto aggRows = aggGraph.row_map;
574  auto aggCols = aggGraph.entries;
575 
576  // Aggregates map is based on the amalgamated column map
577  // We can skip global-to-local conversion if LIDs in row map are
578  // same as LIDs in column map
579  bool goodMap;
580  {
581  SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
582  goodMap = isGoodMap(*rowMap, *colMap);
583  }
584  // FIXME_KOKKOS: need to proofread later code for bad maps
586  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
587  "(i.e. \"matching\" row and column maps)");
588 
589  // STEP 1: do unamalgamation
590  // The non-kokkos version uses member functions from the AmalgamationInfo
591  // container class to unamalgamate the data. In contrast, the kokkos
592  // version of TentativePFactory does the unamalgamation here and only uses
593  // the data of the AmalgamationInfo container class
594 
595  // Extract information for unamalgamation
596  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
597  GO indexBase;
598  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
599  GO globalOffset = amalgInfo->GlobalOffset();
600 
601  // Extract aggregation info (already in Kokkos host views)
602  auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
603  auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
604  const size_t numAggregates = aggregates->GetNumAggregates();
605 
606  int myPID = aggregates->GetMap()->getComm()->getRank();
607 
608  // Create Kokkos::View (on the device) to store the aggreate dof sizes
609  // Later used to get aggregate dof offsets
610  // NOTE: This zeros itself on construction
611  typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
612  AggSizeType aggDofSizes;
613 
614  if (stridedBlockSize == 1) {
615  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
616 
617  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
618  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
619 
620  auto sizesConst = aggregates->ComputeAggregateSizes();
621  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
622 
623  } else {
624  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
625 
626  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
627  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
628 
629  auto nodeMap = aggregates->GetMap()->getLocalMap();
630  auto dofMap = colMap->getLocalMap();
631 
632  Kokkos::parallel_for(
633  "MueLu:TentativePF:Build:compute_agg_sizes", range_type(0, numAggregates),
634  KOKKOS_LAMBDA(const LO agg) {
635  auto aggRowView = aggGraph.rowConst(agg);
636 
637  size_t size = 0;
638  for (LO colID = 0; colID < aggRowView.length; colID++) {
639  GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
640 
641  for (LO k = 0; k < stridedBlockSize; k++) {
642  GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
643 
644  if (dofMap.getLocalElement(dofGID) != INVALID)
645  size++;
646  }
647  }
648  aggDofSizes(agg + 1) = size;
649  });
650  }
651 
652  // Find maximum dof size for aggregates
653  // Later used to reserve enough scratch space for local QR decompositions
654  LO maxAggSize = 0;
655  ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
656  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
657 
658  // parallel_scan (exclusive)
659  // The aggDofSizes View then contains the aggregate dof offsets
660  Kokkos::parallel_scan(
661  "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
662  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
663  update += aggDofSizes(i);
664  if (final_pass)
665  aggDofSizes(i) = update;
666  });
667 
668  // Create Kokkos::View on the device to store mapping
669  // between (local) aggregate id and row map ids (LIDs)
670  Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
671  {
672  SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
673 
674  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
675  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
676 
677  Kokkos::parallel_for(
678  "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
679  KOKKOS_LAMBDA(const LO lnode) {
680  if (procWinner(lnode, 0) == myPID) {
681  // No need for atomics, it's one-to-one
682  auto aggID = vertex2AggId(lnode, 0);
683 
684  auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
685  // FIXME: I think this may be wrong
686  // We unconditionally add the whole block here. When we calculated
687  // aggDofSizes, we did the isLocalElement check. Something's fishy.
688  for (LO k = 0; k < stridedBlockSize; k++)
689  agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
690  }
691  });
692  }
693 
694  // STEP 2: prepare local QR decomposition
695  // Reserve memory for tentative prolongation operator
696  coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
697 
698  // Pull out the nullspace vectors so that we can have random access (on the device)
699  auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
700  auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
701 
702  size_t nnz = 0; // actual number of nnz
703 
704  typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
705  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
706  typedef typename local_matrix_type::index_type::non_const_type cols_type;
707  typedef typename local_matrix_type::values_type::non_const_type vals_type;
708 
709  // Device View for status (error messages...)
710  typedef Kokkos::View<int[10], DeviceType> status_type;
711  status_type status("status");
712 
715 
716  const ParameterList& pL = GetParameterList();
717  const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
718  if (!doQRStep) {
719  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
720  if (NSDim > 1)
721  GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
722  }
723 
724  size_t nnzEstimate = numRows * NSDim;
725  rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows + 1);
726  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
727  vals_type valsAux("Ptent_aux_vals", nnzEstimate);
728  rows_type rows("Ptent_rows", numRows + 1);
729  {
730  // Stage 0: fill in views.
731  SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
732 
733  // The main thing to notice is initialization of vals with INVALID. These
734  // values will later be used to compress the arrays
735  Kokkos::parallel_for(
736  "MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows + 1),
737  KOKKOS_LAMBDA(const LO row) {
738  rowsAux(row) = row * NSDim;
739  });
740  Kokkos::parallel_for(
741  "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
742  KOKKOS_LAMBDA(const LO j) {
743  colsAux(j) = INVALID;
744  });
745  }
746 
747  if (NSDim == 1) {
748  // 1D is special, as it is the easiest. We don't even need to the QR,
749  // just normalize an array. Plus, no worries abot small aggregates. In
750  // addition, we do not worry about compression. It is unlikely that
751  // nullspace will have zeros. If it does, a prolongator row would be
752  // zero and we'll get singularity anyway.
753  SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
754 
755  // Set up team policy with numAggregates teams and one thread per team.
756  // Each team handles a slice of the data associated with one aggregate
757  // and performs a local QR decomposition (in this case real QR is
758  // unnecessary).
759  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
760 
761  if (doQRStep) {
762  Kokkos::parallel_for(
763  "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
764  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
765  auto agg = thread.league_rank();
766 
767  // size of the aggregate (number of DOFs in aggregate)
768  LO aggSize = aggRows(agg + 1) - aggRows(agg);
769 
770  // Extract the piece of the nullspace corresponding to the aggregate, and
771  // put it in the flat array, "localQR" (in column major format) for the
772  // QR routine. Trivial in 1D.
773  auto norm = impl_ATS::magnitude(zero);
774 
775  // Calculate QR by hand
776  // FIXME: shouldn't there be stridedblock here?
777  // FIXME_KOKKOS: shouldn't there be stridedblock here?
778  for (decltype(aggSize) k = 0; k < aggSize; k++) {
779  auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
780  norm += dnorm * dnorm;
781  }
782  norm = sqrt(norm);
783 
784  if (norm == zero) {
785  // zero column; terminate the execution
786  statusAtomic(1) = true;
787  return;
788  }
789 
790  // R = norm
791  coarseNS(agg, 0) = norm;
792 
793  // Q = localQR(:,0)/norm
794  for (decltype(aggSize) k = 0; k < aggSize; k++) {
795  LO localRow = agg2RowMapLO(aggRows(agg) + k);
796  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
797 
798  rows(localRow + 1) = 1;
799  colsAux(localRow) = agg;
800  valsAux(localRow) = localVal;
801  }
802  });
803 
804  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
805  Kokkos::deep_copy(statusHost, status);
806  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
807  if (statusHost(i)) {
808  std::ostringstream oss;
809  oss << "MueLu::TentativePFactory::MakeTentative: ";
810  switch (i) {
811  case 0: oss << "!goodMap is not implemented"; break;
812  case 1: oss << "fine level NS part has a zero column"; break;
813  }
814  throw Exceptions::RuntimeError(oss.str());
815  }
816 
817  } else {
818  Kokkos::parallel_for(
819  "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
820  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
821  auto agg = thread.league_rank();
822 
823  // size of the aggregate (number of DOFs in aggregate)
824  LO aggSize = aggRows(agg + 1) - aggRows(agg);
825 
826  // R = norm
827  coarseNS(agg, 0) = one;
828 
829  // Q = localQR(:,0)/norm
830  for (decltype(aggSize) k = 0; k < aggSize; k++) {
831  LO localRow = agg2RowMapLO(aggRows(agg) + k);
832  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
833 
834  rows(localRow + 1) = 1;
835  colsAux(localRow) = agg;
836  valsAux(localRow) = localVal;
837  }
838  });
839  }
840 
841  Kokkos::parallel_reduce(
842  "MueLu:TentativeP:CountNNZ", range_type(0, numRows + 1),
843  KOKKOS_LAMBDA(const LO i, size_t& nnz_count) {
844  nnz_count += rows(i);
845  },
846  nnz);
847 
848  } else { // NSdim > 1
849  // FIXME_KOKKOS: This code branch is completely unoptimized.
850  // Work to do:
851  // - Optimize QR decomposition
852  // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
853  // packing new values in the beginning of each row
854  // We do use auxilary view in this case, so keep a second rows view for
855  // counting nonzeros in rows
856 
857  {
858  SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
859  // Set up team policy with numAggregates teams and one thread per team.
860  // Each team handles a slice of the data associated with one aggregate
861  // and performs a local QR decomposition
862  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1); // numAggregates teams a 1 thread
863  LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
864  decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
865  decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
866  decltype(valsAux)>
867  localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
868  rows, rowsAux, colsAux, valsAux, doQRStep);
869  Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
870  }
871 
872  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
873  Kokkos::deep_copy(statusHost, status);
874  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
875  if (statusHost(i)) {
876  std::ostringstream oss;
877  oss << "MueLu::TentativePFactory::MakeTentative: ";
878  switch (i) {
879  case 0: oss << "!goodMap is not implemented"; break;
880  case 1: oss << "fine level NS part has a zero column"; break;
881  }
882  throw Exceptions::RuntimeError(oss.str());
883  }
884  }
885 
886  // Compress the cols and vals by ignoring INVALID column entries that correspond
887  // to 0 in QR.
888 
889  // The real cols and vals are constructed using calculated (not estimated) nnz
890  cols_type cols;
891  vals_type vals;
892 
893  if (nnz != nnzEstimate) {
894  {
895  // Stage 2: compress the arrays
896  SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
897 
898  Kokkos::parallel_scan(
899  "MueLu:TentativePF:Build:compress_rows", range_type(0, numRows + 1),
900  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
901  upd += rows(i);
902  if (final)
903  rows(i) = upd;
904  });
905  }
906 
907  {
908  SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
909 
910  cols = cols_type("Ptent_cols", nnz);
911  vals = vals_type("Ptent_vals", nnz);
912 
913  // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
914  // to the beginning of rows. See CoalesceDropFactory_kokkos for
915  // example.
916  Kokkos::parallel_for(
917  "MueLu:TentativePF:Build:compress_cols_vals", range_type(0, numRows),
918  KOKKOS_LAMBDA(const LO i) {
919  LO rowStart = rows(i);
920 
921  size_t lnnz = 0;
922  for (auto j = rowsAux(i); j < rowsAux(i + 1); j++)
923  if (colsAux(j) != INVALID) {
924  cols(rowStart + lnnz) = colsAux(j);
925  vals(rowStart + lnnz) = valsAux(j);
926  lnnz++;
927  }
928  });
929  }
930 
931  } else {
932  rows = rowsAux;
933  cols = colsAux;
934  vals = valsAux;
935  }
936 
937  GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
938 
939  {
940  // Stage 3: construct Xpetra::Matrix
941  SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
942 
943  local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
944 
945  // Managing labels & constants for ESFC
946  RCP<ParameterList> FCparams;
947  if (pL.isSublist("matrixmatrix: kernel params"))
948  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
949  else
950  FCparams = rcp(new ParameterList);
951 
952  // By default, we don't need global constants for TentativeP
953  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
954  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
955 
956  auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
957  Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
958  }
959 }
960 
961 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
964  RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
965  RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
966  RCP<MultiVector>& coarseNullspace, const int levelID) const {
967  /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
968  be generalized later, if we ever need to do so:
969  1) Null space dimension === block size of matrix: So no elasticity right now
970  2) QR is not supported: Under assumption #1, this shouldn't cause problems.
971  3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
972 
973  These assumptions keep our code way simpler and still support the use cases we actually care about.
974  */
975 
976  RCP<const Map> rowMap = A->getRowMap();
977  RCP<const Map> rangeMap = A->getRangeMap();
978  RCP<const Map> colMap = A->getColMap();
979  // const size_t numFinePointRows = rangeMap->getLocalNumElements();
980  const size_t numFineBlockRows = rowMap->getLocalNumElements();
981 
982  // typedef Teuchos::ScalarTraits<SC> STS;
983  // typedef typename STS::magnitudeType Magnitude;
984  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
985 
986  typedef Kokkos::ArithTraits<SC> ATS;
987  using impl_SC = typename ATS::val_type;
988  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
989  const impl_SC one = impl_ATS::one();
990 
991  // const GO numAggs = aggregates->GetNumAggregates();
992  const size_t NSDim = fineNullspace->getNumVectors();
993  auto aggSizes = aggregates->ComputeAggregateSizes();
994 
995  typename Aggregates::local_graph_type aggGraph;
996  {
997  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
998  aggGraph = aggregates->GetGraph();
999  }
1000  auto aggRows = aggGraph.row_map;
1001  auto aggCols = aggGraph.entries;
1002 
1003  // Need to generate the coarse block map
1004  // NOTE: We assume NSDim == block size here
1005  // NOTE: We also assume that coarseMap has contiguous GIDs
1006  // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1007  const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1008  RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1010  numCoarseBlockRows,
1011  coarsePointMap->getIndexBase(),
1012  coarsePointMap->getComm());
1013  // Sanity checking
1014  const ParameterList& pL = GetParameterList();
1015  // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1016 
1017  // The aggregates use the amalgamated column map, which in this case is what we want
1018 
1019  // Aggregates map is based on the amalgamated column map
1020  // We can skip global-to-local conversion if LIDs in row map are
1021  // same as LIDs in column map
1022  bool goodMap = MueLu::Utilities<SC, LO, GO, NO>::MapsAreNested(*rowMap, *colMap);
1024  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1025  "(i.e. \"matching\" row and column maps)");
1026 
1027  // STEP 1: do unamalgamation
1028  // The non-kokkos version uses member functions from the AmalgamationInfo
1029  // container class to unamalgamate the data. In contrast, the kokkos
1030  // version of TentativePFactory does the unamalgamation here and only uses
1031  // the data of the AmalgamationInfo container class
1032 
1033  // Extract information for unamalgamation
1034  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1035  GO indexBase;
1036  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1037  // GO globalOffset = amalgInfo->GlobalOffset();
1038 
1039  // Extract aggregation info (already in Kokkos host views)
1040  auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1041  auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1042  const size_t numAggregates = aggregates->GetNumAggregates();
1043 
1044  int myPID = aggregates->GetMap()->getComm()->getRank();
1045 
1046  // Create Kokkos::View (on the device) to store the aggreate dof sizes
1047  // Later used to get aggregate dof offsets
1048  // NOTE: This zeros itself on construction
1049  typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1050  AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1051 
1052  {
1053  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1054 
1055  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1056  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
1057 
1058  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1059  }
1060 
1061  // Find maximum dof size for aggregates
1062  // Later used to reserve enough scratch space for local QR decompositions
1063  LO maxAggSize = 0;
1064  ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1065  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1066 
1067  // parallel_scan (exclusive)
1068  // The aggDofSizes View then contains the aggregate dof offsets
1069  Kokkos::parallel_scan(
1070  "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
1071  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1072  update += aggDofSizes(i);
1073  if (final_pass)
1074  aggDofSizes(i) = update;
1075  });
1076 
1077  // Create Kokkos::View on the device to store mapping
1078  // between (local) aggregate id and row map ids (LIDs)
1079  Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1080  {
1081  SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1082 
1083  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1084  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1085 
1086  Kokkos::parallel_for(
1087  "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1088  KOKKOS_LAMBDA(const LO lnode) {
1089  if (procWinner(lnode, 0) == myPID) {
1090  // No need for atomics, it's one-to-one
1091  auto aggID = vertex2AggId(lnode, 0);
1092 
1093  auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1094  // FIXME: I think this may be wrong
1095  // We unconditionally add the whole block here. When we calculated
1096  // aggDofSizes, we did the isLocalElement check. Something's fishy.
1097  for (LO k = 0; k < stridedBlockSize; k++)
1098  aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1099  }
1100  });
1101  }
1102 
1103  // STEP 2: prepare local QR decomposition
1104  // Reserve memory for tentative prolongation operator
1105  coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1106 
1107  // Pull out the nullspace vectors so that we can have random access (on the device)
1108  auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
1109  auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1110 
1111  typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
1112  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1113  typedef typename local_matrix_type::index_type::non_const_type cols_type;
1114  // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1115 
1116  // Device View for status (error messages...)
1117  typedef Kokkos::View<int[10], DeviceType> status_type;
1118  status_type status("status");
1119 
1122 
1123  // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1124  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1125 
1126  // BlockCrs requires that we build the (block) graph first, so let's do that...
1127 
1128  // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1129  // block non-zero per row in the matrix;
1130  rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1131  cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1132 
1133  Kokkos::parallel_for(
1134  "MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1135  KOKKOS_LAMBDA(const LO j) {
1136  ia[j] = j;
1137  ja[j] = INVALID;
1138 
1139  if (j == (LO)numFineBlockRows - 1)
1140  ia[numFineBlockRows] = numFineBlockRows;
1141  });
1142 
1143  // Fill Graph
1144  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1145  Kokkos::parallel_for(
1146  "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1147  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1148  auto agg = thread.league_rank();
1149  Xpetra::global_size_t offset = agg;
1150 
1151  // size of the aggregate (number of DOFs in aggregate)
1152  LO aggSize = aggRows(agg + 1) - aggRows(agg);
1153 
1154  for (LO j = 0; j < aggSize; j++) {
1155  // FIXME: Allow for bad maps
1156  const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1157  const size_t rowStart = ia[localRow];
1158  ja[rowStart] = offset;
1159  }
1160  });
1161 
1162  // Compress storage (remove all INVALID, which happen when we skip zeros)
1163  // We do that in-place
1164  {
1165  // Stage 2: compress the arrays
1166  SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1167  // Fill i_temp with the correct row starts
1168  rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1169  LO nnz = 0;
1170  Kokkos::parallel_scan(
1171  "MueLu:TentativePF:BlockCrs:compress_rows", range_type(0, numFineBlockRows),
1172  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1173  if (final)
1174  i_temp[i] = upd;
1175  for (auto j = ia[i]; j < ia[i + 1]; j++)
1176  if (ja[j] != INVALID)
1177  upd++;
1178  if (final && i == (LO)numFineBlockRows - 1)
1179  i_temp[numFineBlockRows] = upd;
1180  },
1181  nnz);
1182 
1183  cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1184 
1185  Kokkos::parallel_for(
1186  "MueLu:TentativePF:BlockCrs:compress_cols", range_type(0, numFineBlockRows),
1187  KOKKOS_LAMBDA(const LO i) {
1188  size_t rowStart = i_temp[i];
1189  size_t lnnz = 0;
1190  for (auto j = ia[i]; j < ia[i + 1]; j++)
1191  if (ja[j] != INVALID) {
1192  j_temp[rowStart + lnnz] = ja[j];
1193  lnnz++;
1194  }
1195  });
1196 
1197  ia = i_temp;
1198  ja = j_temp;
1199  }
1200 
1201  RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1202 
1203  // Managing labels & constants for ESFC
1204  {
1205  RCP<ParameterList> FCparams;
1206  if (pL.isSublist("matrixmatrix: kernel params"))
1207  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1208  else
1209  FCparams = rcp(new ParameterList);
1210  // By default, we don't need global constants for TentativeP
1211  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
1212  std::string levelIDs = toString(levelID);
1213  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs);
1214  RCP<const Export> dummy_e;
1215  RCP<const Import> dummy_i;
1216  BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1217  }
1218 
1219  // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1220  // we clear them here
1221  ia = rows_type();
1222  ja = cols_type();
1223 
1224  // Now let's make a BlockCrs Matrix
1225  // NOTE: Assumes block size== NSDim
1226  RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1228  if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1229  RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1230 
1231  auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1232  const LO stride = NSDim * NSDim;
1233 
1234  Kokkos::parallel_for(
1235  "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1236  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1237  auto agg = thread.league_rank();
1238 
1239  // size of the aggregate (number of DOFs in aggregate)
1240  LO aggSize = aggRows(agg + 1) - aggRows(agg);
1241  Xpetra::global_size_t offset = agg * NSDim;
1242 
1243  // Q = localQR(:,0)/norm
1244  for (LO j = 0; j < aggSize; j++) {
1245  LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1246  LO rowStart = localBlockRow * stride;
1247  for (LO r = 0; r < (LO)NSDim; r++) {
1248  LO localPointRow = localBlockRow * NSDim + r;
1249  for (LO c = 0; c < (LO)NSDim; c++) {
1250  values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1251  }
1252  }
1253  }
1254 
1255  // R = norm
1256  for (LO j = 0; j < (LO)NSDim; j++)
1257  coarseNS(offset + j, j) = one;
1258  });
1259 
1260  Ptentative = P_wrap;
1261 }
1262 
1263 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1265  BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1266  RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1267  RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1268  RCP<MultiVector>& /* coarseNullspace */) const {
1269  throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1270 }
1271 
1272 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1274  isGoodMap(const Map& rowMap, const Map& colMap) const {
1275  auto rowLocalMap = rowMap.getLocalMap();
1276  auto colLocalMap = colMap.getLocalMap();
1277 
1278  const size_t numRows = rowLocalMap.getLocalNumElements();
1279  const size_t numCols = colLocalMap.getLocalNumElements();
1280 
1281  if (numCols < numRows)
1282  return false;
1283 
1284  size_t numDiff = 0;
1285  Kokkos::parallel_reduce(
1286  "MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1287  KOKKOS_LAMBDA(const LO i, size_t& diff) {
1288  diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1289  },
1290  numDiff);
1291 
1292  return (numDiff == 0);
1293 }
1294 
1295 } // namespace MueLu
1296 
1297 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1298 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
MueLu::DefaultLocalOrdinal LocalOrdinal
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
const RCP< LOVector > & GetProcWinner() const
Returns constant vector that maps local node IDs to owning processor IDs.
KOKKOS_INLINE_FUNCTION LO GetNumAggregates() const
void BuildP(Level &fineLevel, Level &coarseLevel) const
Abstract Build method.
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
const RCP< const Map > GetMap() const
returns (overlapping) map of aggregate/node distribution
LocalOrdinal LO
ParameterList & set(std::string const &name, T &&value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
static const NoFactory * get()
Print even more statistics.
#define SET_VALID_ENTRY(name)
bool isParameter(const std::string &name) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:63
bool isSublist(const std::string &name) const
void GetStridingInformation(LO &fullBlockSize, LO &blockID, LO &stridingOffset, LO &stridedBlockSize, GO &indexBase)
returns striding information
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
typename LWGraph_kokkos::local_graph_type local_graph_type
void Build(Level &fineLevel, Level &coarseLevel) const
Build an object with this factory.
void DeclareInput(Level &fineLevel, Level &coarseLevel) const
Input.
const RCP< LOMultiVector > & GetVertex2AggId() const
Returns constant vector that maps local node IDs to local aggregates IDs.
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
size_t global_size_t
GO GlobalOffset()
returns offset of global dof ids
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
Scalar SC
ParameterList & sublist(const std::string &name, bool mustAlreadyExist=false, const std::string &docString="")
int GetLevelID() const
Return level number.
Definition: MueLu_Level.cpp:51
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)
bool isGoodMap(const Map &rowMap, const Map &colMap) const
maxAggDofSizeType maxAggDofSize
local_graph_type GetGraph() const
agg2RowMapLOType agg2RowMapLO
aggregates_sizes_type::const_type ComputeAggregateSizes(bool forceRecompute=false) const
Compute sizes of aggregates.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need&#39;s value has been saved.
bool is_null() const