MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
48 
49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
50 
51 #include "Kokkos_UnorderedMap.hpp"
52 
54 
55 #include "MueLu_Aggregates_kokkos.hpp"
56 #include "MueLu_AmalgamationFactory.hpp"
57 #include "MueLu_AmalgamationInfo.hpp"
58 #include "MueLu_CoarseMapFactory_kokkos.hpp"
59 #include "MueLu_MasterList.hpp"
60 #include "MueLu_NullspaceFactory_kokkos.hpp"
61 #include "MueLu_PerfUtils.hpp"
62 #include "MueLu_Monitor.hpp"
63 #include "MueLu_Utilities_kokkos.hpp"
64 
65 namespace MueLu {
66 
67  namespace { // anonymous
68 
69  template<class LocalOrdinal, class View>
70  class ReduceMaxFunctor{
71  public:
72  ReduceMaxFunctor(View view) : view_(view) { }
73 
74  KOKKOS_INLINE_FUNCTION
75  void operator()(const LocalOrdinal &i, LocalOrdinal& vmax) const {
76  if (vmax < view_(i))
77  vmax = view_(i);
78  }
79 
80  KOKKOS_INLINE_FUNCTION
81  void join (volatile LocalOrdinal& dst, const volatile LocalOrdinal& src) const {
82  if (dst < src) {
83  dst = src;
84  }
85  }
86 
87  KOKKOS_INLINE_FUNCTION
88  void init (LocalOrdinal& dst) const {
89  dst = 0;
90  }
91  private:
92  View view_;
93  };
94 
95  // local QR decomposition
96  template<class LOType, class GOType, class SCType,class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
97  class LocalQRDecompFunctor {
98  private:
99  typedef LOType LO;
100  typedef GOType GO;
101  typedef SCType SC;
102 
103  typedef typename DeviceType::execution_space execution_space;
104  typedef Kokkos::ArithTraits<SC> ATS;
105  typedef typename ATS::magnitudeType Magnitude;
106 
109 
110  private:
111 
112  NspType fineNS;
113  NspType coarseNS;
114  aggRowsType aggRows;
115  maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
116  agg2RowMapLOType agg2RowMapLO;
117  statusType statusAtomic;
118  rowsType rows;
119  rowsAuxType rowsAux;
120  colsAuxType colsAux;
121  valsAuxType valsAux;
122  bool doQRStep;
123  public:
124  LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_) :
125  fineNS(fineNS_),
126  coarseNS(coarseNS_),
127  aggRows(aggRows_),
128  maxAggDofSize(maxAggDofSize_),
129  agg2RowMapLO(agg2RowMapLO_),
130  statusAtomic(statusAtomic_),
131  rows(rows_),
132  rowsAux(rowsAux_),
133  colsAux(colsAux_),
134  valsAux(valsAux_),
135  doQRStep(doQRStep_)
136  { }
137 
138  KOKKOS_INLINE_FUNCTION
139  void operator() ( const typename Kokkos::TeamPolicy<execution_space>::member_type & thread, size_t& nnz) const {
140  auto agg = thread.league_rank();
141 
142  // size of aggregate: number of DOFs in aggregate
143  auto aggSize = aggRows(agg+1) - aggRows(agg);
144 
145  const SC one = ATS::one();
146  const SC two = one + one;
147  const SC zero = ATS::zero();
148  const auto zeroM = ATS::magnitude(zero);
149 
150  int m = aggSize;
151  int n = fineNS.extent(1);
152 
153  // calculate row offset for coarse nullspace
154  Xpetra::global_size_t offset = agg * n;
155 
156  if (doQRStep) {
157 
158  // Extract the piece of the nullspace corresponding to the aggregate
159  shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
160  for (int j = 0; j < n; j++)
161  for (int k = 0; k < m; k++)
162  r(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
163 #if 0
164  printf("A\n");
165  for (int i = 0; i < m; i++) {
166  for (int j = 0; j < n; j++)
167  printf(" %5.3lf ", r(i,j));
168  printf("\n");
169  }
170 #endif
171 
172  // Calculate QR decomposition (standard)
173  shared_matrix q(thread.team_shmem(), m, m); // Q
174  if (m >= n) {
175  bool isSingular = false;
176 
177  // Initialize Q^T
178  auto qt = q;
179  for (int i = 0; i < m; i++) {
180  for (int j = 0; j < m; j++)
181  qt(i,j) = zero;
182  qt(i,i) = one;
183  }
184 
185  for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
186  // FIXME_KOKKOS: use team
187  Magnitude s = zeroM, norm, norm_x;
188  for (int i = k+1; i < m; i++)
189  s += pow(ATS::magnitude(r(i,k)), 2);
190  norm = sqrt(pow(ATS::magnitude(r(k,k)), 2) + s);
191 
192  if (norm == zero) {
193  isSingular = true;
194  break;
195  }
196 
197  r(k,k) -= norm*one;
198 
199  norm_x = sqrt(pow(ATS::magnitude(r(k,k)), 2) + s);
200  if (norm_x == zeroM) {
201  // We have a single diagonal element in the column.
202  // No reflections required. Just need to restor r(k,k).
203  r(k,k) = norm*one;
204  continue;
205  }
206 
207  // FIXME_KOKKOS: use team
208  for (int i = k; i < m; i++)
209  r(i,k) /= norm_x;
210 
211  // Update R(k:m,k+1:n)
212  for (int j = k+1; j < n; j++) {
213  // FIXME_KOKKOS: use team in the loops
214  SC si = zero;
215  for (int i = k; i < m; i++)
216  si += r(i,k) * r(i,j);
217  for (int i = k; i < m; i++)
218  r(i,j) -= two*si * r(i,k);
219  }
220 
221  // Update Q^T (k:m,k:m)
222  for (int j = k; j < m; j++) {
223  // FIXME_KOKKOS: use team in the loops
224  SC si = zero;
225  for (int i = k; i < m; i++)
226  si += r(i,k) * qt(i,j);
227  for (int i = k; i < m; i++)
228  qt(i,j) -= two*si * r(i,k);
229  }
230 
231  // Fix R(k:m,k)
232  r(k,k) = norm*one;
233  for (int i = k+1; i < m; i++)
234  r(i,k) = zero;
235  }
236 
237 #if 0
238  // Q = (Q^T)^T
239  for (int i = 0; i < m; i++)
240  for (int j = 0; j < i; j++) {
241  SC tmp = qt(i,j);
242  qt(i,j) = qt(j,i);
243  qt(j,i) = tmp;
244  }
245 #endif
246 
247  // Build coarse nullspace using the upper triangular part of R
248  for (int j = 0; j < n; j++)
249  for (int k = 0; k <= j; k++)
250  coarseNS(offset+k,j) = r(k,j);
251 
252  if (isSingular) {
253  statusAtomic(1) = true;
254  return;
255  }
256 
257  } else {
258  // Special handling for m < n (i.e. single node aggregates in structural mechanics)
259 
260  // The local QR decomposition is not possible in the "overconstrained"
261  // case (i.e. number of columns in qr > number of rowsAux), which
262  // corresponds to #DOFs in Aggregate < n. For usual problems this
263  // is only possible for single node aggregates in structural mechanics.
264  // (Similar problems may arise in discontinuous Galerkin problems...)
265  // We bypass the QR decomposition and use an identity block in the
266  // tentative prolongator for the single node aggregate and transfer the
267  // corresponding fine level null space information 1-to-1 to the coarse
268  // level null space part.
269 
270  // NOTE: The resulting tentative prolongation operator has
271  // (m*DofsPerNode-n) zero columns leading to a singular
272  // coarse level operator A. To deal with that one has the following
273  // options:
274  // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
275  // false) to add some identity block to the diagonal of the zero rowsAux
276  // in the coarse level operator A, such that standard level smoothers
277  // can be used again.
278  // - Use special (projection-based) level smoothers, which can deal
279  // with singular matrices (very application specific)
280  // - Adapt the code below to avoid zero columns. However, we do not
281  // support a variable number of DOFs per node in MueLu/Xpetra which
282  // makes the implementation really hard.
283  //
284  // FIXME: do we need to check for singularity here somehow? Zero
285  // columns would be easy but linear dependency would require proper QR.
286 
287  // R = extended (by adding identity rowsAux) qr
288  for (int j = 0; j < n; j++)
289  for (int k = 0; k < n; k++)
290  if (k < m)
291  coarseNS(offset+k,j) = r(k,j);
292  else
293  coarseNS(offset+k,j) = (k == j ? one : zero);
294 
295  // Q = I (rectangular)
296  for (int i = 0; i < m; i++)
297  for (int j = 0; j < n; j++)
298  q(i,j) = (j == i ? one : zero);
299  }
300 
301  // Process each row in the local Q factor and fill helper arrays to assemble P
302  for (int j = 0; j < m; j++) {
303  LO localRow = agg2RowMapLO(aggRows(agg)+j);
304  size_t rowStart = rowsAux(localRow);
305  size_t lnnz = 0;
306  for (int k = 0; k < n; k++) {
307  // skip zeros
308  if (q(j,k) != zero) {
309  colsAux(rowStart+lnnz) = offset + k;
310  valsAux(rowStart+lnnz) = q(j,k);
311  lnnz++;
312  }
313  }
314  rows(localRow+1) = lnnz;
315  nnz += lnnz;
316  }
317 
318 #if 0
319  printf("R\n");
320  for (int i = 0; i < m; i++) {
321  for (int j = 0; j < n; j++)
322  printf(" %5.3lf ", coarseNS(i,j));
323  printf("\n");
324  }
325 
326  printf("Q\n");
327  for (int i = 0; i < aggSize; i++) {
328  for (int j = 0; j < aggSize; j++)
329  printf(" %5.3lf ", q(i,j));
330  printf("\n");
331  }
332 #endif
333  } else {
335  // "no-QR" option //
337  // Local Q factor is just the fine nullspace support over the current aggregate.
338  // Local R factor is the identity.
339  // TODO I have not implemented any special handling for aggregates that are too
340  // TODO small to locally support the nullspace, as is done in the standard QR
341  // TODO case above.
342 
343  for (int j = 0; j < m; j++) {
344  LO localRow = agg2RowMapLO(aggRows(agg)+j);
345  size_t rowStart = rowsAux(localRow);
346  size_t lnnz = 0;
347  for (int k = 0; k < n; k++) {
348  const SC qr_jk = fineNS(localRow,k);
349  // skip zeros
350  if (qr_jk != zero) {
351  colsAux(rowStart+lnnz) = offset + k;
352  valsAux(rowStart+lnnz) = qr_jk;
353  lnnz++;
354  }
355  }
356  rows(localRow+1) = lnnz;
357  nnz += lnnz;
358  }
359 
360  for (int j = 0; j < n; j++)
361  coarseNS(offset+j,j) = one;
362 
363  }
364 
365  }
366 
367  // amount of shared memory
368  size_t team_shmem_size( int /* team_size */ ) const {
369  if (doQRStep) {
370  int m = maxAggDofSize;
371  int n = fineNS.extent(1);
372  return shared_matrix::shmem_size(m, n) + // r
373  shared_matrix::shmem_size(m, m); // q
374  } else
375  return 0;
376  }
377  };
378 
379  } // namespace anonymous
380 
381  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
382  RCP<const ParameterList> TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList() const {
383  RCP<ParameterList> validParamList = rcp(new ParameterList());
384 
385 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
386  SET_VALID_ENTRY("tentative: calculate qr");
387  SET_VALID_ENTRY("tentative: build coarse coordinates");
388 #undef SET_VALID_ENTRY
389 
390  validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
391  validParamList->set< RCP<const FactoryBase> >("Aggregates", Teuchos::null, "Generating factory of the aggregates");
392  validParamList->set< RCP<const FactoryBase> >("Nullspace", Teuchos::null, "Generating factory of the nullspace");
393  validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
394  validParamList->set< RCP<const FactoryBase> >("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
395  validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory of the coordinates");
396 
397  // Make sure we don't recursively validate options for the matrixmatrix kernels
398  ParameterList norecurse;
399  norecurse.disableRecursiveValidation();
400  validParamList->set<ParameterList> ("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
401 
402  return validParamList;
403  }
404 
405  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
406  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level& fineLevel, Level& /* coarseLevel */) const {
407 
408  const ParameterList& pL = GetParameterList();
409 
410  Input(fineLevel, "A");
411  Input(fineLevel, "Aggregates");
412  Input(fineLevel, "Nullspace");
413  Input(fineLevel, "UnAmalgamationInfo");
414  Input(fineLevel, "CoarseMap");
415  if( fineLevel.GetLevelID() == 0 &&
416  fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
417  pL.get<bool>("tentative: build coarse coordinates") ) { // and we want coordinates on other levels
418  bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
419  Input(fineLevel, "Coordinates");
420  } else if (bTransferCoordinates_) {
421  Input(fineLevel, "Coordinates");
422  }
423  }
424 
425  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
426  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::Build(Level& fineLevel, Level& coarseLevel) const {
427  return BuildP(fineLevel, coarseLevel);
428  }
429 
430  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
431  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel) const {
432  FactoryMonitor m(*this, "Build", coarseLevel);
433 
434  typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
435  typedef Xpetra::MultiVector<coordinate_type,LO,GO,NO> RealValuedMultiVector;
436  typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
437 
438  auto A = Get< RCP<Matrix> > (fineLevel, "A");
439  auto aggregates = Get< RCP<Aggregates_kokkos> >(fineLevel, "Aggregates");
440  auto amalgInfo = Get< RCP<AmalgamationInfo> > (fineLevel, "UnAmalgamationInfo");
441  auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, "Nullspace");
442  auto coarseMap = Get< RCP<const Map> > (fineLevel, "CoarseMap");
443  RCP<RealValuedMultiVector> fineCoords;
444  if(bTransferCoordinates_) {
445  fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel, "Coordinates");
446  }
447 
448  RCP<Matrix> Ptentative;
449  RCP<MultiVector> coarseNullspace;
450  RCP<RealValuedMultiVector> coarseCoords;
451 
452  if(bTransferCoordinates_) {
453  ArrayView<const GO> elementAList = coarseMap->getNodeElementList();
454  GO indexBase = coarseMap->getIndexBase();
455 
456  LO blkSize = 1;
457  if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
458  blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
459 
460  Array<GO> elementList;
461  ArrayView<const GO> elementListView;
462  if (blkSize == 1) {
463  // Scalar system
464  // No amalgamation required
465  elementListView = elementAList;
466 
467  } else {
468  auto numElements = elementAList.size() / blkSize;
469 
470  elementList.resize(numElements);
471 
472  // Amalgamate the map
473  for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
474  elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
475 
476  elementListView = elementList;
477  }
478 
479  auto uniqueMap = fineCoords->getMap();
480  auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
481  elementListView, indexBase, coarseMap->getComm());
482  coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
483 
484  // Create overlapped fine coordinates to reduce global communication
485  RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
486  if (aggregates->AggregatesCrossProcessors()) {
487  auto nonUniqueMap = aggregates->GetMap();
488  auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
489 
490  ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
491  ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
492  }
493 
494  // The good new is that his graph has already been constructed for the
495  // TentativePFactory and was cached in Aggregates. So this is a no-op.
496  auto aggGraph = aggregates->GetGraph();
497  auto numAggs = aggGraph.numRows();
498 
499  auto fineCoordsView = fineCoords ->template getLocalView<DeviceType>();
500  auto coarseCoordsView = coarseCoords->template getLocalView<DeviceType>();
501 
502  // Fill in coarse coordinates
503  {
504  SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
505 
506  const auto dim = fineCoords->getNumVectors();
507 
508  typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
509  for (size_t j = 0; j < dim; j++) {
510  Kokkos::parallel_for("MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
511  KOKKOS_LAMBDA(const LO i) {
512  // A row in this graph represents all node ids in the aggregate
513  // Therefore, averaging is very easy
514 
515  auto aggregate = aggGraph.rowConst(i);
516 
517  typename Teuchos::ScalarTraits<Scalar>::coordinateType sum = 0.0; // do not use Scalar here (Stokhos)
518  for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
519  sum += fineCoordsRandomView(aggregate(colID),j);
520 
521  coarseCoordsView(i,j) = sum / aggregate.length;
522  });
523  }
524  }
525  }
526 
527  if (!aggregates->AggregatesCrossProcessors())
528  BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
529  else
530  BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
531 
532  // If available, use striding information of fine level matrix A for range
533  // map and coarseMap as domain map; otherwise use plain range map of
534  // Ptent = plain range map of A for range map and coarseMap as domain map.
535  // NOTE:
536  // The latter is not really safe, since there is no striding information
537  // for the range map. This is not really a problem, since striding
538  // information is always available on the intermedium levels and the
539  // coarsest levels.
540  if (A->IsView("stridedMaps") == true)
541  Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
542  else
543  Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
544 
545  if(bTransferCoordinates_) {
546  Set(coarseLevel, "Coordinates", coarseCoords);
547  }
548  Set(coarseLevel, "Nullspace", coarseNullspace);
549  Set(coarseLevel, "P", Ptentative);
550 
551  if (IsPrint(Statistics1)) {
552  RCP<ParameterList> params = rcp(new ParameterList());
553  params->set("printLoadBalancingInfo", true);
554  GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
555  }
556  }
557 
558  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
559  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
560  BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
561  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace, const int levelID) const {
562  auto rowMap = A->getRowMap();
563  auto colMap = A->getColMap();
564 
565  const size_t numRows = rowMap->getNodeNumElements();
566  const size_t NSDim = fineNullspace->getNumVectors();
567 
568  typedef Kokkos::ArithTraits<SC> ATS;
569  const SC zero = ATS::zero(), one = ATS::one();
570 
571  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
572 
573  typename Aggregates_kokkos::local_graph_type aggGraph;
574  {
575  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
576  aggGraph = aggregates->GetGraph();
577  }
578  auto aggRows = aggGraph.row_map;
579  auto aggCols = aggGraph.entries;
580 
581  // Aggregates map is based on the amalgamated column map
582  // We can skip global-to-local conversion if LIDs in row map are
583  // same as LIDs in column map
584  bool goodMap;
585  {
586  SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
587  goodMap = isGoodMap(*rowMap, *colMap);
588  }
589  // FIXME_KOKKOS: need to proofread later code for bad maps
590  TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
591  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
592  "(i.e. \"matching\" row and column maps)");
593 
594  // STEP 1: do unamalgamation
595  // The non-kokkos version uses member functions from the AmalgamationInfo
596  // container class to unamalgamate the data. In contrast, the kokkos
597  // version of TentativePFactory does the unamalgamation here and only uses
598  // the data of the AmalgamationInfo container class
599 
600  // Extract information for unamalgamation
601  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
602  GO indexBase;
603  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
604  GO globalOffset = amalgInfo->GlobalOffset();
605 
606  // Extract aggregation info (already in Kokkos host views)
607  auto procWinner = aggregates->GetProcWinner() ->template getLocalView<DeviceType>();
608  auto vertex2AggId = aggregates->GetVertex2AggId()->template getLocalView<DeviceType>();
609  const size_t numAggregates = aggregates->GetNumAggregates();
610 
611  int myPID = aggregates->GetMap()->getComm()->getRank();
612 
613  // Create Kokkos::View (on the device) to store the aggreate dof sizes
614  // Later used to get aggregate dof offsets
615  // NOTE: This zeros itself on construction
616  typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
617  AggSizeType aggDofSizes;
618 
619  if (stridedBlockSize == 1) {
620  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
621 
622  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
623  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
624 
625  auto sizesConst = aggregates->ComputeAggregateSizes();
626  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), sizesConst);
627 
628  } else {
629  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
630 
631  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
632  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
633 
634  auto nodeMap = aggregates->GetMap()->getLocalMap();
635  auto dofMap = colMap->getLocalMap();
636 
637  Kokkos::parallel_for("MueLu:TentativePF:Build:compute_agg_sizes", range_type(0,numAggregates),
638  KOKKOS_LAMBDA(const LO agg) {
639  auto aggRowView = aggGraph.rowConst(agg);
640 
641  size_t size = 0;
642  for (LO colID = 0; colID < aggRowView.length; colID++) {
643  GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
644 
645  for (LO k = 0; k < stridedBlockSize; k++) {
646  GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
647 
648  if (dofMap.getLocalElement(dofGID) != INVALID)
649  size++;
650  }
651  }
652  aggDofSizes(agg+1) = size;
653  });
654  }
655 
656  // Find maximum dof size for aggregates
657  // Later used to reserve enough scratch space for local QR decompositions
658  LO maxAggSize = 0;
659  ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
660  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
661 
662  // parallel_scan (exclusive)
663  // The aggDofSizes View then contains the aggregate dof offsets
664  Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
665  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
666  update += aggDofSizes(i);
667  if (final_pass)
668  aggDofSizes(i) = update;
669  });
670 
671  // Create Kokkos::View on the device to store mapping
672  // between (local) aggregate id and row map ids (LIDs)
673  Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
674  {
675  SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
676 
677  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
678  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
679 
680  Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
681  KOKKOS_LAMBDA(const LO lnode) {
682  if (procWinner(lnode, 0) == myPID) {
683  // No need for atomics, it's one-to-one
684  auto aggID = vertex2AggId(lnode,0);
685 
686  auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
687  // FIXME: I think this may be wrong
688  // We unconditionally add the whole block here. When we calculated
689  // aggDofSizes, we did the isLocalElement check. Something's fishy.
690  for (LO k = 0; k < stridedBlockSize; k++)
691  agg2RowMapLO(offset + k) = lnode*stridedBlockSize + k;
692  }
693  });
694  }
695 
696  // STEP 2: prepare local QR decomposition
697  // Reserve memory for tentative prolongation operator
698  coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
699 
700  // Pull out the nullspace vectors so that we can have random access (on the device)
701  auto fineNS = fineNullspace ->template getLocalView<DeviceType>();
702  auto coarseNS = coarseNullspace->template getLocalView<DeviceType>();
703 
704  size_t nnz = 0; // actual number of nnz
705 
706  typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
707  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
708  typedef typename local_matrix_type::index_type::non_const_type cols_type;
709  typedef typename local_matrix_type::values_type::non_const_type vals_type;
710 
711 
712  // Device View for status (error messages...)
713  typedef Kokkos::View<int[10], DeviceType> status_type;
714  status_type status("status");
715 
716  typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
717  typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
718 
719  rows_type rows;
720  cols_type cols;
721  vals_type vals;
722 
723  const ParameterList& pL = GetParameterList();
724  const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
725  if (!doQRStep) {
726  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
727  if (NSDim>1)
728  GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
729  }
730 
731  if (NSDim == 1) {
732  // 1D is special, as it is the easiest. We don't even need to the QR,
733  // just normalize an array. Plus, no worries abot small aggregates. In
734  // addition, we do not worry about compression. It is unlikely that
735  // nullspace will have zeros. If it does, a prolongator row would be
736  // zero and we'll get singularity anyway.
737  SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
738 
739  nnz = numRows;
740 
741  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
742  rows = rows_type("Ptent_rows", numRows+1);
743  cols = cols_type(Kokkos::ViewAllocateWithoutInitializing("Ptent_cols"), numRows);
744  vals = vals_type(Kokkos::ViewAllocateWithoutInitializing("Ptent_vals"), numRows);
745 
746  // Set up team policy with numAggregates teams and one thread per team.
747  // Each team handles a slice of the data associated with one aggregate
748  // and performs a local QR decomposition (in this case real QR is
749  // unnecessary).
750  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
751 
752  if (doQRStep) {
753  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop", policy,
754  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
755  auto agg = thread.league_rank();
756 
757  // size of the aggregate (number of DOFs in aggregate)
758  LO aggSize = aggRows(agg+1) - aggRows(agg);
759 
760  // Extract the piece of the nullspace corresponding to the aggregate, and
761  // put it in the flat array, "localQR" (in column major format) for the
762  // QR routine. Trivial in 1D.
763  auto norm = ATS::magnitude(zero);
764 
765  // Calculate QR by hand
766  // FIXME: shouldn't there be stridedblock here?
767  // FIXME_KOKKOS: shouldn't there be stridedblock here?
768  for (decltype(aggSize) k = 0; k < aggSize; k++) {
769  auto dnorm = ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
770  norm += dnorm*dnorm;
771  }
772  norm = sqrt(norm);
773 
774  if (norm == zero) {
775  // zero column; terminate the execution
776  statusAtomic(1) = true;
777  return;
778  }
779 
780  // R = norm
781  coarseNS(agg, 0) = norm;
782 
783  // Q = localQR(:,0)/norm
784  for (decltype(aggSize) k = 0; k < aggSize; k++) {
785  LO localRow = agg2RowMapLO(aggRows(agg)+k);
786  SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
787 
788  rows(localRow+1) = localRow+1;
789  cols(localRow) = agg;
790  vals(localRow) = localVal;
791 
792  }
793  });
794 
795  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
796  Kokkos::deep_copy(statusHost, status);
797  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
798  if (statusHost(i)) {
799  std::ostringstream oss;
800  oss << "MueLu::TentativePFactory::MakeTentative: ";
801  switch (i) {
802  case 0: oss << "!goodMap is not implemented"; break;
803  case 1: oss << "fine level NS part has a zero column"; break;
804  }
805  throw Exceptions::RuntimeError(oss.str());
806  }
807 
808  } else {
809  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
810  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
811  auto agg = thread.league_rank();
812 
813  // size of the aggregate (number of DOFs in aggregate)
814  LO aggSize = aggRows(agg+1) - aggRows(agg);
815 
816  // R = norm
817  coarseNS(agg, 0) = one;
818 
819  // Q = localQR(:,0)/norm
820  for (decltype(aggSize) k = 0; k < aggSize; k++) {
821  LO localRow = agg2RowMapLO(aggRows(agg)+k);
822  SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0);
823 
824  rows(localRow+1) = localRow+1;
825  cols(localRow) = agg;
826  vals(localRow) = localVal;
827 
828  }
829  });
830  }
831 
832  } else { // NSdim > 1
833  // FIXME_KOKKOS: This code branch is completely unoptimized.
834  // Work to do:
835  // - Optimize QR decomposition
836  // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
837  // packing new values in the beginning of each row
838  // We do use auxilary view in this case, so keep a second rows view for
839  // counting nonzeros in rows
840 
841  // NOTE: the allocation (initialization) of these view takes noticeable time
842  size_t nnzEstimate = numRows * NSDim;
843  rows_type rowsAux("Ptent_aux_rows", numRows+1);
844  cols_type colsAux("Ptent_aux_cols", nnzEstimate);
845  vals_type valsAux("Ptent_aux_vals", nnzEstimate);
846  rows = rows_type("Ptent_rows", numRows+1);
847  {
848  // Stage 0: fill in views.
849  SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
850 
851  // The main thing to notice is initialization of vals with INVALID. These
852  // values will later be used to compress the arrays
853  Kokkos::parallel_for("MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows+1),
854  KOKKOS_LAMBDA(const LO row) {
855  rowsAux(row) = row*NSDim;
856  });
857  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
858  KOKKOS_LAMBDA(const LO j) {
859  colsAux(j) = INVALID;
860  valsAux(j) = zero;
861  });
862  }
863 
864  {
865  SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
866  // Set up team policy with numAggregates teams and one thread per team.
867  // Each team handles a slice of the data associated with one aggregate
868  // and performs a local QR decomposition
869  const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1); // numAggregates teams a 1 thread
870  LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
871  decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
872  decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
873  decltype(valsAux)>
874  localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
875  rows, rowsAux, colsAux, valsAux, doQRStep);
876  Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
877  }
878 
879  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
880  Kokkos::deep_copy(statusHost, status);
881  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
882  if (statusHost(i)) {
883  std::ostringstream oss;
884  oss << "MueLu::TentativePFactory::MakeTentative: ";
885  switch(i) {
886  case 0: oss << "!goodMap is not implemented"; break;
887  case 1: oss << "fine level NS part has a zero column"; break;
888  }
889  throw Exceptions::RuntimeError(oss.str());
890  }
891 
892  // Compress the cols and vals by ignoring INVALID column entries that correspond
893  // to 0 in QR.
894 
895  // The real cols and vals are constructed using calculated (not estimated) nnz
896  cols = decltype(cols)("Ptent_cols", nnz);
897  vals = decltype(vals)("Ptent_vals", nnz);
898  {
899  // Stage 2: compress the arrays
900  SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
901 
902  Kokkos::parallel_scan("MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1),
903  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
904  upd += rows(i);
905  if (final)
906  rows(i) = upd;
907  });
908  }
909 
910  {
911  SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
912 
913  // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
914  // to the beginning of rows. See CoalesceDropFactory_kokkos for
915  // example.
916  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows),
917  KOKKOS_LAMBDA(const LO i) {
918  LO rowStart = rows(i);
919 
920  size_t lnnz = 0;
921  for (auto j = rowsAux(i); j < rowsAux(i+1); j++)
922  if (colsAux(j) != INVALID) {
923  cols(rowStart+lnnz) = colsAux(j);
924  vals(rowStart+lnnz) = valsAux(j);
925  lnnz++;
926  }
927  });
928  }
929  }
930 
931  GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
932 
933  {
934  // Stage 3: construct Xpetra::Matrix
935  SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
936 
937  local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getNodeNumElements(), nnz, vals, rows, cols);
938 
939  // Managing labels & constants for ESFC
940  RCP<ParameterList> FCparams;
941  if (pL.isSublist("matrixmatrix: kernel params"))
942  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
943  else
944  FCparams = rcp(new ParameterList);
945 
946  // By default, we don't need global constants for TentativeP
947  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
948  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
949 
950  auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
951  Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
952  }
953  }
954 
955  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
956  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
957  BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates_kokkos> /* aggregates */, RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
958  RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */, RCP<MultiVector>& /* coarseNullspace */) const {
959  throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
960  }
961 
962  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
963  bool TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
964  isGoodMap(const Map& rowMap, const Map& colMap) const {
965  auto rowLocalMap = rowMap.getLocalMap();
966  auto colLocalMap = colMap.getLocalMap();
967 
968  const size_t numRows = rowLocalMap.getNodeNumElements();
969  const size_t numCols = colLocalMap.getNodeNumElements();
970 
971  if (numCols < numRows)
972  return false;
973 
974  size_t numDiff = 0;
975  Kokkos::parallel_reduce("MueLu:TentativePF:isGoodMap", range_type(0, numRows),
976  KOKKOS_LAMBDA(const LO i, size_t &diff) {
977  diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
978  }, numDiff);
979 
980  return (numDiff == 0);
981  }
982 
983 } //namespace MueLu
984 
985 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
986 #endif // HAVE_MUELU_KOKKOS_REFACTOR
987 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< Kokkos::Impl::is_execution_policy< ExecPolicy >::value >::type *=0)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
GlobalOrdinal GO
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename Impl::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=0)
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Print more statistics.
KOKKOS_INLINE_FUNCTION Kokkos::complex< RealType > pow(const complex< RealType > &x, const RealType &e)
LocalOrdinal LO
static const NoFactory * get()
void deep_copy(const View< DT, DP...> &dst, typename ViewTraits< DT, DP...>::const_value_type &value, typename std::enable_if< std::is_same< typename ViewTraits< DT, DP...>::specialize, void >::value >::type *=0)
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
size_t global_size_t
KOKKOS_INLINE_FUNCTION Kokkos::complex< RealType > sqrt(const complex< RealType > &x)
KOKKOS_FORCEINLINE_FUNCTION constexpr pair< T1, T2 > make_pair(T1 x, T2 y)
Scalar SC
Description of what is happening (more verbose)
#define SET_VALID_ENTRY(name)