MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_NotayAggregationFactory_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef MUELU_NOTAYAGGREGATIONFACTORY_DEF_HPP_
11 #define MUELU_NOTAYAGGREGATIONFACTORY_DEF_HPP_
12 
13 #include <Xpetra_Map.hpp>
14 #include <Xpetra_Vector.hpp>
15 #include <Xpetra_MultiVectorFactory.hpp>
16 #include <Xpetra_MapFactory.hpp>
17 #include <Xpetra_VectorFactory.hpp>
18 
19 #include "KokkosKernels_Handle.hpp"
20 #include "KokkosSparse_spgemm.hpp"
21 #include "KokkosSparse_spmv.hpp"
22 
24 
25 #include "MueLu_Aggregates.hpp"
26 #include "MueLu_LWGraph.hpp"
27 #include "MueLu_LWGraph_kokkos.hpp"
28 #include "MueLu_Level.hpp"
29 #include "MueLu_MasterList.hpp"
30 #include "MueLu_Monitor.hpp"
31 #include "MueLu_Types.hpp"
32 #include "MueLu_Utilities.hpp"
33 
34 namespace MueLu {
35 
36 namespace NotayUtils {
37 template <class LocalOrdinal>
39  return min + as<LocalOrdinal>((max - min + 1) * (static_cast<double>(std::rand()) / (RAND_MAX + 1.0)));
40 }
41 
42 template <class LocalOrdinal>
44  typedef LocalOrdinal LO;
45  LO n = Teuchos::as<LO>(list.size());
46  for (LO i = 0; i < n - 1; i++)
47  std::swap(list[i], list[RandomOrdinal(i, n - 1)]);
48 }
49 } // namespace NotayUtils
50 
51 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
53  RCP<ParameterList> validParamList = rcp(new ParameterList());
54 
55 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
56  SET_VALID_ENTRY("aggregation: pairwise: size");
57  SET_VALID_ENTRY("aggregation: pairwise: tie threshold");
58  SET_VALID_ENTRY("aggregation: Dirichlet threshold");
59  SET_VALID_ENTRY("aggregation: ordering");
60 #undef SET_VALID_ENTRY
61 
62  // general variables needed in AggregationFactory
63  validParamList->set<RCP<const FactoryBase>>("A", null, "Generating factory of the matrix");
64  validParamList->set<RCP<const FactoryBase>>("Graph", null, "Generating factory of the graph");
65  validParamList->set<RCP<const FactoryBase>>("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'");
66 
67  return validParamList;
68 }
69 
70 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
72  Input(currentLevel, "A");
73  Input(currentLevel, "Graph");
74  Input(currentLevel, "DofsPerNode");
75 }
76 
77 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
79  FactoryMonitor m(*this, "Build", currentLevel);
81  using MT = typename STS::magnitudeType;
82 
84 
86  if (const char* dbg = std::getenv("MUELU_PAIRWISEAGGREGATION_DEBUG")) {
87  out = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
88  out->setShowAllFrontMatter(false).setShowProcRank(true);
89  } else {
90  out = Teuchos::getFancyOStream(rcp(new Teuchos::oblackholestream()));
91  }
92 
93  const ParameterList& pL = GetParameterList();
94 
95  const MT kappa = static_cast<MT>(pL.get<double>("aggregation: Dirichlet threshold"));
96  TEUCHOS_TEST_FOR_EXCEPTION(kappa <= MT_TWO,
98  "Pairwise requires kappa > 2"
99  " otherwise all rows are considered as Dirichlet rows.");
100 
101  // Parameters
102  int maxNumIter = 3;
103  if (pL.isParameter("aggregation: pairwise: size"))
104  maxNumIter = pL.get<int>("aggregation: pairwise: size");
105  TEUCHOS_TEST_FOR_EXCEPTION(maxNumIter < 1,
107  "NotayAggregationFactory::Build(): \"aggregation: pairwise: size\""
108  " must be a strictly positive integer");
109 
110  RCP<const LWGraph> graph;
111  if (IsType<RCP<LWGraph>>(currentLevel, "Graph"))
112  graph = Get<RCP<LWGraph>>(currentLevel, "Graph");
113  else {
114  auto graph_k = Get<RCP<LWGraph_kokkos>>(currentLevel, "Graph");
115  graph = graph_k->copyToHost();
116  }
117  RCP<const Matrix> A = Get<RCP<Matrix>>(currentLevel, "A");
118 
119  // Setup aggregates & aggStat objects
120  RCP<Aggregates> aggregates = rcp(new Aggregates(*graph));
121  aggregates->setObjectLabel("PW");
122 
123  const LO numRows = graph->GetNodeNumVertices();
124 
125  // construct aggStat information
126  std::vector<unsigned> aggStat(numRows, READY);
127 
128  const int DofsPerNode = Get<int>(currentLevel, "DofsPerNode");
130  "Pairwise only supports one dof per node");
131 
132  // This follows the paper:
133  // Notay, "Aggregation-based algebraic multigrid for convection-diffusion equations",
134  // SISC 34(3), pp. A2288-2316.
135 
136  // Handle Ordering
137  std::string orderingStr = pL.get<std::string>("aggregation: ordering");
138  enum {
139  O_NATURAL,
140  O_RANDOM,
141  O_CUTHILL_MCKEE,
142  } ordering;
143 
144  ordering = O_NATURAL;
145  if (orderingStr == "random")
146  ordering = O_RANDOM;
147  else if (orderingStr == "natural") {
148  } else if (orderingStr == "cuthill-mckee" || orderingStr == "cm")
149  ordering = O_CUTHILL_MCKEE;
150  else {
151  TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "Invalid ordering type");
152  }
153 
154  // Get an ordering vector
155  // NOTE: The orderingVector only orders *rows* of the matrix. Off-proc columns
156  // will get ignored in the aggregation phases, so we don't need to worry about
157  // running off the end.
158  Array<LO> orderingVector(numRows);
159  for (LO i = 0; i < numRows; i++)
160  orderingVector[i] = i;
161  if (ordering == O_RANDOM)
162  MueLu::NotayUtils::RandomReorder(orderingVector);
163  else if (ordering == O_CUTHILL_MCKEE) {
165  auto localVector = rcmVector->getData(0);
166  for (LO i = 0; i < numRows; i++)
167  orderingVector[i] = localVector[i];
168  }
169 
170  // Get the party stated
171  LO numNonAggregatedNodes = numRows, numDirichletNodes = 0;
172  BuildInitialAggregates(pL, A, orderingVector(), kappa,
173  *aggregates, aggStat, numNonAggregatedNodes, numDirichletNodes);
174  TEUCHOS_TEST_FOR_EXCEPTION(0 < numNonAggregatedNodes, Exceptions::RuntimeError,
175  "Initial pairwise aggregation failed to aggregate all nodes");
176  LO numLocalAggregates = aggregates->GetNumAggregates();
177  GetOStream(Statistics0) << "Init : " << numLocalAggregates << " - "
178  << A->getLocalNumRows() / numLocalAggregates << std::endl;
179 
180  // Temporary data storage for further aggregation steps
181  local_matrix_type intermediateP;
182  local_matrix_type coarseLocalA;
183 
184  // Compute the on rank part of the local matrix
185  // that the square submatrix that only contains
186  // columns corresponding to local rows.
187  LO numLocalDirichletNodes = numDirichletNodes;
188  Array<LO> localVertex2AggId(aggregates->GetVertex2AggId()->getData(0).view(0, numRows));
189  BuildOnRankLocalMatrix(A->getLocalMatrixDevice(), coarseLocalA);
190  for (LO aggregationIter = 1; aggregationIter < maxNumIter; ++aggregationIter) {
191  // Compute the intermediate prolongator
192  BuildIntermediateProlongator(coarseLocalA.numRows(), numLocalDirichletNodes, numLocalAggregates,
193  localVertex2AggId(), intermediateP);
194 
195  // Compute the coarse local matrix and coarse row sum
196  BuildCoarseLocalMatrix(intermediateP, coarseLocalA);
197 
198  // Directly compute rowsum from A, rather than coarseA
199  row_sum_type rowSum("rowSum", numLocalAggregates);
200  {
201  std::vector<std::vector<LO>> agg2vertex(numLocalAggregates);
202  auto vertex2AggId = aggregates->GetVertex2AggId()->getData(0);
203  for (LO i = 0; i < (LO)numRows; i++) {
204  if (aggStat[i] != AGGREGATED)
205  continue;
206  LO agg = vertex2AggId[i];
207  agg2vertex[agg].push_back(i);
208  }
209 
210  typename row_sum_type::host_mirror_type rowSum_h = Kokkos::create_mirror_view(rowSum);
211  for (LO i = 0; i < numRows; i++) {
212  // If not aggregated already, skip this guy
213  if (aggStat[i] != AGGREGATED)
214  continue;
215  int agg = vertex2AggId[i];
216  std::vector<LO>& myagg = agg2vertex[agg];
217 
218  size_t nnz = A->getNumEntriesInLocalRow(i);
219  ArrayView<const LO> indices;
220  ArrayView<const SC> vals;
221  A->getLocalRowView(i, indices, vals);
222 
224  for (LO colidx = 0; colidx < static_cast<LO>(nnz); colidx++) {
225  bool found = false;
226  if (indices[colidx] < numRows) {
227  for (LO j = 0; j < (LO)myagg.size(); j++)
228  if (vertex2AggId[indices[colidx]] == agg)
229  found = true;
230  }
231  if (!found) {
232  *out << "- ADDING col " << indices[colidx] << " = " << vals[colidx] << std::endl;
233  mysum += vals[colidx];
234  } else {
235  *out << "- NOT ADDING col " << indices[colidx] << " = " << vals[colidx] << std::endl;
236  }
237  }
238 
239  rowSum_h[agg] = mysum;
240  }
241  Kokkos::deep_copy(rowSum, rowSum_h);
242  }
243 
244  // Get local orderingVector
245  Array<LO> localOrderingVector(numRows);
246  for (LO i = 0; i < numRows; i++)
247  localOrderingVector[i] = i;
248  if (ordering == O_RANDOM)
249  MueLu::NotayUtils::RandomReorder(localOrderingVector);
250  else if (ordering == O_CUTHILL_MCKEE) {
252  auto localVector = rcmVector->getData(0);
253  for (LO i = 0; i < numRows; i++)
254  localOrderingVector[i] = localVector[i];
255  }
256 
257  // Compute new aggregates
258  numLocalAggregates = 0;
259  numNonAggregatedNodes = static_cast<LO>(coarseLocalA.numRows());
260  std::vector<LO> localAggStat(numNonAggregatedNodes, READY);
261  localVertex2AggId.resize(numNonAggregatedNodes, -1);
262  BuildFurtherAggregates(pL, A, localOrderingVector, coarseLocalA, kappa, rowSum,
263  localAggStat, localVertex2AggId,
264  numLocalAggregates, numNonAggregatedNodes);
265 
266  // After the first initial pairwise aggregation
267  // the Dirichlet nodes have been removed.
268  numLocalDirichletNodes = 0;
269 
270  // Update the aggregates
271  RCP<LOMultiVector> vertex2AggIdMV = aggregates->GetVertex2AggId();
272  ArrayRCP<LO> vertex2AggId = vertex2AggIdMV->getDataNonConst(0);
273  for (size_t vertexIdx = 0; vertexIdx < A->getLocalNumRows(); ++vertexIdx) {
274  LO oldAggIdx = vertex2AggId[vertexIdx];
275  if (oldAggIdx != Teuchos::OrdinalTraits<LO>::invalid()) {
276  vertex2AggId[vertexIdx] = localVertex2AggId[oldAggIdx];
277  }
278  }
279 
280  // We could probably print some better statistics at some point
281  GetOStream(Statistics0) << "Iter " << aggregationIter << ": " << numLocalAggregates << " - "
282  << A->getLocalNumRows() / numLocalAggregates << std::endl;
283  }
284  aggregates->SetNumAggregates(numLocalAggregates);
285  aggregates->AggregatesCrossProcessors(false);
286  aggregates->ComputeAggregateSizes(true /*forceRecompute*/);
287 
288  // DO stuff
289  Set(currentLevel, "Aggregates", aggregates);
290  GetOStream(Statistics0) << aggregates->description() << std::endl;
291 }
292 
293 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
296  const RCP<const Matrix>& A,
297  const Teuchos::ArrayView<const LO>& orderingVector,
299  Aggregates& aggregates,
300  std::vector<unsigned>& aggStat,
301  LO& numNonAggregatedNodes,
302  LO& numDirichletNodes) const {
303  Monitor m(*this, "BuildInitialAggregates");
304  using STS = Teuchos::ScalarTraits<Scalar>;
305  using MT = typename STS::magnitudeType;
307 
309  if (const char* dbg = std::getenv("MUELU_PAIRWISEAGGREGATION_DEBUG")) {
310  out = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
311  out->setShowAllFrontMatter(false).setShowProcRank(true);
312  } else {
313  out = Teuchos::getFancyOStream(rcp(new Teuchos::oblackholestream()));
314  }
315 
316  const SC SC_ZERO = Teuchos::ScalarTraits<SC>::zero();
317  const MT MT_ZERO = Teuchos::ScalarTraits<MT>::zero();
318  const MT MT_ONE = Teuchos::ScalarTraits<MT>::one();
319  const MT MT_TWO = MT_ONE + MT_ONE;
320  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
321  const LO LO_ZERO = Teuchos::OrdinalTraits<LO>::zero();
322 
323  const MT kappa_init = kappa / (kappa - MT_TWO);
324  const LO numRows = aggStat.size();
325  const int myRank = A->getMap()->getComm()->getRank();
326 
327  // For finding "ties" where we fall back to the ordering. Making this larger than
328  // hard zero substantially increases code robustness.
329  double tie_criterion = params.get<double>("aggregation: pairwise: tie threshold");
330  double tie_less = 1.0 - tie_criterion;
331  double tie_more = 1.0 + tie_criterion;
332 
333  // NOTE: Assumes 1 dof per node. This constraint is enforced in Build(),
334  // and so we're not doing again here.
335  // This should probably be fixed at some point.
336 
337  // Extract diagonal, rowsums, etc
338  // NOTE: The ghostedRowSum vector here has has the sign flipped from Notay's S
342  const ArrayRCP<const SC> D = ghostedDiag->getData(0);
343  const ArrayRCP<const SC> S = ghostedRowSum->getData(0);
344  const ArrayRCP<const MT> AbsRs = ghostedAbsRowSum->getData(0);
345 
346  // Aggregates stuff
347  ArrayRCP<LO> vertex2AggId_rcp = aggregates.GetVertex2AggId()->getDataNonConst(0);
348  ArrayRCP<LO> procWinner_rcp = aggregates.GetProcWinner()->getDataNonConst(0);
349  ArrayView<LO> vertex2AggId = vertex2AggId_rcp();
350  ArrayView<LO> procWinner = procWinner_rcp();
351 
352  // Algorithm 4.2
353 
354  // 0,1 : Initialize: Flag boundary conditions
355  // Modification: We assume symmetry here aij = aji
356  for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
357  MT aii = STS::magnitude(D[row]);
358  MT rowsum = AbsRs[row];
359 
360  if (aii >= kappa_init * rowsum) {
361  *out << "Flagging index " << row << " as dirichlet "
362  "aii >= kappa*rowsum = "
363  << aii << " >= " << kappa_init << " " << rowsum << std::endl;
364  aggStat[row] = IGNORED;
365  --numNonAggregatedNodes;
366  ++numDirichletNodes;
367  }
368  }
369 
370  // 2 : Iteration
371  LO aggIndex = LO_ZERO;
372  for (LO i = 0; i < numRows; i++) {
373  LO current_idx = orderingVector[i];
374  // If we're aggregated already, skip this guy
375  if (aggStat[current_idx] != READY)
376  continue;
377 
378  MT best_mu = MT_ZERO;
379  LO best_idx = LO_INVALID;
380 
381  size_t nnz = A->getNumEntriesInLocalRow(current_idx);
382  ArrayView<const LO> indices;
383  ArrayView<const SC> vals;
384  A->getLocalRowView(current_idx, indices, vals);
385 
386  MT aii = STS::real(D[current_idx]);
387  MT si = STS::real(S[current_idx]);
388  for (LO colidx = 0; colidx < static_cast<LO>(nnz); colidx++) {
389  // Skip aggregated neighbors, off-rank neighbors, hard zeros and self
390  LO col = indices[colidx];
391  SC val = vals[colidx];
392  if (current_idx == col || col >= numRows || aggStat[col] != READY || val == SC_ZERO)
393  continue;
394 
395  MT aij = STS::real(val);
396  MT ajj = STS::real(D[col]);
397  MT sj = -STS::real(S[col]); // NOTE: The ghostedRowSum vector here has has the sign flipped from Notay's S
398  if (aii - si + ajj - sj >= MT_ZERO) {
399  // Modification: We assume symmetry here aij = aji
400  MT mu_top = MT_TWO / (MT_ONE / aii + MT_ONE / ajj);
401  MT mu_bottom = -aij + MT_ONE / (MT_ONE / (aii - si) + MT_ONE / (ajj - sj));
402  MT mu = mu_top / mu_bottom;
403 
404  // Modification: Explicitly check the tie criterion here
405  if (mu > MT_ZERO && (best_idx == LO_INVALID || mu < best_mu * tie_less ||
406  (mu < best_mu * tie_more && orderingVector[col] < orderingVector[best_idx]))) {
407  best_mu = mu;
408  best_idx = col;
409  *out << "[" << current_idx << "] Column UPDATED " << col << ": "
410  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
411  << " = " << aii - si + ajj - sj << ", aij = " << aij << ", mu = " << mu << std::endl;
412  } else {
413  *out << "[" << current_idx << "] Column NOT UPDATED " << col << ": "
414  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
415  << " = " << aii - si + ajj - sj << ", aij = " << aij << ", mu = " << mu << std::endl;
416  }
417  } else {
418  *out << "[" << current_idx << "] Column FAILED " << col << ": "
419  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
420  << " = " << aii - si + ajj - sj << std::endl;
421  }
422  } // end column for loop
423 
424  if (best_idx == LO_INVALID) {
425  *out << "No node buddy found for index " << current_idx
426  << " [agg " << aggIndex << "]\n"
427  << std::endl;
428  // We found no potential node-buddy, so let's just make this a singleton
429  // NOTE: The behavior of what to do if you have no un-aggregated neighbors is not specified in
430  // the paper
431 
432  aggStat[current_idx] = ONEPT;
433  vertex2AggId[current_idx] = aggIndex;
434  procWinner[current_idx] = myRank;
435  numNonAggregatedNodes--;
436  aggIndex++;
437 
438  } else {
439  // We have a buddy, so aggregate, either as a singleton or as a pair, depending on mu
440  if (best_mu <= kappa) {
441  *out << "Node buddies (" << current_idx << "," << best_idx << ") [agg " << aggIndex << "]" << std::endl;
442 
443  aggStat[current_idx] = AGGREGATED;
444  aggStat[best_idx] = AGGREGATED;
445  vertex2AggId[current_idx] = aggIndex;
446  vertex2AggId[best_idx] = aggIndex;
447  procWinner[current_idx] = myRank;
448  procWinner[best_idx] = myRank;
449  numNonAggregatedNodes -= 2;
450  aggIndex++;
451 
452  } else {
453  *out << "No buddy found for index " << current_idx << ","
454  " but aggregating as singleton [agg "
455  << aggIndex << "]" << std::endl;
456 
457  aggStat[current_idx] = ONEPT;
458  vertex2AggId[current_idx] = aggIndex;
459  procWinner[current_idx] = myRank;
460  numNonAggregatedNodes--;
461  aggIndex++;
462  } // best_mu
463  } // best_idx
464  } // end Algorithm 4.2
465 
466  *out << "vertex2aggid :";
467  for (int i = 0; i < static_cast<int>(vertex2AggId.size()); ++i) {
468  *out << i << "(" << vertex2AggId[i] << ")";
469  }
470  *out << std::endl;
471 
472  // update aggregate object
473  aggregates.SetNumAggregates(aggIndex);
474 } // BuildInitialAggregates
475 
476 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
479  const RCP<const Matrix>& A,
480  const Teuchos::ArrayView<const LO>& orderingVector,
481  const typename Matrix::local_matrix_type& coarseA,
483 #if KOKKOS_VERSION >= 40799
484  const Kokkos::View<typename KokkosKernels::ArithTraits<Scalar>::val_type*,
485 #else
486  const Kokkos::View<typename Kokkos::ArithTraits<Scalar>::val_type*,
487 #endif
488  Kokkos::LayoutLeft,
489  typename Matrix::local_matrix_type::device_type>& rowSum,
490  std::vector<LocalOrdinal>& localAggStat,
491  Teuchos::Array<LocalOrdinal>& localVertex2AggID,
492  LO& numLocalAggregates,
493  LO& numNonAggregatedNodes) const {
494  Monitor m(*this, "BuildFurtherAggregates");
495 
496  // Set debug outputs based on environment variable
498  if (const char* dbg = std::getenv("MUELU_PAIRWISEAGGREGATION_DEBUG")) {
499  out = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
500  out->setShowAllFrontMatter(false).setShowProcRank(true);
501  } else {
502  out = Teuchos::getFancyOStream(rcp(new Teuchos::oblackholestream()));
503  }
504 
505  using value_type = typename local_matrix_type::value_type;
506 #if KOKKOS_VERSION >= 40799
507  const value_type KAT_zero = KokkosKernels::ArithTraits<value_type>::zero();
508 #else
509  const value_type KAT_zero = Kokkos::ArithTraits<value_type>::zero();
510 #endif
513  const magnitude_type MT_two = MT_one + MT_one;
514  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
515 
516  // For finding "ties" where we fall back to the ordering. Making this larger than
517  // hard zero substantially increases code robustness.
518  double tie_criterion = params.get<double>("aggregation: pairwise: tie threshold");
519  double tie_less = 1.0 - tie_criterion;
520  double tie_more = 1.0 + tie_criterion;
521 
522  typename row_sum_type::host_mirror_type rowSum_h = Kokkos::create_mirror_view(rowSum);
523  Kokkos::deep_copy(rowSum_h, rowSum);
524 
525  // Extracting the diagonal of a KokkosSparse::CrsMatrix
526  // is not currently provided in kokkos-kernels so here
527  // is an ugly way to get that done...
528  const LO numRows = static_cast<LO>(coarseA.numRows());
529  typename local_matrix_type::values_type::host_mirror_type diagA_h("diagA host", numRows);
530  typename local_matrix_type::row_map_type::host_mirror_type row_map_h = Kokkos::create_mirror_view(coarseA.graph.row_map);
531  Kokkos::deep_copy(row_map_h, coarseA.graph.row_map);
532  typename local_matrix_type::index_type::host_mirror_type entries_h = Kokkos::create_mirror_view(coarseA.graph.entries);
533  Kokkos::deep_copy(entries_h, coarseA.graph.entries);
534  typename local_matrix_type::values_type::host_mirror_type values_h = Kokkos::create_mirror_view(coarseA.values);
535  Kokkos::deep_copy(values_h, coarseA.values);
536  for (LO rowIdx = 0; rowIdx < numRows; ++rowIdx) {
537  for (LO entryIdx = static_cast<LO>(row_map_h(rowIdx));
538  entryIdx < static_cast<LO>(row_map_h(rowIdx + 1));
539  ++entryIdx) {
540  if (rowIdx == static_cast<LO>(entries_h(entryIdx))) {
541  diagA_h(rowIdx) = values_h(entryIdx);
542  }
543  }
544  }
545 
546  for (LO currentIdx = 0; currentIdx < numRows; ++currentIdx) {
547  if (localAggStat[currentIdx] != READY) {
548  continue;
549  }
550 
553  const magnitude_type aii = Teuchos::ScalarTraits<value_type>::real(diagA_h(currentIdx));
554  const magnitude_type si = Teuchos::ScalarTraits<value_type>::real(rowSum_h(currentIdx));
555  for (auto entryIdx = row_map_h(currentIdx); entryIdx < row_map_h(currentIdx + 1); ++entryIdx) {
556  const LO colIdx = static_cast<LO>(entries_h(entryIdx));
557  if (currentIdx == colIdx || colIdx >= numRows || localAggStat[colIdx] != READY || values_h(entryIdx) == KAT_zero) {
558  continue;
559  }
560 
561  const magnitude_type aij = Teuchos::ScalarTraits<value_type>::real(values_h(entryIdx));
562  const magnitude_type ajj = Teuchos::ScalarTraits<value_type>::real(diagA_h(colIdx));
563  const magnitude_type sj = -Teuchos::ScalarTraits<value_type>::real(rowSum_h(colIdx)); // NOTE: The ghostedRowSum vector here has has the sign flipped from Notay's S
564  if (aii - si + ajj - sj >= MT_zero) {
565  const magnitude_type mu_top = MT_two / (MT_one / aii + MT_one / ajj);
566  const magnitude_type mu_bottom = -aij + MT_one / (MT_one / (aii - si) + MT_one / (ajj - sj));
567  const magnitude_type mu = mu_top / mu_bottom;
568 
569  // Modification: Explicitly check the tie criterion here
570  if (mu > MT_zero && (bestIdx == LO_INVALID || mu < best_mu * tie_less ||
571  (mu < best_mu * tie_more && orderingVector[colIdx] < orderingVector[bestIdx]))) {
572  best_mu = mu;
573  bestIdx = colIdx;
574  *out << "[" << currentIdx << "] Column UPDATED " << colIdx << ": "
575  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
576  << " = " << aii - si + ajj - sj << ", aij = " << aij << " mu = " << mu << std::endl;
577  } else {
578  *out << "[" << currentIdx << "] Column NOT UPDATED " << colIdx << ": "
579  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
580  << " = " << aii - si + ajj - sj << ", aij = " << aij << ", mu = " << mu << std::endl;
581  }
582  } else {
583  *out << "[" << currentIdx << "] Column FAILED " << colIdx << ": "
584  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
585  << " = " << aii - si + ajj - sj << std::endl;
586  }
587  } // end loop over row entries
588 
589  if (bestIdx == Teuchos::OrdinalTraits<LO>::invalid()) {
590  localAggStat[currentIdx] = ONEPT;
591  localVertex2AggID[currentIdx] = numLocalAggregates;
592  --numNonAggregatedNodes;
593  ++numLocalAggregates;
594  } else {
595  if (best_mu <= kappa) {
596  *out << "Node buddies (" << currentIdx << "," << bestIdx << ") [agg " << numLocalAggregates << "]" << std::endl;
597 
598  localAggStat[currentIdx] = AGGREGATED;
599  localVertex2AggID[currentIdx] = numLocalAggregates;
600  --numNonAggregatedNodes;
601 
602  localAggStat[bestIdx] = AGGREGATED;
603  localVertex2AggID[bestIdx] = numLocalAggregates;
604  --numNonAggregatedNodes;
605 
606  ++numLocalAggregates;
607  } else {
608  *out << "No buddy found for index " << currentIdx << ","
609  " but aggregating as singleton [agg "
610  << numLocalAggregates << "]" << std::endl;
611 
612  localAggStat[currentIdx] = ONEPT;
613  localVertex2AggID[currentIdx] = numLocalAggregates;
614  --numNonAggregatedNodes;
615  ++numLocalAggregates;
616  }
617  }
618  } // end loop over matrix rows
619 
620 } // BuildFurtherAggregates
621 
622 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
624  BuildOnRankLocalMatrix(const typename Matrix::local_matrix_type& localA,
625  typename Matrix::local_matrix_type& onrankA) const {
626  Monitor m(*this, "BuildOnRankLocalMatrix");
627 
628  // Set debug outputs based on environment variable
630  if (const char* dbg = std::getenv("MUELU_PAIRWISEAGGREGATION_DEBUG")) {
631  out = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
632  out->setShowAllFrontMatter(false).setShowProcRank(true);
633  } else {
634  out = Teuchos::getFancyOStream(rcp(new Teuchos::oblackholestream()));
635  }
636 
637  using local_graph_type = typename local_matrix_type::staticcrsgraph_type;
638  using values_type = typename local_matrix_type::values_type;
639  using size_type = typename local_graph_type::size_type;
640  using col_index_type = typename local_graph_type::data_type;
641  using array_layout = typename local_graph_type::array_layout;
642  using memory_traits = typename local_graph_type::memory_traits;
643  using row_pointer_type = Kokkos::View<size_type*, array_layout, device_type, memory_traits>;
644  using col_indices_type = Kokkos::View<col_index_type*, array_layout, device_type, memory_traits>;
645  // Extract on rank part of A
646  // Simply check that the column index is less than the number of local rows
647  // otherwise remove it.
648 
649  const int numRows = static_cast<int>(localA.numRows());
650  row_pointer_type rowPtr("onrankA row pointer", numRows + 1);
651  typename row_pointer_type::host_mirror_type rowPtr_h = Kokkos::create_mirror_view(rowPtr);
652  typename local_graph_type::row_map_type::host_mirror_type origRowPtr_h = Kokkos::create_mirror_view(localA.graph.row_map);
653  typename local_graph_type::entries_type::host_mirror_type origColind_h = Kokkos::create_mirror_view(localA.graph.entries);
654  typename values_type::host_mirror_type origValues_h = Kokkos::create_mirror_view(localA.values);
655  Kokkos::deep_copy(origRowPtr_h, localA.graph.row_map);
656  Kokkos::deep_copy(origColind_h, localA.graph.entries);
657  Kokkos::deep_copy(origValues_h, localA.values);
658 
659  // Compute the number of nnz entries per row
660  rowPtr_h(0) = 0;
661  for (int rowIdx = 0; rowIdx < numRows; ++rowIdx) {
662  for (size_type entryIdx = origRowPtr_h(rowIdx); entryIdx < origRowPtr_h(rowIdx + 1); ++entryIdx) {
663  if (origColind_h(entryIdx) < numRows) {
664  rowPtr_h(rowIdx + 1) += 1;
665  }
666  }
667  rowPtr_h(rowIdx + 1) = rowPtr_h(rowIdx + 1) + rowPtr_h(rowIdx);
668  }
669  Kokkos::deep_copy(rowPtr, rowPtr_h);
670 
671  const LO nnzOnrankA = rowPtr_h(numRows);
672 
673  // Now use nnz per row to allocate matrix views and store column indices and values
674  col_indices_type colInd("onrankA column indices", rowPtr_h(numRows));
675  values_type values("onrankA values", rowPtr_h(numRows));
676  typename col_indices_type::host_mirror_type colInd_h = Kokkos::create_mirror_view(colInd);
677  typename values_type::host_mirror_type values_h = Kokkos::create_mirror_view(values);
678  int entriesInRow;
679  for (int rowIdx = 0; rowIdx < numRows; ++rowIdx) {
680  entriesInRow = 0;
681  for (size_type entryIdx = origRowPtr_h(rowIdx); entryIdx < origRowPtr_h(rowIdx + 1); ++entryIdx) {
682  if (origColind_h(entryIdx) < numRows) {
683  colInd_h(rowPtr_h(rowIdx) + entriesInRow) = origColind_h(entryIdx);
684  values_h(rowPtr_h(rowIdx) + entriesInRow) = origValues_h(entryIdx);
685  ++entriesInRow;
686  }
687  }
688  }
689  Kokkos::deep_copy(colInd, colInd_h);
690  Kokkos::deep_copy(values, values_h);
691 
692  onrankA = local_matrix_type("onrankA", numRows, numRows,
693  nnzOnrankA, values, rowPtr, colInd);
694 }
695 
696 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
699  const LocalOrdinal numDirichletNodes,
700  const LocalOrdinal numLocalAggregates,
701  const Teuchos::ArrayView<const LocalOrdinal>& localVertex2AggID,
702  typename Matrix::local_matrix_type& intermediateP) const {
703  Monitor m(*this, "BuildIntermediateProlongator");
704 
705  // Set debug outputs based on environment variable
707  if (const char* dbg = std::getenv("MUELU_PAIRWISEAGGREGATION_DEBUG")) {
708  out = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
709  out->setShowAllFrontMatter(false).setShowProcRank(true);
710  } else {
711  out = Teuchos::getFancyOStream(rcp(new Teuchos::oblackholestream()));
712  }
713 
714  using local_graph_type = typename local_matrix_type::staticcrsgraph_type;
715  using values_type = typename local_matrix_type::values_type;
716  using size_type = typename local_graph_type::size_type;
717  using col_index_type = typename local_graph_type::data_type;
718  using array_layout = typename local_graph_type::array_layout;
719  using memory_traits = typename local_graph_type::memory_traits;
720  using row_pointer_type = Kokkos::View<size_type*, array_layout, device_type, memory_traits>;
721  using col_indices_type = Kokkos::View<col_index_type*, array_layout, device_type, memory_traits>;
722 
723  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
724 
725  const int intermediatePnnz = numRows - numDirichletNodes;
726  row_pointer_type rowPtr("intermediateP row pointer", numRows + 1);
727  col_indices_type colInd("intermediateP column indices", intermediatePnnz);
728  values_type values("intermediateP values", intermediatePnnz);
729  typename row_pointer_type::host_mirror_type rowPtr_h = Kokkos::create_mirror_view(rowPtr);
730  typename col_indices_type::host_mirror_type colInd_h = Kokkos::create_mirror_view(colInd);
731 
732  rowPtr_h(0) = 0;
733  for (int rowIdx = 0; rowIdx < numRows; ++rowIdx) {
734  // Skip Dirichlet nodes
735  if (localVertex2AggID[rowIdx] == LO_INVALID) {
736  rowPtr_h(rowIdx + 1) = rowPtr_h(rowIdx);
737  } else {
738  rowPtr_h(rowIdx + 1) = rowPtr_h(rowIdx) + 1;
739  colInd_h(rowPtr_h(rowIdx)) = localVertex2AggID[rowIdx];
740  }
741  }
742 
743  Kokkos::deep_copy(rowPtr, rowPtr_h);
744  Kokkos::deep_copy(colInd, colInd_h);
745 #if KOKKOS_VERSION >= 40799
746  Kokkos::deep_copy(values, KokkosKernels::ArithTraits<typename values_type::value_type>::one());
747 #else
748  Kokkos::deep_copy(values, Kokkos::ArithTraits<typename values_type::value_type>::one());
749 #endif
750 
751  intermediateP = local_matrix_type("intermediateP",
752  numRows, numLocalAggregates, intermediatePnnz,
753  values, rowPtr, colInd);
754 } // BuildIntermediateProlongator
755 
756 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
758  BuildCoarseLocalMatrix(const typename Matrix::local_matrix_type& intermediateP,
759  typename Matrix::local_matrix_type& coarseA) const {
760  Monitor m(*this, "BuildCoarseLocalMatrix");
761 
762  using local_graph_type = typename local_matrix_type::staticcrsgraph_type;
763  using values_type = typename local_matrix_type::values_type;
764  using size_type = typename local_graph_type::size_type;
765  using col_index_type = typename local_graph_type::data_type;
766  using array_layout = typename local_graph_type::array_layout;
767  using memory_traits = typename local_graph_type::memory_traits;
768  using row_pointer_type = Kokkos::View<size_type*, array_layout, device_type, memory_traits>;
769  using col_indices_type = Kokkos::View<col_index_type*, array_layout, device_type, memory_traits>;
770 
772  localSpGEMM(coarseA, intermediateP, "AP", AP);
773 
774  // Note 03/11/20, lbv: does kh need to destroy and recreate the spgemm handle
775  // I am not sure but doing it for safety in case it stashes data from the previous
776  // spgemm computation...
777 
778  // Compute Ac = Pt * AP
779  // Two steps needed:
780  // 1. compute Pt
781  // 2. perform multiplication
782 
783  // Step 1 compute Pt
784  // Obviously this requires the same amount of storage as P except for the rowPtr
785  row_pointer_type rowPtrPt(Kokkos::ViewAllocateWithoutInitializing("Pt row pointer"),
786  intermediateP.numCols() + 1);
787  col_indices_type colIndPt(Kokkos::ViewAllocateWithoutInitializing("Pt column indices"),
788  intermediateP.nnz());
789  values_type valuesPt(Kokkos::ViewAllocateWithoutInitializing("Pt values"),
790  intermediateP.nnz());
791 
792  typename row_pointer_type::host_mirror_type rowPtrPt_h = Kokkos::create_mirror_view(rowPtrPt);
793  typename col_indices_type::host_mirror_type entries_h = Kokkos::create_mirror_view(intermediateP.graph.entries);
794  Kokkos::deep_copy(entries_h, intermediateP.graph.entries);
795  Kokkos::deep_copy(rowPtrPt_h, 0);
796  for (size_type entryIdx = 0; entryIdx < intermediateP.nnz(); ++entryIdx) {
797  rowPtrPt_h(entries_h(entryIdx) + 1) += 1;
798  }
799  for (LO rowIdx = 0; rowIdx < intermediateP.numCols(); ++rowIdx) {
800  rowPtrPt_h(rowIdx + 1) += rowPtrPt_h(rowIdx);
801  }
802  Kokkos::deep_copy(rowPtrPt, rowPtrPt_h);
803 
804  typename row_pointer_type::host_mirror_type rowPtrP_h = Kokkos::create_mirror_view(intermediateP.graph.row_map);
805  Kokkos::deep_copy(rowPtrP_h, intermediateP.graph.row_map);
806  typename col_indices_type::host_mirror_type colIndP_h = Kokkos::create_mirror_view(intermediateP.graph.entries);
807  Kokkos::deep_copy(colIndP_h, intermediateP.graph.entries);
808  typename values_type::host_mirror_type valuesP_h = Kokkos::create_mirror_view(intermediateP.values);
809  Kokkos::deep_copy(valuesP_h, intermediateP.values);
810  typename col_indices_type::host_mirror_type colIndPt_h = Kokkos::create_mirror_view(colIndPt);
811  typename values_type::host_mirror_type valuesPt_h = Kokkos::create_mirror_view(valuesPt);
812  const col_index_type invalidColumnIndex = KokkosSparse::OrdinalTraits<col_index_type>::invalid();
813  Kokkos::deep_copy(colIndPt_h, invalidColumnIndex);
814 
815  col_index_type colIdx = 0;
816  for (LO rowIdx = 0; rowIdx < intermediateP.numRows(); ++rowIdx) {
817  for (size_type entryIdxP = rowPtrP_h(rowIdx); entryIdxP < rowPtrP_h(rowIdx + 1); ++entryIdxP) {
818  colIdx = entries_h(entryIdxP);
819  for (size_type entryIdxPt = rowPtrPt_h(colIdx); entryIdxPt < rowPtrPt_h(colIdx + 1); ++entryIdxPt) {
820  if (colIndPt_h(entryIdxPt) == invalidColumnIndex) {
821  colIndPt_h(entryIdxPt) = rowIdx;
822  valuesPt_h(entryIdxPt) = valuesP_h(entryIdxP);
823  break;
824  }
825  } // Loop over entries in row of Pt
826  } // Loop over entries in row of P
827  } // Loop over rows of P
828 
829  Kokkos::deep_copy(colIndPt, colIndPt_h);
830  Kokkos::deep_copy(valuesPt, valuesPt_h);
831 
832  local_matrix_type intermediatePt("intermediatePt",
833  intermediateP.numCols(),
834  intermediateP.numRows(),
835  intermediateP.nnz(),
836  valuesPt, rowPtrPt, colIndPt);
837 
838  // Create views for coarseA matrix
839  localSpGEMM(intermediatePt, AP, "coarseA", coarseA);
840 } // BuildCoarseLocalMatrix
841 
842 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
844  localSpGEMM(const typename Matrix::local_matrix_type& A,
845  const typename Matrix::local_matrix_type& B,
846  const std::string matrixLabel,
847  typename Matrix::local_matrix_type& C) const {
848  using local_graph_type = typename local_matrix_type::staticcrsgraph_type;
849  using values_type = typename local_matrix_type::values_type;
850  using size_type = typename local_graph_type::size_type;
851  using col_index_type = typename local_graph_type::data_type;
852  using array_layout = typename local_graph_type::array_layout;
853  using memory_space = typename device_type::memory_space;
854  using memory_traits = typename local_graph_type::memory_traits;
855  using row_pointer_type = Kokkos::View<size_type*, array_layout, device_type, memory_traits>;
856  using col_indices_type = Kokkos::View<col_index_type*, array_layout, device_type, memory_traits>;
857 
858  // Options
859  int team_work_size = 16;
860  std::string myalg("SPGEMM_KK_MEMORY");
861  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
862  KokkosKernels::Experimental::KokkosKernelsHandle<typename row_pointer_type::const_value_type,
863  typename col_indices_type::const_value_type,
864  typename values_type::const_value_type,
866  memory_space,
867  memory_space>
868  kh;
869  kh.create_spgemm_handle(alg_enum);
870  kh.set_team_work_size(team_work_size);
871 
872  // Create views for AP matrix
873  row_pointer_type rowPtrC(Kokkos::ViewAllocateWithoutInitializing("C row pointer"),
874  A.numRows() + 1);
875  col_indices_type colIndC;
876  values_type valuesC;
877 
878  // Symbolic multiplication
879  KokkosSparse::spgemm_symbolic(&kh, A.numRows(),
880  B.numRows(), B.numCols(),
881  A.graph.row_map, A.graph.entries, false,
882  B.graph.row_map, B.graph.entries, false,
883  rowPtrC);
884 
885  // allocate column indices and values of AP
886  size_t nnzC = kh.get_spgemm_handle()->get_c_nnz();
887  if (nnzC) {
888  colIndC = col_indices_type(Kokkos::ViewAllocateWithoutInitializing("C column inds"), nnzC);
889  valuesC = values_type(Kokkos::ViewAllocateWithoutInitializing("C values"), nnzC);
890  }
891 
892  // Numeric multiplication
893  KokkosSparse::spgemm_numeric(&kh, A.numRows(),
894  B.numRows(), B.numCols(),
895  A.graph.row_map, A.graph.entries, A.values, false,
896  B.graph.row_map, B.graph.entries, B.values, false,
897  rowPtrC, colIndC, valuesC);
898  kh.destroy_spgemm_handle();
899 
900  C = local_matrix_type(matrixLabel, A.numRows(), B.numCols(), nnzC, valuesC, rowPtrC, colIndC);
901 
902 } // localSpGEMM
903 
904 } // namespace MueLu
905 
906 #endif /* MUELU_NOTAYAGGREGATIONFACTORY_DEF_HPP_ */
LocalOrdinal RandomOrdinal(LocalOrdinal min, LocalOrdinal max)
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
MueLu::DefaultLocalOrdinal LocalOrdinal
void Build(Level &currentLevel) const
Build aggregates.
const RCP< LOVector > & GetProcWinner() const
Returns constant vector that maps local node IDs to owning processor IDs.
Container class for aggregation information.
typename Matrix::local_matrix_type local_matrix_type
typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type
basic_FancyOStream & setShowProcRank(const bool showProcRank)
T & get(const std::string &name, T def_value)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
static RCP< Vector > GetMatrixOverlappedDeletedRowsum(const Matrix &A)
Extract Overlapped Matrix Deleted Rowsum.
static magnitudeType real(T a)
void BuildInitialAggregates(const Teuchos::ParameterList &params, const RCP< const Matrix > &A, const ArrayView< const LO > &orderingVector, const magnitude_type kappa, Aggregates &aggregates, std::vector< unsigned > &aggStat, LO &numNonAggregatedNodes, LO &numDirichletNodes) const
Initial aggregation phase.
size_type size() const
LocalOrdinal LO
void DeclareInput(Level &currentLevel) const
Input.
void BuildIntermediateProlongator(const LO numRows, const LO numDirichletNodes, const LO numLocalAggregates, const ArrayView< const LO > &localVertex2AggID, local_matrix_type &intermediateP) const
Construction of a local prolongator with values equal to 1.0.
ParameterList & set(std::string const &name, T &&value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
KOKKOS_INLINE_FUNCTION size_type GetNodeNumVertices() const
Return number of graph vertices.
static RCP< Xpetra::Vector< LocalOrdinal, LocalOrdinal, GlobalOrdinal, Node > > CuthillMcKee(const Matrix &Op)
void BuildFurtherAggregates(const Teuchos::ParameterList &params, const RCP< const Matrix > &A, const Teuchos::ArrayView< const LO > &orderingVector, const local_matrix_type &coarseA, const magnitude_type kappa, const row_sum_type &rowSum, std::vector< LO > &localAggStat, Array< LO > &localVertex2AggID, LO &numLocalAggregates, LO &numNonAggregatedNodes) const
Further aggregation phase increases coarsening rate by a factor of ~2 per iteration.
bool isParameter(const std::string &name) const
Print statistics that do not involve significant additional computation.
void BuildCoarseLocalMatrix(const local_matrix_type &intermediateP, local_matrix_type &coarseA) const
Implementation of a local Galerkin projection called inside BuildFurtherAggregates.
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:63
typename device_type::execution_space execution_space
static RCP< Xpetra::Vector< Magnitude, LocalOrdinal, GlobalOrdinal, Node > > GetMatrixOverlappedAbsDeletedRowsum(const Matrix &A)
void BuildOnRankLocalMatrix(const local_matrix_type &localA, local_matrix_type &onRankA) const
basic_FancyOStream & setShowAllFrontMatter(const bool showAllFrontMatter)
const RCP< LOMultiVector > & GetVertex2AggId() const
Returns constant vector that maps local node IDs to local aggregates IDs.
static RCP< Vector > GetMatrixOverlappedDiagonal(const Matrix &A)
Extract Overlapped Matrix Diagonal.
void localSpGEMM(const local_matrix_type &A, const local_matrix_type &B, const std::string matrixLabel, local_matrix_type &C) const
Wrapper for kokkos-kernels&#39; spgemm that takes in CrsMatrix.
Timer to be used in non-factories.
void RandomReorder(Teuchos::Array< LocalOrdinal > &list)
size_type size() const
#define SET_VALID_ENTRY(name)
Scalar SC
Exception throws to report errors in the internal logical of the program.
typename Kokkos::View< impl_scalar_type *, Kokkos::LayoutLeft, device_type > row_sum_type
void SetNumAggregates(LO nAggregates)
Set number of local aggregates on current processor.