MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_NotayAggregationFactory_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef MUELU_NOTAYAGGREGATIONFACTORY_DEF_HPP_
11 #define MUELU_NOTAYAGGREGATIONFACTORY_DEF_HPP_
12 
13 #include <Xpetra_Map.hpp>
14 #include <Xpetra_Vector.hpp>
15 #include <Xpetra_MultiVectorFactory.hpp>
16 #include <Xpetra_MapFactory.hpp>
17 #include <Xpetra_VectorFactory.hpp>
18 
19 #include "KokkosKernels_Handle.hpp"
20 #include "KokkosSparse_spgemm.hpp"
21 #include "KokkosSparse_spmv.hpp"
22 
24 
25 #include "MueLu_Aggregates.hpp"
26 #include "MueLu_LWGraph.hpp"
27 #include "MueLu_LWGraph_kokkos.hpp"
28 #include "MueLu_Level.hpp"
29 #include "MueLu_MasterList.hpp"
30 #include "MueLu_Monitor.hpp"
31 #include "MueLu_Types.hpp"
32 #include "MueLu_Utilities.hpp"
33 
34 namespace MueLu {
35 
36 namespace NotayUtils {
37 template <class LocalOrdinal>
39  return min + as<LocalOrdinal>((max - min + 1) * (static_cast<double>(std::rand()) / (RAND_MAX + 1.0)));
40 }
41 
42 template <class LocalOrdinal>
44  typedef LocalOrdinal LO;
45  LO n = Teuchos::as<LO>(list.size());
46  for (LO i = 0; i < n - 1; i++)
47  std::swap(list[i], list[RandomOrdinal(i, n - 1)]);
48 }
49 } // namespace NotayUtils
50 
51 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
53  RCP<ParameterList> validParamList = rcp(new ParameterList());
54 
55 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
56  SET_VALID_ENTRY("aggregation: pairwise: size");
57  SET_VALID_ENTRY("aggregation: pairwise: tie threshold");
58  SET_VALID_ENTRY("aggregation: Dirichlet threshold");
59  SET_VALID_ENTRY("aggregation: ordering");
60 #undef SET_VALID_ENTRY
61 
62  // general variables needed in AggregationFactory
63  validParamList->set<RCP<const FactoryBase>>("A", null, "Generating factory of the matrix");
64  validParamList->set<RCP<const FactoryBase>>("Graph", null, "Generating factory of the graph");
65  validParamList->set<RCP<const FactoryBase>>("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'");
66 
67  return validParamList;
68 }
69 
70 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
72  Input(currentLevel, "A");
73  Input(currentLevel, "Graph");
74  Input(currentLevel, "DofsPerNode");
75 }
76 
77 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
79  FactoryMonitor m(*this, "Build", currentLevel);
81  using MT = typename STS::magnitudeType;
82 
84 
86  if (const char* dbg = std::getenv("MUELU_PAIRWISEAGGREGATION_DEBUG")) {
87  out = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
88  out->setShowAllFrontMatter(false).setShowProcRank(true);
89  } else {
90  out = Teuchos::getFancyOStream(rcp(new Teuchos::oblackholestream()));
91  }
92 
93  const ParameterList& pL = GetParameterList();
94 
95  const MT kappa = static_cast<MT>(pL.get<double>("aggregation: Dirichlet threshold"));
96  TEUCHOS_TEST_FOR_EXCEPTION(kappa <= MT_TWO,
98  "Pairwise requires kappa > 2"
99  " otherwise all rows are considered as Dirichlet rows.");
100 
101  // Parameters
102  int maxNumIter = 3;
103  if (pL.isParameter("aggregation: pairwise: size"))
104  maxNumIter = pL.get<int>("aggregation: pairwise: size");
105  TEUCHOS_TEST_FOR_EXCEPTION(maxNumIter < 1,
107  "NotayAggregationFactory::Build(): \"aggregation: pairwise: size\""
108  " must be a strictly positive integer");
109 
110  RCP<const LWGraph> graph;
111  if (IsType<RCP<LWGraph>>(currentLevel, "Graph"))
112  graph = Get<RCP<LWGraph>>(currentLevel, "Graph");
113  else {
114  auto graph_k = Get<RCP<LWGraph_kokkos>>(currentLevel, "Graph");
115  graph = graph_k->copyToHost();
116  }
117  RCP<const Matrix> A = Get<RCP<Matrix>>(currentLevel, "A");
118 
119  // Setup aggregates & aggStat objects
120  RCP<Aggregates> aggregates = rcp(new Aggregates(*graph));
121  aggregates->setObjectLabel("PW");
122 
123  const LO numRows = graph->GetNodeNumVertices();
124 
125  // construct aggStat information
126  std::vector<unsigned> aggStat(numRows, READY);
127 
128  const int DofsPerNode = Get<int>(currentLevel, "DofsPerNode");
130  "Pairwise only supports one dof per node");
131 
132  // This follows the paper:
133  // Notay, "Aggregation-based algebraic multigrid for convection-diffusion equations",
134  // SISC 34(3), pp. A2288-2316.
135 
136  // Handle Ordering
137  std::string orderingStr = pL.get<std::string>("aggregation: ordering");
138  enum {
139  O_NATURAL,
140  O_RANDOM,
141  O_CUTHILL_MCKEE,
142  } ordering;
143 
144  ordering = O_NATURAL;
145  if (orderingStr == "random")
146  ordering = O_RANDOM;
147  else if (orderingStr == "natural") {
148  } else if (orderingStr == "cuthill-mckee" || orderingStr == "cm")
149  ordering = O_CUTHILL_MCKEE;
150  else {
151  TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "Invalid ordering type");
152  }
153 
154  // Get an ordering vector
155  // NOTE: The orderingVector only orders *rows* of the matrix. Off-proc columns
156  // will get ignored in the aggregation phases, so we don't need to worry about
157  // running off the end.
158  Array<LO> orderingVector(numRows);
159  for (LO i = 0; i < numRows; i++)
160  orderingVector[i] = i;
161  if (ordering == O_RANDOM)
162  MueLu::NotayUtils::RandomReorder(orderingVector);
163  else if (ordering == O_CUTHILL_MCKEE) {
165  auto localVector = rcmVector->getData(0);
166  for (LO i = 0; i < numRows; i++)
167  orderingVector[i] = localVector[i];
168  }
169 
170  // Get the party stated
171  LO numNonAggregatedNodes = numRows, numDirichletNodes = 0;
172  BuildInitialAggregates(pL, A, orderingVector(), kappa,
173  *aggregates, aggStat, numNonAggregatedNodes, numDirichletNodes);
174  TEUCHOS_TEST_FOR_EXCEPTION(0 < numNonAggregatedNodes, Exceptions::RuntimeError,
175  "Initial pairwise aggregation failed to aggregate all nodes");
176  LO numLocalAggregates = aggregates->GetNumAggregates();
177  GetOStream(Statistics0) << "Init : " << numLocalAggregates << " - "
178  << A->getLocalNumRows() / numLocalAggregates << std::endl;
179 
180  // Temporary data storage for further aggregation steps
181  local_matrix_type intermediateP;
182  local_matrix_type coarseLocalA;
183 
184  // Compute the on rank part of the local matrix
185  // that the square submatrix that only contains
186  // columns corresponding to local rows.
187  LO numLocalDirichletNodes = numDirichletNodes;
188  Array<LO> localVertex2AggId(aggregates->GetVertex2AggId()->getData(0).view(0, numRows));
189  BuildOnRankLocalMatrix(A->getLocalMatrixDevice(), coarseLocalA);
190  for (LO aggregationIter = 1; aggregationIter < maxNumIter; ++aggregationIter) {
191  // Compute the intermediate prolongator
192  BuildIntermediateProlongator(coarseLocalA.numRows(), numLocalDirichletNodes, numLocalAggregates,
193  localVertex2AggId(), intermediateP);
194 
195  // Compute the coarse local matrix and coarse row sum
196  BuildCoarseLocalMatrix(intermediateP, coarseLocalA);
197 
198  // Directly compute rowsum from A, rather than coarseA
199  row_sum_type rowSum("rowSum", numLocalAggregates);
200  {
201  std::vector<std::vector<LO>> agg2vertex(numLocalAggregates);
202  auto vertex2AggId = aggregates->GetVertex2AggId()->getData(0);
203  for (LO i = 0; i < (LO)numRows; i++) {
204  if (aggStat[i] != AGGREGATED)
205  continue;
206  LO agg = vertex2AggId[i];
207  agg2vertex[agg].push_back(i);
208  }
209 
210  typename row_sum_type::HostMirror rowSum_h = Kokkos::create_mirror_view(rowSum);
211  for (LO i = 0; i < numRows; i++) {
212  // If not aggregated already, skip this guy
213  if (aggStat[i] != AGGREGATED)
214  continue;
215  int agg = vertex2AggId[i];
216  std::vector<LO>& myagg = agg2vertex[agg];
217 
218  size_t nnz = A->getNumEntriesInLocalRow(i);
219  ArrayView<const LO> indices;
220  ArrayView<const SC> vals;
221  A->getLocalRowView(i, indices, vals);
222 
224  for (LO colidx = 0; colidx < static_cast<LO>(nnz); colidx++) {
225  bool found = false;
226  if (indices[colidx] < numRows) {
227  for (LO j = 0; j < (LO)myagg.size(); j++)
228  if (vertex2AggId[indices[colidx]] == agg)
229  found = true;
230  }
231  if (!found) {
232  *out << "- ADDING col " << indices[colidx] << " = " << vals[colidx] << std::endl;
233  mysum += vals[colidx];
234  } else {
235  *out << "- NOT ADDING col " << indices[colidx] << " = " << vals[colidx] << std::endl;
236  }
237  }
238 
239  rowSum_h[agg] = mysum;
240  }
241  Kokkos::deep_copy(rowSum, rowSum_h);
242  }
243 
244  // Get local orderingVector
245  Array<LO> localOrderingVector(numRows);
246  for (LO i = 0; i < numRows; i++)
247  localOrderingVector[i] = i;
248  if (ordering == O_RANDOM)
249  MueLu::NotayUtils::RandomReorder(localOrderingVector);
250  else if (ordering == O_CUTHILL_MCKEE) {
252  auto localVector = rcmVector->getData(0);
253  for (LO i = 0; i < numRows; i++)
254  localOrderingVector[i] = localVector[i];
255  }
256 
257  // Compute new aggregates
258  numLocalAggregates = 0;
259  numNonAggregatedNodes = static_cast<LO>(coarseLocalA.numRows());
260  std::vector<LO> localAggStat(numNonAggregatedNodes, READY);
261  localVertex2AggId.resize(numNonAggregatedNodes, -1);
262  BuildFurtherAggregates(pL, A, localOrderingVector, coarseLocalA, kappa, rowSum,
263  localAggStat, localVertex2AggId,
264  numLocalAggregates, numNonAggregatedNodes);
265 
266  // After the first initial pairwise aggregation
267  // the Dirichlet nodes have been removed.
268  numLocalDirichletNodes = 0;
269 
270  // Update the aggregates
271  RCP<LOMultiVector> vertex2AggIdMV = aggregates->GetVertex2AggId();
272  ArrayRCP<LO> vertex2AggId = vertex2AggIdMV->getDataNonConst(0);
273  for (size_t vertexIdx = 0; vertexIdx < A->getLocalNumRows(); ++vertexIdx) {
274  LO oldAggIdx = vertex2AggId[vertexIdx];
275  if (oldAggIdx != Teuchos::OrdinalTraits<LO>::invalid()) {
276  vertex2AggId[vertexIdx] = localVertex2AggId[oldAggIdx];
277  }
278  }
279 
280  // We could probably print some better statistics at some point
281  GetOStream(Statistics0) << "Iter " << aggregationIter << ": " << numLocalAggregates << " - "
282  << A->getLocalNumRows() / numLocalAggregates << std::endl;
283  }
284  aggregates->SetNumAggregates(numLocalAggregates);
285  aggregates->AggregatesCrossProcessors(false);
286  aggregates->ComputeAggregateSizes(true /*forceRecompute*/);
287 
288  // DO stuff
289  Set(currentLevel, "Aggregates", aggregates);
290  GetOStream(Statistics0) << aggregates->description() << std::endl;
291 }
292 
293 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
296  const RCP<const Matrix>& A,
297  const Teuchos::ArrayView<const LO>& orderingVector,
299  Aggregates& aggregates,
300  std::vector<unsigned>& aggStat,
301  LO& numNonAggregatedNodes,
302  LO& numDirichletNodes) const {
303  Monitor m(*this, "BuildInitialAggregates");
304  using STS = Teuchos::ScalarTraits<Scalar>;
305  using MT = typename STS::magnitudeType;
307 
309  if (const char* dbg = std::getenv("MUELU_PAIRWISEAGGREGATION_DEBUG")) {
310  out = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
311  out->setShowAllFrontMatter(false).setShowProcRank(true);
312  } else {
313  out = Teuchos::getFancyOStream(rcp(new Teuchos::oblackholestream()));
314  }
315 
316  const SC SC_ZERO = Teuchos::ScalarTraits<SC>::zero();
317  const MT MT_ZERO = Teuchos::ScalarTraits<MT>::zero();
318  const MT MT_ONE = Teuchos::ScalarTraits<MT>::one();
319  const MT MT_TWO = MT_ONE + MT_ONE;
320  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
321  const LO LO_ZERO = Teuchos::OrdinalTraits<LO>::zero();
322 
323  const MT kappa_init = kappa / (kappa - MT_TWO);
324  const LO numRows = aggStat.size();
325  const int myRank = A->getMap()->getComm()->getRank();
326 
327  // For finding "ties" where we fall back to the ordering. Making this larger than
328  // hard zero substantially increases code robustness.
329  double tie_criterion = params.get<double>("aggregation: pairwise: tie threshold");
330  double tie_less = 1.0 - tie_criterion;
331  double tie_more = 1.0 + tie_criterion;
332 
333  // NOTE: Assumes 1 dof per node. This constraint is enforced in Build(),
334  // and so we're not doing again here.
335  // This should probably be fixed at some point.
336 
337  // Extract diagonal, rowsums, etc
338  // NOTE: The ghostedRowSum vector here has has the sign flipped from Notay's S
342  const ArrayRCP<const SC> D = ghostedDiag->getData(0);
343  const ArrayRCP<const SC> S = ghostedRowSum->getData(0);
344  const ArrayRCP<const MT> AbsRs = ghostedAbsRowSum->getData(0);
345 
346  // Aggregates stuff
347  ArrayRCP<LO> vertex2AggId_rcp = aggregates.GetVertex2AggId()->getDataNonConst(0);
348  ArrayRCP<LO> procWinner_rcp = aggregates.GetProcWinner()->getDataNonConst(0);
349  ArrayView<LO> vertex2AggId = vertex2AggId_rcp();
350  ArrayView<LO> procWinner = procWinner_rcp();
351 
352  // Algorithm 4.2
353 
354  // 0,1 : Initialize: Flag boundary conditions
355  // Modification: We assume symmetry here aij = aji
356  for (LO row = 0; row < Teuchos::as<LO>(A->getRowMap()->getLocalNumElements()); ++row) {
357  MT aii = STS::magnitude(D[row]);
358  MT rowsum = AbsRs[row];
359 
360  if (aii >= kappa_init * rowsum) {
361  *out << "Flagging index " << row << " as dirichlet "
362  "aii >= kappa*rowsum = "
363  << aii << " >= " << kappa_init << " " << rowsum << std::endl;
364  aggStat[row] = IGNORED;
365  --numNonAggregatedNodes;
366  ++numDirichletNodes;
367  }
368  }
369 
370  // 2 : Iteration
371  LO aggIndex = LO_ZERO;
372  for (LO i = 0; i < numRows; i++) {
373  LO current_idx = orderingVector[i];
374  // If we're aggregated already, skip this guy
375  if (aggStat[current_idx] != READY)
376  continue;
377 
378  MT best_mu = MT_ZERO;
379  LO best_idx = LO_INVALID;
380 
381  size_t nnz = A->getNumEntriesInLocalRow(current_idx);
382  ArrayView<const LO> indices;
383  ArrayView<const SC> vals;
384  A->getLocalRowView(current_idx, indices, vals);
385 
386  MT aii = STS::real(D[current_idx]);
387  MT si = STS::real(S[current_idx]);
388  for (LO colidx = 0; colidx < static_cast<LO>(nnz); colidx++) {
389  // Skip aggregated neighbors, off-rank neighbors, hard zeros and self
390  LO col = indices[colidx];
391  SC val = vals[colidx];
392  if (current_idx == col || col >= numRows || aggStat[col] != READY || val == SC_ZERO)
393  continue;
394 
395  MT aij = STS::real(val);
396  MT ajj = STS::real(D[col]);
397  MT sj = -STS::real(S[col]); // NOTE: The ghostedRowSum vector here has has the sign flipped from Notay's S
398  if (aii - si + ajj - sj >= MT_ZERO) {
399  // Modification: We assume symmetry here aij = aji
400  MT mu_top = MT_TWO / (MT_ONE / aii + MT_ONE / ajj);
401  MT mu_bottom = -aij + MT_ONE / (MT_ONE / (aii - si) + MT_ONE / (ajj - sj));
402  MT mu = mu_top / mu_bottom;
403 
404  // Modification: Explicitly check the tie criterion here
405  if (mu > MT_ZERO && (best_idx == LO_INVALID || mu < best_mu * tie_less ||
406  (mu < best_mu * tie_more && orderingVector[col] < orderingVector[best_idx]))) {
407  best_mu = mu;
408  best_idx = col;
409  *out << "[" << current_idx << "] Column UPDATED " << col << ": "
410  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
411  << " = " << aii - si + ajj - sj << ", aij = " << aij << ", mu = " << mu << std::endl;
412  } else {
413  *out << "[" << current_idx << "] Column NOT UPDATED " << col << ": "
414  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
415  << " = " << aii - si + ajj - sj << ", aij = " << aij << ", mu = " << mu << std::endl;
416  }
417  } else {
418  *out << "[" << current_idx << "] Column FAILED " << col << ": "
419  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
420  << " = " << aii - si + ajj - sj << std::endl;
421  }
422  } // end column for loop
423 
424  if (best_idx == LO_INVALID) {
425  *out << "No node buddy found for index " << current_idx
426  << " [agg " << aggIndex << "]\n"
427  << std::endl;
428  // We found no potential node-buddy, so let's just make this a singleton
429  // NOTE: The behavior of what to do if you have no un-aggregated neighbors is not specified in
430  // the paper
431 
432  aggStat[current_idx] = ONEPT;
433  vertex2AggId[current_idx] = aggIndex;
434  procWinner[current_idx] = myRank;
435  numNonAggregatedNodes--;
436  aggIndex++;
437 
438  } else {
439  // We have a buddy, so aggregate, either as a singleton or as a pair, depending on mu
440  if (best_mu <= kappa) {
441  *out << "Node buddies (" << current_idx << "," << best_idx << ") [agg " << aggIndex << "]" << std::endl;
442 
443  aggStat[current_idx] = AGGREGATED;
444  aggStat[best_idx] = AGGREGATED;
445  vertex2AggId[current_idx] = aggIndex;
446  vertex2AggId[best_idx] = aggIndex;
447  procWinner[current_idx] = myRank;
448  procWinner[best_idx] = myRank;
449  numNonAggregatedNodes -= 2;
450  aggIndex++;
451 
452  } else {
453  *out << "No buddy found for index " << current_idx << ","
454  " but aggregating as singleton [agg "
455  << aggIndex << "]" << std::endl;
456 
457  aggStat[current_idx] = ONEPT;
458  vertex2AggId[current_idx] = aggIndex;
459  procWinner[current_idx] = myRank;
460  numNonAggregatedNodes--;
461  aggIndex++;
462  } // best_mu
463  } // best_idx
464  } // end Algorithm 4.2
465 
466  *out << "vertex2aggid :";
467  for (int i = 0; i < static_cast<int>(vertex2AggId.size()); ++i) {
468  *out << i << "(" << vertex2AggId[i] << ")";
469  }
470  *out << std::endl;
471 
472  // update aggregate object
473  aggregates.SetNumAggregates(aggIndex);
474 } // BuildInitialAggregates
475 
476 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
479  const RCP<const Matrix>& A,
480  const Teuchos::ArrayView<const LO>& orderingVector,
481  const typename Matrix::local_matrix_type& coarseA,
483  const Kokkos::View<typename Kokkos::ArithTraits<Scalar>::val_type*,
484  Kokkos::LayoutLeft,
485  typename Matrix::local_matrix_type::device_type>& rowSum,
486  std::vector<LocalOrdinal>& localAggStat,
487  Teuchos::Array<LocalOrdinal>& localVertex2AggID,
488  LO& numLocalAggregates,
489  LO& numNonAggregatedNodes) const {
490  Monitor m(*this, "BuildFurtherAggregates");
491 
492  // Set debug outputs based on environment variable
494  if (const char* dbg = std::getenv("MUELU_PAIRWISEAGGREGATION_DEBUG")) {
495  out = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
496  out->setShowAllFrontMatter(false).setShowProcRank(true);
497  } else {
498  out = Teuchos::getFancyOStream(rcp(new Teuchos::oblackholestream()));
499  }
500 
501  using value_type = typename local_matrix_type::value_type;
502  const value_type KAT_zero = Kokkos::ArithTraits<value_type>::zero();
505  const magnitude_type MT_two = MT_one + MT_one;
506  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
507 
508  // For finding "ties" where we fall back to the ordering. Making this larger than
509  // hard zero substantially increases code robustness.
510  double tie_criterion = params.get<double>("aggregation: pairwise: tie threshold");
511  double tie_less = 1.0 - tie_criterion;
512  double tie_more = 1.0 + tie_criterion;
513 
514  typename row_sum_type::HostMirror rowSum_h = Kokkos::create_mirror_view(rowSum);
515  Kokkos::deep_copy(rowSum_h, rowSum);
516 
517  // Extracting the diagonal of a KokkosSparse::CrsMatrix
518  // is not currently provided in kokkos-kernels so here
519  // is an ugly way to get that done...
520  const LO numRows = static_cast<LO>(coarseA.numRows());
521  typename local_matrix_type::values_type::HostMirror diagA_h("diagA host", numRows);
522  typename local_matrix_type::row_map_type::HostMirror row_map_h = Kokkos::create_mirror_view(coarseA.graph.row_map);
523  Kokkos::deep_copy(row_map_h, coarseA.graph.row_map);
524  typename local_matrix_type::index_type::HostMirror entries_h = Kokkos::create_mirror_view(coarseA.graph.entries);
525  Kokkos::deep_copy(entries_h, coarseA.graph.entries);
526  typename local_matrix_type::values_type::HostMirror values_h = Kokkos::create_mirror_view(coarseA.values);
527  Kokkos::deep_copy(values_h, coarseA.values);
528  for (LO rowIdx = 0; rowIdx < numRows; ++rowIdx) {
529  for (LO entryIdx = static_cast<LO>(row_map_h(rowIdx));
530  entryIdx < static_cast<LO>(row_map_h(rowIdx + 1));
531  ++entryIdx) {
532  if (rowIdx == static_cast<LO>(entries_h(entryIdx))) {
533  diagA_h(rowIdx) = values_h(entryIdx);
534  }
535  }
536  }
537 
538  for (LO currentIdx = 0; currentIdx < numRows; ++currentIdx) {
539  if (localAggStat[currentIdx] != READY) {
540  continue;
541  }
542 
545  const magnitude_type aii = Teuchos::ScalarTraits<value_type>::real(diagA_h(currentIdx));
546  const magnitude_type si = Teuchos::ScalarTraits<value_type>::real(rowSum_h(currentIdx));
547  for (auto entryIdx = row_map_h(currentIdx); entryIdx < row_map_h(currentIdx + 1); ++entryIdx) {
548  const LO colIdx = static_cast<LO>(entries_h(entryIdx));
549  if (currentIdx == colIdx || colIdx >= numRows || localAggStat[colIdx] != READY || values_h(entryIdx) == KAT_zero) {
550  continue;
551  }
552 
553  const magnitude_type aij = Teuchos::ScalarTraits<value_type>::real(values_h(entryIdx));
554  const magnitude_type ajj = Teuchos::ScalarTraits<value_type>::real(diagA_h(colIdx));
555  const magnitude_type sj = -Teuchos::ScalarTraits<value_type>::real(rowSum_h(colIdx)); // NOTE: The ghostedRowSum vector here has has the sign flipped from Notay's S
556  if (aii - si + ajj - sj >= MT_zero) {
557  const magnitude_type mu_top = MT_two / (MT_one / aii + MT_one / ajj);
558  const magnitude_type mu_bottom = -aij + MT_one / (MT_one / (aii - si) + MT_one / (ajj - sj));
559  const magnitude_type mu = mu_top / mu_bottom;
560 
561  // Modification: Explicitly check the tie criterion here
562  if (mu > MT_zero && (bestIdx == LO_INVALID || mu < best_mu * tie_less ||
563  (mu < best_mu * tie_more && orderingVector[colIdx] < orderingVector[bestIdx]))) {
564  best_mu = mu;
565  bestIdx = colIdx;
566  *out << "[" << currentIdx << "] Column UPDATED " << colIdx << ": "
567  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
568  << " = " << aii - si + ajj - sj << ", aij = " << aij << " mu = " << mu << std::endl;
569  } else {
570  *out << "[" << currentIdx << "] Column NOT UPDATED " << colIdx << ": "
571  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
572  << " = " << aii - si + ajj - sj << ", aij = " << aij << ", mu = " << mu << std::endl;
573  }
574  } else {
575  *out << "[" << currentIdx << "] Column FAILED " << colIdx << ": "
576  << "aii - si + ajj - sj = " << aii << " - " << si << " + " << ajj << " - " << sj
577  << " = " << aii - si + ajj - sj << std::endl;
578  }
579  } // end loop over row entries
580 
581  if (bestIdx == Teuchos::OrdinalTraits<LO>::invalid()) {
582  localAggStat[currentIdx] = ONEPT;
583  localVertex2AggID[currentIdx] = numLocalAggregates;
584  --numNonAggregatedNodes;
585  ++numLocalAggregates;
586  } else {
587  if (best_mu <= kappa) {
588  *out << "Node buddies (" << currentIdx << "," << bestIdx << ") [agg " << numLocalAggregates << "]" << std::endl;
589 
590  localAggStat[currentIdx] = AGGREGATED;
591  localVertex2AggID[currentIdx] = numLocalAggregates;
592  --numNonAggregatedNodes;
593 
594  localAggStat[bestIdx] = AGGREGATED;
595  localVertex2AggID[bestIdx] = numLocalAggregates;
596  --numNonAggregatedNodes;
597 
598  ++numLocalAggregates;
599  } else {
600  *out << "No buddy found for index " << currentIdx << ","
601  " but aggregating as singleton [agg "
602  << numLocalAggregates << "]" << std::endl;
603 
604  localAggStat[currentIdx] = ONEPT;
605  localVertex2AggID[currentIdx] = numLocalAggregates;
606  --numNonAggregatedNodes;
607  ++numLocalAggregates;
608  }
609  }
610  } // end loop over matrix rows
611 
612 } // BuildFurtherAggregates
613 
614 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
616  BuildOnRankLocalMatrix(const typename Matrix::local_matrix_type& localA,
617  typename Matrix::local_matrix_type& onrankA) const {
618  Monitor m(*this, "BuildOnRankLocalMatrix");
619 
620  // Set debug outputs based on environment variable
622  if (const char* dbg = std::getenv("MUELU_PAIRWISEAGGREGATION_DEBUG")) {
623  out = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
624  out->setShowAllFrontMatter(false).setShowProcRank(true);
625  } else {
626  out = Teuchos::getFancyOStream(rcp(new Teuchos::oblackholestream()));
627  }
628 
629  using local_graph_type = typename local_matrix_type::staticcrsgraph_type;
630  using values_type = typename local_matrix_type::values_type;
631  using size_type = typename local_graph_type::size_type;
632  using col_index_type = typename local_graph_type::data_type;
633  using array_layout = typename local_graph_type::array_layout;
634  using memory_traits = typename local_graph_type::memory_traits;
635  using row_pointer_type = Kokkos::View<size_type*, array_layout, device_type, memory_traits>;
636  using col_indices_type = Kokkos::View<col_index_type*, array_layout, device_type, memory_traits>;
637  // Extract on rank part of A
638  // Simply check that the column index is less than the number of local rows
639  // otherwise remove it.
640 
641  const int numRows = static_cast<int>(localA.numRows());
642  row_pointer_type rowPtr("onrankA row pointer", numRows + 1);
643  typename row_pointer_type::HostMirror rowPtr_h = Kokkos::create_mirror_view(rowPtr);
644  typename local_graph_type::row_map_type::HostMirror origRowPtr_h = Kokkos::create_mirror_view(localA.graph.row_map);
645  typename local_graph_type::entries_type::HostMirror origColind_h = Kokkos::create_mirror_view(localA.graph.entries);
646  typename values_type::HostMirror origValues_h = Kokkos::create_mirror_view(localA.values);
647  Kokkos::deep_copy(origRowPtr_h, localA.graph.row_map);
648  Kokkos::deep_copy(origColind_h, localA.graph.entries);
649  Kokkos::deep_copy(origValues_h, localA.values);
650 
651  // Compute the number of nnz entries per row
652  rowPtr_h(0) = 0;
653  for (int rowIdx = 0; rowIdx < numRows; ++rowIdx) {
654  for (size_type entryIdx = origRowPtr_h(rowIdx); entryIdx < origRowPtr_h(rowIdx + 1); ++entryIdx) {
655  if (origColind_h(entryIdx) < numRows) {
656  rowPtr_h(rowIdx + 1) += 1;
657  }
658  }
659  rowPtr_h(rowIdx + 1) = rowPtr_h(rowIdx + 1) + rowPtr_h(rowIdx);
660  }
661  Kokkos::deep_copy(rowPtr, rowPtr_h);
662 
663  const LO nnzOnrankA = rowPtr_h(numRows);
664 
665  // Now use nnz per row to allocate matrix views and store column indices and values
666  col_indices_type colInd("onrankA column indices", rowPtr_h(numRows));
667  values_type values("onrankA values", rowPtr_h(numRows));
668  typename col_indices_type::HostMirror colInd_h = Kokkos::create_mirror_view(colInd);
669  typename values_type::HostMirror values_h = Kokkos::create_mirror_view(values);
670  int entriesInRow;
671  for (int rowIdx = 0; rowIdx < numRows; ++rowIdx) {
672  entriesInRow = 0;
673  for (size_type entryIdx = origRowPtr_h(rowIdx); entryIdx < origRowPtr_h(rowIdx + 1); ++entryIdx) {
674  if (origColind_h(entryIdx) < numRows) {
675  colInd_h(rowPtr_h(rowIdx) + entriesInRow) = origColind_h(entryIdx);
676  values_h(rowPtr_h(rowIdx) + entriesInRow) = origValues_h(entryIdx);
677  ++entriesInRow;
678  }
679  }
680  }
681  Kokkos::deep_copy(colInd, colInd_h);
682  Kokkos::deep_copy(values, values_h);
683 
684  onrankA = local_matrix_type("onrankA", numRows, numRows,
685  nnzOnrankA, values, rowPtr, colInd);
686 }
687 
688 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
691  const LocalOrdinal numDirichletNodes,
692  const LocalOrdinal numLocalAggregates,
693  const Teuchos::ArrayView<const LocalOrdinal>& localVertex2AggID,
694  typename Matrix::local_matrix_type& intermediateP) const {
695  Monitor m(*this, "BuildIntermediateProlongator");
696 
697  // Set debug outputs based on environment variable
699  if (const char* dbg = std::getenv("MUELU_PAIRWISEAGGREGATION_DEBUG")) {
700  out = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
701  out->setShowAllFrontMatter(false).setShowProcRank(true);
702  } else {
703  out = Teuchos::getFancyOStream(rcp(new Teuchos::oblackholestream()));
704  }
705 
706  using local_graph_type = typename local_matrix_type::staticcrsgraph_type;
707  using values_type = typename local_matrix_type::values_type;
708  using size_type = typename local_graph_type::size_type;
709  using col_index_type = typename local_graph_type::data_type;
710  using array_layout = typename local_graph_type::array_layout;
711  using memory_traits = typename local_graph_type::memory_traits;
712  using row_pointer_type = Kokkos::View<size_type*, array_layout, device_type, memory_traits>;
713  using col_indices_type = Kokkos::View<col_index_type*, array_layout, device_type, memory_traits>;
714 
715  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
716 
717  const int intermediatePnnz = numRows - numDirichletNodes;
718  row_pointer_type rowPtr("intermediateP row pointer", numRows + 1);
719  col_indices_type colInd("intermediateP column indices", intermediatePnnz);
720  values_type values("intermediateP values", intermediatePnnz);
721  typename row_pointer_type::HostMirror rowPtr_h = Kokkos::create_mirror_view(rowPtr);
722  typename col_indices_type::HostMirror colInd_h = Kokkos::create_mirror_view(colInd);
723 
724  rowPtr_h(0) = 0;
725  for (int rowIdx = 0; rowIdx < numRows; ++rowIdx) {
726  // Skip Dirichlet nodes
727  if (localVertex2AggID[rowIdx] == LO_INVALID) {
728  rowPtr_h(rowIdx + 1) = rowPtr_h(rowIdx);
729  } else {
730  rowPtr_h(rowIdx + 1) = rowPtr_h(rowIdx) + 1;
731  colInd_h(rowPtr_h(rowIdx)) = localVertex2AggID[rowIdx];
732  }
733  }
734 
735  Kokkos::deep_copy(rowPtr, rowPtr_h);
736  Kokkos::deep_copy(colInd, colInd_h);
737  Kokkos::deep_copy(values, Kokkos::ArithTraits<typename values_type::value_type>::one());
738 
739  intermediateP = local_matrix_type("intermediateP",
740  numRows, numLocalAggregates, intermediatePnnz,
741  values, rowPtr, colInd);
742 } // BuildIntermediateProlongator
743 
744 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
746  BuildCoarseLocalMatrix(const typename Matrix::local_matrix_type& intermediateP,
747  typename Matrix::local_matrix_type& coarseA) const {
748  Monitor m(*this, "BuildCoarseLocalMatrix");
749 
750  using local_graph_type = typename local_matrix_type::staticcrsgraph_type;
751  using values_type = typename local_matrix_type::values_type;
752  using size_type = typename local_graph_type::size_type;
753  using col_index_type = typename local_graph_type::data_type;
754  using array_layout = typename local_graph_type::array_layout;
755  using memory_traits = typename local_graph_type::memory_traits;
756  using row_pointer_type = Kokkos::View<size_type*, array_layout, device_type, memory_traits>;
757  using col_indices_type = Kokkos::View<col_index_type*, array_layout, device_type, memory_traits>;
758 
760  localSpGEMM(coarseA, intermediateP, "AP", AP);
761 
762  // Note 03/11/20, lbv: does kh need to destroy and recreate the spgemm handle
763  // I am not sure but doing it for safety in case it stashes data from the previous
764  // spgemm computation...
765 
766  // Compute Ac = Pt * AP
767  // Two steps needed:
768  // 1. compute Pt
769  // 2. perform multiplication
770 
771  // Step 1 compute Pt
772  // Obviously this requires the same amount of storage as P except for the rowPtr
773  row_pointer_type rowPtrPt(Kokkos::ViewAllocateWithoutInitializing("Pt row pointer"),
774  intermediateP.numCols() + 1);
775  col_indices_type colIndPt(Kokkos::ViewAllocateWithoutInitializing("Pt column indices"),
776  intermediateP.nnz());
777  values_type valuesPt(Kokkos::ViewAllocateWithoutInitializing("Pt values"),
778  intermediateP.nnz());
779 
780  typename row_pointer_type::HostMirror rowPtrPt_h = Kokkos::create_mirror_view(rowPtrPt);
781  typename col_indices_type::HostMirror entries_h = Kokkos::create_mirror_view(intermediateP.graph.entries);
782  Kokkos::deep_copy(entries_h, intermediateP.graph.entries);
783  Kokkos::deep_copy(rowPtrPt_h, 0);
784  for (size_type entryIdx = 0; entryIdx < intermediateP.nnz(); ++entryIdx) {
785  rowPtrPt_h(entries_h(entryIdx) + 1) += 1;
786  }
787  for (LO rowIdx = 0; rowIdx < intermediateP.numCols(); ++rowIdx) {
788  rowPtrPt_h(rowIdx + 1) += rowPtrPt_h(rowIdx);
789  }
790  Kokkos::deep_copy(rowPtrPt, rowPtrPt_h);
791 
792  typename row_pointer_type::HostMirror rowPtrP_h = Kokkos::create_mirror_view(intermediateP.graph.row_map);
793  Kokkos::deep_copy(rowPtrP_h, intermediateP.graph.row_map);
794  typename col_indices_type::HostMirror colIndP_h = Kokkos::create_mirror_view(intermediateP.graph.entries);
795  Kokkos::deep_copy(colIndP_h, intermediateP.graph.entries);
796  typename values_type::HostMirror valuesP_h = Kokkos::create_mirror_view(intermediateP.values);
797  Kokkos::deep_copy(valuesP_h, intermediateP.values);
798  typename col_indices_type::HostMirror colIndPt_h = Kokkos::create_mirror_view(colIndPt);
799  typename values_type::HostMirror valuesPt_h = Kokkos::create_mirror_view(valuesPt);
800  const col_index_type invalidColumnIndex = KokkosSparse::OrdinalTraits<col_index_type>::invalid();
801  Kokkos::deep_copy(colIndPt_h, invalidColumnIndex);
802 
803  col_index_type colIdx = 0;
804  for (LO rowIdx = 0; rowIdx < intermediateP.numRows(); ++rowIdx) {
805  for (size_type entryIdxP = rowPtrP_h(rowIdx); entryIdxP < rowPtrP_h(rowIdx + 1); ++entryIdxP) {
806  colIdx = entries_h(entryIdxP);
807  for (size_type entryIdxPt = rowPtrPt_h(colIdx); entryIdxPt < rowPtrPt_h(colIdx + 1); ++entryIdxPt) {
808  if (colIndPt_h(entryIdxPt) == invalidColumnIndex) {
809  colIndPt_h(entryIdxPt) = rowIdx;
810  valuesPt_h(entryIdxPt) = valuesP_h(entryIdxP);
811  break;
812  }
813  } // Loop over entries in row of Pt
814  } // Loop over entries in row of P
815  } // Loop over rows of P
816 
817  Kokkos::deep_copy(colIndPt, colIndPt_h);
818  Kokkos::deep_copy(valuesPt, valuesPt_h);
819 
820  local_matrix_type intermediatePt("intermediatePt",
821  intermediateP.numCols(),
822  intermediateP.numRows(),
823  intermediateP.nnz(),
824  valuesPt, rowPtrPt, colIndPt);
825 
826  // Create views for coarseA matrix
827  localSpGEMM(intermediatePt, AP, "coarseA", coarseA);
828 } // BuildCoarseLocalMatrix
829 
830 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
832  localSpGEMM(const typename Matrix::local_matrix_type& A,
833  const typename Matrix::local_matrix_type& B,
834  const std::string matrixLabel,
835  typename Matrix::local_matrix_type& C) const {
836  using local_graph_type = typename local_matrix_type::staticcrsgraph_type;
837  using values_type = typename local_matrix_type::values_type;
838  using size_type = typename local_graph_type::size_type;
839  using col_index_type = typename local_graph_type::data_type;
840  using array_layout = typename local_graph_type::array_layout;
841  using memory_space = typename device_type::memory_space;
842  using memory_traits = typename local_graph_type::memory_traits;
843  using row_pointer_type = Kokkos::View<size_type*, array_layout, device_type, memory_traits>;
844  using col_indices_type = Kokkos::View<col_index_type*, array_layout, device_type, memory_traits>;
845 
846  // Options
847  int team_work_size = 16;
848  std::string myalg("SPGEMM_KK_MEMORY");
849  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
850  KokkosKernels::Experimental::KokkosKernelsHandle<typename row_pointer_type::const_value_type,
851  typename col_indices_type::const_value_type,
852  typename values_type::const_value_type,
854  memory_space,
855  memory_space>
856  kh;
857  kh.create_spgemm_handle(alg_enum);
858  kh.set_team_work_size(team_work_size);
859 
860  // Create views for AP matrix
861  row_pointer_type rowPtrC(Kokkos::ViewAllocateWithoutInitializing("C row pointer"),
862  A.numRows() + 1);
863  col_indices_type colIndC;
864  values_type valuesC;
865 
866  // Symbolic multiplication
867  KokkosSparse::Experimental::spgemm_symbolic(&kh, A.numRows(),
868  B.numRows(), B.numCols(),
869  A.graph.row_map, A.graph.entries, false,
870  B.graph.row_map, B.graph.entries, false,
871  rowPtrC);
872 
873  // allocate column indices and values of AP
874  size_t nnzC = kh.get_spgemm_handle()->get_c_nnz();
875  if (nnzC) {
876  colIndC = col_indices_type(Kokkos::ViewAllocateWithoutInitializing("C column inds"), nnzC);
877  valuesC = values_type(Kokkos::ViewAllocateWithoutInitializing("C values"), nnzC);
878  }
879 
880  // Numeric multiplication
881  KokkosSparse::Experimental::spgemm_numeric(&kh, A.numRows(),
882  B.numRows(), B.numCols(),
883  A.graph.row_map, A.graph.entries, A.values, false,
884  B.graph.row_map, B.graph.entries, B.values, false,
885  rowPtrC, colIndC, valuesC);
886  kh.destroy_spgemm_handle();
887 
888  C = local_matrix_type(matrixLabel, A.numRows(), B.numCols(), nnzC, valuesC, rowPtrC, colIndC);
889 
890 } // localSpGEMM
891 
892 } // namespace MueLu
893 
894 #endif /* MUELU_NOTAYAGGREGATIONFACTORY_DEF_HPP_ */
LocalOrdinal RandomOrdinal(LocalOrdinal min, LocalOrdinal max)
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
MueLu::DefaultLocalOrdinal LocalOrdinal
void Build(Level &currentLevel) const
Build aggregates.
const RCP< LOVector > & GetProcWinner() const
Returns constant vector that maps local node IDs to owning processor IDs.
Container class for aggregation information.
typename Matrix::local_matrix_type local_matrix_type
typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type
basic_FancyOStream & setShowProcRank(const bool showProcRank)
T & get(const std::string &name, T def_value)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
static RCP< Vector > GetMatrixOverlappedDeletedRowsum(const Matrix &A)
Extract Overlapped Matrix Deleted Rowsum.
static magnitudeType real(T a)
void BuildInitialAggregates(const Teuchos::ParameterList &params, const RCP< const Matrix > &A, const ArrayView< const LO > &orderingVector, const magnitude_type kappa, Aggregates &aggregates, std::vector< unsigned > &aggStat, LO &numNonAggregatedNodes, LO &numDirichletNodes) const
Initial aggregation phase.
size_type size() const
LocalOrdinal LO
void DeclareInput(Level &currentLevel) const
Input.
void BuildIntermediateProlongator(const LO numRows, const LO numDirichletNodes, const LO numLocalAggregates, const ArrayView< const LO > &localVertex2AggID, local_matrix_type &intermediateP) const
Construction of a local prolongator with values equal to 1.0.
ParameterList & set(std::string const &name, T &&value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
KOKKOS_INLINE_FUNCTION size_type GetNodeNumVertices() const
Return number of graph vertices.
static RCP< Xpetra::Vector< LocalOrdinal, LocalOrdinal, GlobalOrdinal, Node > > CuthillMcKee(const Matrix &Op)
void BuildFurtherAggregates(const Teuchos::ParameterList &params, const RCP< const Matrix > &A, const Teuchos::ArrayView< const LO > &orderingVector, const local_matrix_type &coarseA, const magnitude_type kappa, const row_sum_type &rowSum, std::vector< LO > &localAggStat, Array< LO > &localVertex2AggID, LO &numLocalAggregates, LO &numNonAggregatedNodes) const
Further aggregation phase increases coarsening rate by a factor of ~2 per iteration.
bool isParameter(const std::string &name) const
Print statistics that do not involve significant additional computation.
void BuildCoarseLocalMatrix(const local_matrix_type &intermediateP, local_matrix_type &coarseA) const
Implementation of a local Galerkin projection called inside BuildFurtherAggregates.
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:63
typename device_type::execution_space execution_space
static RCP< Xpetra::Vector< Magnitude, LocalOrdinal, GlobalOrdinal, Node > > GetMatrixOverlappedAbsDeletedRowsum(const Matrix &A)
void BuildOnRankLocalMatrix(const local_matrix_type &localA, local_matrix_type &onRankA) const
basic_FancyOStream & setShowAllFrontMatter(const bool showAllFrontMatter)
const RCP< LOMultiVector > & GetVertex2AggId() const
Returns constant vector that maps local node IDs to local aggregates IDs.
static RCP< Vector > GetMatrixOverlappedDiagonal(const Matrix &A)
Extract Overlapped Matrix Diagonal.
void localSpGEMM(const local_matrix_type &A, const local_matrix_type &B, const std::string matrixLabel, local_matrix_type &C) const
Wrapper for kokkos-kernels&#39; spgemm that takes in CrsMatrix.
Timer to be used in non-factories.
void RandomReorder(Teuchos::Array< LocalOrdinal > &list)
size_type size() const
#define SET_VALID_ENTRY(name)
Scalar SC
Exception throws to report errors in the internal logical of the program.
typename Kokkos::View< impl_scalar_type *, Kokkos::LayoutLeft, device_type > row_sum_type
void SetNumAggregates(LO nAggregates)
Set number of local aggregates on current processor.