Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_Container_def.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_CONTAINER_DEF_HPP
11 #define IFPACK2_CONTAINER_DEF_HPP
12 
14 #include <Teuchos_Time.hpp>
15 
16 namespace Ifpack2 {
17 
18 //Implementation of Ifpack2::Container
19 
20 template<class MatrixType>
23  const Teuchos::Array<Teuchos::Array<LO> >& partitions,
24  bool pointIndexed) :
25  inputMatrix_ (matrix),
26  inputCrsMatrix_ (Teuchos::rcp_dynamic_cast<const crs_matrix_type>(inputMatrix_)),
27  inputBlockMatrix_ (Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(inputMatrix_)),
28  pointIndexed_(pointIndexed),
29  IsInitialized_(false),
30  IsComputed_(false)
31 {
32  using Teuchos::Ptr;
33  using Teuchos::RCP;
34  using Teuchos::Array;
35  using Teuchos::ArrayView;
36  using Teuchos::Comm;
37  NumLocalRows_ = inputMatrix_->getLocalNumRows();
38  NumGlobalRows_ = inputMatrix_->getGlobalNumRows();
39  NumGlobalNonzeros_ = inputMatrix_->getGlobalNumEntries();
40  IsParallel_ = inputMatrix_->getRowMap()->getComm()->getSize() != 1;
42  if(hasBlockCrs_)
43  bcrsBlockSize_ = inputBlockMatrix_->getBlockSize();
44  else
45  bcrsBlockSize_ = 1;
48  else
49  scalarsPerRow_ = 1;
50  setBlockSizes(partitions);
51  //Sanity check the partitions
52  #ifdef HAVE_IFPACK2_DEBUG
53  // Check whether the input set of local row indices is correct.
54  const map_type& rowMap = *inputMatrix_->getRowMap();
55  for(int i = 0; i < numBlocks_; i++)
56  {
58  for(LO j = 0; j < blockSizes_[i]; j++)
59  {
60  LO row = blockRows[j];
61  if(pointIndexed)
62  {
63  //convert the point row to the corresponding block row
64  row /= bcrsBlockSize_;
65  }
67  !rowMap.isNodeLocalElement(row),
68  std::invalid_argument, "Ifpack2::Container: "
69  "On process " << rowMap.getComm()->getRank() << " of "
70  << rowMap.getComm()->getSize() << ", in the given set of local row "
71  "indices blockRows = " << Teuchos::toString(blockRows) << ", the following "
72  "entries is not valid local row index on the calling process: "
73  << row << ".");
74  }
75  }
76  #endif
77 }
78 
79 template<class MatrixType>
82 
83 template<class MatrixType>
86 {
88  (&blockRows_[blockOffsets_[blockIndex]], blockSizes_[blockIndex]);
89 }
90 
91 template<class MatrixType>
93 {
94  //First, create a grand total of all the rows in all the blocks
95  //Note: If partitioner allowed overlap, this could be greater than the # of local rows
96  LO totalBlockRows = 0;
97  numBlocks_ = partitions.size();
98  blockSizes_.resize(numBlocks_);
99  blockOffsets_.resize(numBlocks_);
100  maxBlockSize_ = 0;
101  for(int i = 0; i < numBlocks_; i++)
102  {
103  LO rowsInBlock = partitions[i].size();
104  blockSizes_[i] = rowsInBlock;
105  blockOffsets_[i] = totalBlockRows;
106  totalBlockRows += rowsInBlock;
107  maxBlockSize_ = std::max(maxBlockSize_, rowsInBlock * scalarsPerRow_);
108  }
109  blockRows_.resize(totalBlockRows);
110  //set blockRows_: each entry is the partition/block of the row
111  LO iter = 0;
112  for(int i = 0; i < numBlocks_; i++)
113  {
114  for(int j = 0; j < blockSizes_[i]; j++)
115  {
116  blockRows_[iter++] = partitions[i][j];
117  }
118  }
119 }
120 
121 template<class MatrixType>
123 {
124  if(Diag_.is_null())
125  {
126  Diag_ = rcp(new vector_type(inputMatrix_->getDomainMap()));
127  inputMatrix_->getLocalDiagCopy(*Diag_);
128  }
129 }
130 
131 template<class MatrixType>
133  return IsInitialized_;
134 }
135 
136 template<class MatrixType>
138  return IsComputed_;
139 }
140 
141 template<class MatrixType>
143 applyMV(const mv_type& X, mv_type& Y) const
144 {
145  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
146 }
147 
148 template<class MatrixType>
150 weightedApplyMV(const mv_type& X, mv_type& Y, vector_type& W) const
151 {
152  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
153 }
154 
155 template<class MatrixType>
156 std::string Container<MatrixType>::
158 {
159  return "Generic";
160 }
161 
162 template<class MatrixType>
163 void Container<MatrixType>::DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid,
164  SC dampingFactor, LO i) const
165 {
166  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
167 }
168 
169 template <class MatrixType>
170 void Container<MatrixType>::DoJacobi(ConstHostView X, HostView Y, SC dampingFactor) const
171 {
172  using STS = Teuchos::ScalarTraits<ISC>;
173  const ISC one = STS::one();
174  // use blockRows_ and blockSizes_
175  size_t numVecs = X.extent(1);
176  // Non-overlapping Jacobi
177  for (LO i = 0; i < numBlocks_; i++)
178  {
179  // may happen that a partition is empty
180  if(blockSizes_[i] != 1 || hasBlockCrs_)
181  {
182  if(blockSizes_[i] == 0 )
183  continue;
184  apply(X, Y, i, Teuchos::NO_TRANS, dampingFactor, one);
185  }
186  else // singleton, can't access Containers_[i] as it was never filled and may be null.
187  {
188  LO LRID = blockRows_[blockOffsets_[i]];
189  getMatDiag();
190  auto diagView = Diag_->getLocalViewHost(Tpetra::Access::ReadOnly);
191  ISC d = one * (static_cast<ISC> (dampingFactor)) / diagView(LRID, 0);
192  for(size_t nv = 0; nv < numVecs; nv++)
193  {
194  ISC x = X(LRID, nv);
195  Y(LRID, nv) += x * d;
196  }
197  }
198  }
199 }
200 
201 template <class MatrixType>
202 void Container<MatrixType>::DoOverlappingJacobi(ConstHostView X, HostView Y, ConstHostView W, SC dampingFactor, bool nonsymScaling) const
203 {
204  using STS = Teuchos::ScalarTraits<SC>;
205  // Overlapping Jacobi
206  for(LO i = 0; i < numBlocks_; i++)
207  {
208  // may happen that a partition is empty
209  if(blockSizes_[i] == 0)
210  continue;
211  if(blockSizes_[i] != 1) {
212  if (!nonsymScaling)
213  weightedApply(X, Y, W, i, Teuchos::NO_TRANS, dampingFactor, STS::one());
214  else {
215  // A crummy way of doing nonsymmetric scaling. We effectively
216  // first reverse scale x, which later gets scaled inside weightedApply
217  // so the net effect is that x is not scaled.
218  // This was done to keep using weightedApply() that is defined in
219  // many spots in the code.
220  HostView tempo("", X.extent(0), X.extent(1));
221  size_t numVecs = X.extent(1);
222  LO bOffset = blockOffsets_[i];
223  for (LO ii = 0; ii < blockSizes_[i]; ii++) {
224  LO LRID = blockRows_[bOffset++];
225  for (size_t jj = 0; jj < numVecs; jj++) tempo(LRID,jj)=X(LRID,jj)/ W(LRID,0);
226  }
227  weightedApply(tempo, Y, W, i, Teuchos::NO_TRANS, dampingFactor, STS::one());
228  }
229  }
230  else // singleton, can't access Containers_[i] as it was never filled and may be null.
231  {
232  const ISC one = STS::one();
233  size_t numVecs = X.extent(1);
234  LO LRID = blockRows_[blockOffsets_[i]];
235  getMatDiag();
236  auto diagView = Diag_->getLocalViewHost(Tpetra::Access::ReadOnly);
237  ISC recip = one / diagView(LRID, 0);
238  ISC wval = W(LRID,0);
239  ISC combo = wval*recip;
240  ISC d = combo*(static_cast<ISC> (dampingFactor));
241  for(size_t nv = 0; nv < numVecs; nv++)
242  {
243  ISC x = X(LRID, nv);
244  Y(LRID, nv) = x * d + Y(LRID, nv);
245  }
246  }
247  }
248 }
249 
250 //Do Gauss-Seidel with just block i
251 //This is used 3 times: once in DoGaussSeidel and twice in DoSGS
252 template<class MatrixType, typename LocalScalarType>
254  ConstHostView X, HostView Y, HostView Y2, HostView Resid,
255  SC dampingFactor, LO i) const
256 {
257  using Teuchos::ArrayView;
258  using STS = Teuchos::ScalarTraits<ISC>;
259  size_t numVecs = X.extent(1);
260  const ISC one = STS::one();
261  if(this->blockSizes_[i] == 0)
262  return; // Skip empty partitions
263  if(this->hasBlockCrs_ && !this->pointIndexed_)
264  {
265  //Use efficient blocked version
266  ArrayView<const LO> blockRows = this->getBlockRows(i);
267  const size_t localNumRows = this->blockSizes_[i];
268  using inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
269  using vals_type = typename block_crs_matrix_type::values_host_view_type;
270  for(size_t j = 0; j < localNumRows; j++)
271  {
272  LO row = blockRows[j]; // Containers_[i]->ID (j);
273  vals_type values;
274  inds_type colinds;
275  this->inputBlockMatrix_->getLocalRowView(row, colinds, values);
276  LO numEntries = (LO) colinds.size();
277  for(size_t m = 0; m < numVecs; m++)
278  {
279  for (int localR = 0; localR < this->bcrsBlockSize_; localR++)
280  Resid(row * this->bcrsBlockSize_ + localR, m) = X(row * this->bcrsBlockSize_ + localR, m);
281  for (LO k = 0; k < numEntries; ++k)
282  {
283  const LO col = colinds[k];
284  for(int localR = 0; localR < this->bcrsBlockSize_; localR++)
285  {
286  for(int localC = 0; localC < this->bcrsBlockSize_; localC++)
287  {
288  Resid(row * this->bcrsBlockSize_ + localR, m) -=
289  values[k * this->bcrsBlockSize_ * this->bcrsBlockSize_ + localR + localC * this->bcrsBlockSize_]
290  * Y2(col * this->bcrsBlockSize_ + localC, m); }
291  }
292  }
293  }
294  }
295  // solve with this block
296  //
297  // Note: I'm abusing the ordering information, knowing that X/Y
298  // and Y2 have the same ordering for on-proc unknowns.
299  //
300  // Note: Add flop counts for inverse apply
301  apply(Resid, Y2, i, Teuchos::NO_TRANS, dampingFactor, one);
302  }
303  else if(!this->hasBlockCrs_ && this->blockSizes_[i] == 1)
304  {
305  // singleton, can't access Containers_[i] as it was never filled and may be null.
306  // a singleton calculation (just using matrix diagonal) is exact, all residuals should be zero.
307  LO LRID = this->blockOffsets_[i]; // by definition, a singleton 1 row in block.
308  //Use the KokkosSparse internal matrix for low-overhead values/indices access
309  //But, can only do this if the matrix is accessible directly from host, since it's not a DualView
311  container_exec_space().fence();
312  auto localA = this->inputCrsMatrix_->getLocalMatrixHost();
313  using size_type = typename crs_matrix_type::local_matrix_host_type::size_type;
314  const auto& rowmap = localA.graph.row_map;
315  const auto& entries = localA.graph.entries;
316  const auto& values = localA.values;
317  this->getMatDiag();
318  auto diagView = this->Diag_->getLocalViewHost(Tpetra::Access::ReadOnly);
319  ISC d = (static_cast<ISC> (dampingFactor)) / diagView(LRID, 0);
320  for(size_t m = 0; m < numVecs; m++)
321  {
322  // ISC x = X(LRID, m);
323  ISC r = X(LRID, m);
324  for(size_type k = rowmap(LRID); k < rowmap(LRID + 1); k++) {
325  const LO col = entries(k);
326  r -= values(k) * Y2(col, m);
327  }
328 
329  ISC newy = r * d;
330  Y2(LRID, m) += newy;
331  }
332  }
333  else if(!this->inputCrsMatrix_.is_null() &&
334  std::is_same<typename crs_matrix_type::device_type::memory_space, Kokkos::HostSpace>::value)
335  {
336  //Use the KokkosSparse internal matrix for low-overhead values/indices access
337  //But, can only do this if the matrix is accessible directly from host, since it's not a DualView
339  container_exec_space().fence();
340  auto localA = this->inputCrsMatrix_->getLocalMatrixHost();
341  using size_type = typename crs_matrix_type::local_matrix_host_type::size_type;
342  const auto& rowmap = localA.graph.row_map;
343  const auto& entries = localA.graph.entries;
344  const auto& values = localA.values;
345  ArrayView<const LO> blockRows = this->getBlockRows(i);
346  for(size_t j = 0; j < size_t(blockRows.size()); j++)
347  {
348  const LO row = blockRows[j];
349  for(size_t m = 0; m < numVecs; m++)
350  {
351  ISC r = X(row, m);
352  for(size_type k = rowmap(row); k < rowmap(row + 1); k++)
353  {
354  const LO col = entries(k);
355  r -= values(k) * Y2(col, m);
356  }
357  Resid(row, m) = r;
358  }
359  }
360  // solve with this block
361  //
362  // Note: I'm abusing the ordering information, knowing that X/Y
363  // and Y2 have the same ordering for on-proc unknowns.
364  //
365  // Note: Add flop counts for inverse apply
366  apply(Resid, Y2, i, Teuchos::NO_TRANS, dampingFactor, one);
367  }
368  else
369  {
370  //Either a point-indexed block matrix, or a normal row matrix
371  //that doesn't support getLocalMatrix
372  ArrayView<const LO> blockRows = this->getBlockRows(i);
373  for(size_t j = 0; j < size_t(blockRows.size()); j++)
374  {
375  const LO row = blockRows[j];
376  auto rowView = getInputRowView(row);
377  for(size_t m = 0; m < numVecs; m++)
378  {
379  Resid(row, m) = X(row, m);
380  for (size_t k = 0; k < rowView.size(); ++k)
381  {
382  const LO col = rowView.ind(k);
383  Resid(row, m) -= rowView.val(k) * Y2(col, m);
384  }
385  }
386  }
387  // solve with this block
388  //
389  // Note: I'm abusing the ordering information, knowing that X/Y
390  // and Y2 have the same ordering for on-proc unknowns.
391  //
392  // Note: Add flop counts for inverse apply
393  apply(Resid, Y2, i, Teuchos::NO_TRANS, dampingFactor, one);
394  }
395 }
396 
397 template<class MatrixType>
399 DoGaussSeidel(ConstHostView X, HostView Y, HostView Y2, SC dampingFactor) const
400 {
401  using Teuchos::Array;
402  using Teuchos::ArrayRCP;
403  using Teuchos::ArrayView;
404  using Teuchos::Ptr;
405  using Teuchos::RCP;
406  using Teuchos::rcp;
407  using Teuchos::rcpFromRef;
408  //This function just extracts the diagonal if it hasn't already.
409  getMatDiag();
410  auto numVecs = X.extent(1);
411  // X = RHS, Y = initial guess
412  HostView Resid("", X.extent(0), X.extent(1));
413  for(LO i = 0; i < numBlocks_; i++)
414  {
415  DoGSBlock(X, Y, Y2, Resid, dampingFactor, i);
416  }
417  if(IsParallel_)
418  {
419  auto numMyRows = inputMatrix_->getLocalNumRows();
420  for (size_t m = 0; m < numVecs; ++m)
421  {
422  for (size_t i = 0; i < numMyRows * bcrsBlockSize_; ++i)
423  {
424  Y(i, m) = Y2(i, m);
425  }
426  }
427  }
428 }
429 
430 template<class MatrixType>
431 void Container<MatrixType>::
432 DoSGS(ConstHostView X, HostView Y, HostView Y2, SC dampingFactor) const
433 {
434  // X = RHS, Y = initial guess
435  using Teuchos::Array;
436  using Teuchos::ArrayRCP;
437  using Teuchos::ArrayView;
438  using Teuchos::Ptr;
439  using Teuchos::ptr;
440  using Teuchos::RCP;
441  using Teuchos::rcp;
442  using Teuchos::rcpFromRef;
443  auto numVecs = X.extent(1);
444  HostView Resid("", X.extent(0), X.extent(1));
445  // Forward Sweep
446  for(LO i = 0; i < numBlocks_; i++)
447  {
448  DoGSBlock(X, Y, Y2, Resid, dampingFactor, i);
449  }
450  static_assert(std::is_signed<LO>::value,
451  "Local ordinal must be signed (unsigned breaks reverse iteration to 0)");
452  // Reverse Sweep
453  for(LO i = numBlocks_ - 1; i >= 0; --i)
454  {
455  DoGSBlock(X, Y, Y2, Resid, dampingFactor, i);
456  }
457  if(IsParallel_)
458  {
459  auto numMyRows = inputMatrix_->getLocalNumRows();
460  for (size_t m = 0; m < numVecs; ++m)
461  {
462  for (size_t i = 0; i < numMyRows * bcrsBlockSize_; ++i)
463  {
464  Y(i, m) = Y2(i, m);
465  }
466  }
467  }
468 }
469 
470 template<class MatrixType>
471 void Container<MatrixType>::
472 clearBlocks()
473 {
474  numBlocks_ = 0;
475  blockRows_.clear();
476  blockSizes_.clear();
477  blockOffsets_.clear();
478  Diag_ = Teuchos::null; //Diag_ will be recreated if needed
479 }
480 
481 //Implementation of Ifpack2::ContainerImpl
482 
483 template<class MatrixType, class LocalScalarType>
484 ContainerImpl<MatrixType, LocalScalarType>::
485 ContainerImpl(
487  const Teuchos::Array<Teuchos::Array<LO> >& partitions,
488  bool pointIndexed)
489  : Container<MatrixType>(matrix, partitions, pointIndexed) {}
490 
491 template<class MatrixType, class LocalScalarType>
494 
495 template<class MatrixType, class LocalScalarType>
498 
499 template<class MatrixType, class LocalScalarType>
501 applyInverseJacobi (const mv_type& /* X */, mv_type& /* Y */,
502  SC dampingFactor,
503  bool /* zeroStartingSolution = false */,
504  int /* numSweeps = 1 */) const
505 {
506  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
507 }
508 
509 template<class MatrixType, class LocalScalarType>
511 applyMV (const mv_type& X, mv_type& Y) const
512 {
513  ConstHostView XView = X.getLocalViewHost(Tpetra::Access::ReadOnly);
514  HostView YView = Y.getLocalViewHost(Tpetra::Access::ReadWrite);
515  this->apply (XView, YView, 0);
516 }
517 
518 template<class MatrixType, class LocalScalarType>
520 weightedApplyMV (const mv_type& X,
521  mv_type& Y,
522  vector_type& W) const
523 {
524  ConstHostView XView = X.getLocalViewHost(Tpetra::Access::ReadOnly);
525  HostView YView = Y.getLocalViewHost(Tpetra::Access::ReadWrite);
526  ConstHostView WView = W.getLocalViewHost(Tpetra::Access::ReadOnly);
527  weightedApply (XView, YView, WView, 0);
528 }
529 
530 template<class MatrixType, class LocalScalarType>
533 {
534  return "Generic";
535 }
536 
537 template<class MatrixType, class LocalScalarType>
539 solveBlock(ConstHostSubviewLocal X,
540  HostSubviewLocal Y,
541  int blockIndex,
542  Teuchos::ETransp mode,
543  const LSC alpha,
544  const LSC beta) const
545 {
546  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
547 }
548 
549 template<class MatrixType, class LocalScalarType>
550 typename ContainerImpl<MatrixType, LocalScalarType>::LO
553 {
554  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
555  GO GO_INVALID = Teuchos::OrdinalTraits<GO>::invalid();
556  const map_type& globalRowMap = *(this->inputMatrix_->getRowMap());
557  const map_type& globalColMap = *(this->inputMatrix_->getColMap());
558  LO rowLID = row;
559  LO dofOffset = 0;
560  if(this->pointIndexed_)
561  {
562  rowLID = row / this->bcrsBlockSize_;
563  dofOffset = row % this->bcrsBlockSize_;
564  }
565  GO diagGID = globalRowMap.getGlobalElement(rowLID);
567  diagGID == GO_INVALID,
568  std::runtime_error, "Ifpack2::Container::translateRowToCol: "
569  "On process " << this->inputMatrix_->getRowMap()->getComm()->getRank() <<
570  ", at least one row index in the set of local "
571  "row indices given to the constructor is not a valid local row index in "
572  "the input matrix's row Map on this process. This should be impossible "
573  "because the constructor checks for this case. Here is the complete set "
574  "of invalid local row indices: " << rowLID << ". "
575  "Please report this bug to the Ifpack2 developers.");
576  //now, can translate diagGID (both a global row AND global col ID) to local column
577  LO colLID = globalColMap.getLocalElement(diagGID);
579  colLID == LO_INVALID,
580  std::runtime_error, "Ifpack2::Container::translateRowToCol: "
581  "On process " << this->inputMatrix_->getRowMap()->getComm()->getRank() << ", "
582  "at least one row index in the set of row indices given to the constructor "
583  "does not have a corresponding column index in the input matrix's column "
584  "Map. This probably means that the column(s) in question is/are empty on "
585  "this process, which would make the submatrix to extract structurally "
586  "singular. The invalid global column index is " << diagGID << ".");
587  //colLID could identify a block column - translate to split column if needed
588  if(this->pointIndexed_)
589  return colLID * this->bcrsBlockSize_ + dofOffset;
590  return colLID;
591 }
592 
593 template<class MatrixType, class LocalScalarType>
595 apply (ConstHostView X,
596  HostView Y,
597  int blockIndex,
598  Teuchos::ETransp mode,
599  SC alpha,
600  SC beta) const
601 {
602  using Teuchos::ArrayView;
603  using Teuchos::as;
604  using Teuchos::RCP;
605  using Teuchos::rcp;
606 
607  // The local operator might have a different Scalar type than
608  // MatrixType. This means that we might have to convert X and Y to
609  // the Tpetra::MultiVector specialization that the local operator
610  // wants. This class' X_ and Y_ internal fields are of the right
611  // type for the local operator, so we can use those as targets.
612 
614 
616  ! this->IsComputed_, std::runtime_error, "Ifpack2::Container::apply: "
617  "You must have called the compute() method before you may call apply(). "
618  "You may call the apply() method as many times as you want after calling "
619  "compute() once, but you must have called compute() at least once.");
620 
621  const size_t numVecs = X.extent(1);
622 
623  if(numVecs == 0) {
624  return; // done! nothing to do
625  }
626 
627  // The local operator works on a permuted subset of the local parts
628  // of X and Y. The subset and permutation are defined by the index
629  // array returned by getBlockRows(). If the permutation is trivial
630  // and the subset is exactly equal to the local indices, then we
631  // could use the local parts of X and Y exactly, without needing to
632  // permute. Otherwise, we have to use temporary storage to permute
633  // X and Y. For now, we always use temporary storage.
634  //
635  // Create temporary permuted versions of the input and output.
636  // (Re)allocate X_ and/or Y_ only if necessary. We'll use them to
637  // store the permuted versions of X resp. Y. Note that X_local has
638  // the domain Map of the operator, which may be a permuted subset of
639  // the local Map corresponding to X.getMap(). Similarly, Y_local
640  // has the range Map of the operator, which may be a permuted subset
641  // of the local Map corresponding to Y.getMap(). numRows_ here
642  // gives the number of rows in the row Map of the local Inverse_
643  // operator.
644  //
645  // FIXME (mfh 20 Aug 2013) There might be an implicit assumption
646  // here that the row Map and the range Map of that operator are
647  // the same.
648  //
649  // FIXME (mfh 20 Aug 2013) This "local permutation" functionality
650  // really belongs in Tpetra.
651 
652  if(X_localBlocks_.size() == 0 || X.extent(1) != X_local_.extent(1))
653  {
654  //need to resize (or create for the first time) the three scratch arrays
655  X_localBlocks_.clear();
656  Y_localBlocks_.clear();
657  X_localBlocks_.reserve(this->numBlocks_);
658  Y_localBlocks_.reserve(this->numBlocks_);
659 
660  X_local_ = HostViewLocal("X_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
661  Y_local_ = HostViewLocal("Y_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
662 
663  //create all X_local and Y_local managed Views at once, are
664  //reused in subsequent apply() calls
665  for(int i = 0; i < this->numBlocks_; i++)
666  {
667  auto blockBounds = std::make_pair(this->blockOffsets_[i] * this->scalarsPerRow_,
668  (this->blockOffsets_[i] + this->blockSizes_[i]) * this->scalarsPerRow_);
669  X_localBlocks_.emplace_back(X_local_, blockBounds, Kokkos::ALL());
670  Y_localBlocks_.emplace_back(Y_local_, blockBounds, Kokkos::ALL());
671  }
672  }
673 
674  const ArrayView<const LO> blockRows = this->getBlockRows(blockIndex);
675 
676  if(this->scalarsPerRow_ == 1)
677  mvgs.gatherViewToView (X_localBlocks_[blockIndex], X, blockRows);
678  else
679  mvgs.gatherViewToViewBlock (X_localBlocks_[blockIndex], X, blockRows, this->scalarsPerRow_);
680 
681  // We must gather the contents of the output multivector Y even on
682  // input to solveBlock(), since the inverse operator might use it as
683  // an initial guess for a linear solve. We have no way of knowing
684  // whether it does or does not.
685 
686  if(this->scalarsPerRow_ == 1)
687  mvgs.gatherViewToView (Y_localBlocks_[blockIndex], Y, blockRows);
688  else
689  mvgs.gatherViewToViewBlock (Y_localBlocks_[blockIndex], Y, blockRows, this->scalarsPerRow_);
690 
691  // Apply the local operator:
692  // Y_local := beta*Y_local + alpha*M^{-1}*X_local
693  this->solveBlock (X_localBlocks_[blockIndex], Y_localBlocks_[blockIndex], blockIndex, mode,
694  LSC(alpha), LSC(beta));
695 
696  // Scatter the permuted subset output vector Y_local back into the
697  // original output multivector Y.
698  if(this->scalarsPerRow_ == 1)
699  mvgs.scatterViewToView (Y, Y_localBlocks_[blockIndex], blockRows);
700  else
701  mvgs.scatterViewToViewBlock (Y, Y_localBlocks_[blockIndex], blockRows, this->scalarsPerRow_);
702 }
703 
704 template<class MatrixType, class LocalScalarType>
706 weightedApply(ConstHostView X,
707  HostView Y,
708  ConstHostView D,
709  int blockIndex,
710  Teuchos::ETransp mode,
711  SC alpha,
712  SC beta) const
713 {
714  using Teuchos::ArrayRCP;
715  using Teuchos::ArrayView;
716  using Teuchos::Range1D;
717  using Teuchos::Ptr;
718  using Teuchos::ptr;
719  using Teuchos::RCP;
720  using Teuchos::rcp;
721  using Teuchos::rcp_const_cast;
722  using std::endl;
723  using STS = Teuchos::ScalarTraits<SC>;
724 
725  // The local operator template parameter might have a different
726  // Scalar type than MatrixType. This means that we might have to
727  // convert X and Y to the Tpetra::MultiVector specialization that
728  // the local operator wants. This class' X_ and Y_ internal fields
729  // are of the right type for the local operator, so we can use those
730  // as targets.
731 
732  const char prefix[] = "Ifpack2::Container::weightedApply: ";
734  ! this->IsComputed_, std::runtime_error, prefix << "You must have called the "
735  "compute() method before you may call this method. You may call "
736  "weightedApply() as many times as you want after calling compute() once, "
737  "but you must have called compute() at least once first.");
738 
739  //bmk 7-2019: BlockRelaxation already checked this, but if that changes...
741  this->scalarsPerRow_ > 1, std::logic_error, prefix << "Use of block rows isn't allowed "
742  "in overlapping Jacobi (the only method that uses weightedApply");
743 
744  const size_t numVecs = X.extent(1);
745 
747  X.extent(1) != Y.extent(1), std::runtime_error,
748  prefix << "X and Y have different numbers of vectors (columns). X has "
749  << X.extent(1) << ", but Y has " << Y.extent(1) << ".");
750 
751  if(numVecs == 0) {
752  return; // done! nothing to do
753  }
754 
755  const size_t numRows = this->blockSizes_[blockIndex];
756 
757  // The local operator works on a permuted subset of the local parts
758  // of X and Y. The subset and permutation are defined by the index
759  // array returned by getBlockRows(). If the permutation is trivial
760  // and the subset is exactly equal to the local indices, then we
761  // could use the local parts of X and Y exactly, without needing to
762  // permute. Otherwise, we have to use temporary storage to permute
763  // X and Y. For now, we always use temporary storage.
764  //
765  // Create temporary permuted versions of the input and output.
766  // (Re)allocate X_ and/or Y_ only if necessary. We'll use them to
767  // store the permuted versions of X resp. Y. Note that X_local has
768  // the domain Map of the operator, which may be a permuted subset of
769  // the local Map corresponding to X.getMap(). Similarly, Y_local
770  // has the range Map of the operator, which may be a permuted subset
771  // of the local Map corresponding to Y.getMap(). numRows_ here
772  // gives the number of rows in the row Map of the local operator.
773  //
774  // FIXME (mfh 20 Aug 2013) There might be an implicit assumption
775  // here that the row Map and the range Map of that operator are
776  // the same.
777  //
778  // FIXME (mfh 20 Aug 2013) This "local permutation" functionality
779  // really belongs in Tpetra.
780  if(X_localBlocks_.size() == 0 || X.extent(1) != X_local_.extent(1))
781  {
782  //need to resize (or create for the first time) the three scratch arrays
783  X_localBlocks_.clear();
784  Y_localBlocks_.clear();
785  X_localBlocks_.reserve(this->numBlocks_);
786  Y_localBlocks_.reserve(this->numBlocks_);
787 
788  X_local_ = HostViewLocal("X_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
789  Y_local_ = HostViewLocal("Y_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
790 
791  //create all X_local and Y_local managed Views at once, are
792  //reused in subsequent apply() calls
793  for(int i = 0; i < this->numBlocks_; i++)
794  {
795  auto blockBounds = std::make_pair(this->blockOffsets_[i] * this->scalarsPerRow_,
796  (this->blockOffsets_[i] + this->blockSizes_[i]) * this->scalarsPerRow_);
797  X_localBlocks_.emplace_back(X_local_, blockBounds, Kokkos::ALL());
798  Y_localBlocks_.emplace_back(Y_local_, blockBounds, Kokkos::ALL());
799  }
800  }
801  if(int(weightedApplyScratch_.extent(0)) != 3 * this->maxBlockSize_ ||
802  weightedApplyScratch_.extent(1) != numVecs)
803  {
804  weightedApplyScratch_ = HostViewLocal("weightedApply scratch", 3 * this->maxBlockSize_, numVecs);
805  }
806 
807  ArrayView<const LO> blockRows = this->getBlockRows(blockIndex);
808 
810 
811  //note: BlockCrs w/ weighted Jacobi isn't allowed, so no need to use block gather/scatter
812  mvgs.gatherViewToView (X_localBlocks_[blockIndex], X, blockRows);
813  // We must gather the output multivector Y even on input to
814  // solveBlock(), since the local operator might use it as an initial
815  // guess for a linear solve. We have no way of knowing whether it
816  // does or does not.
817 
818  mvgs.gatherViewToView (Y_localBlocks_[blockIndex], Y, blockRows);
819 
820  // Apply the diagonal scaling D to the input X. It's our choice
821  // whether the result has the original input Map of X, or the
822  // permuted subset Map of X_local. If the latter, we also need to
823  // gather D into the permuted subset Map. We choose the latter, to
824  // save memory and computation. Thus, we do the following:
825  //
826  // 1. Gather D into a temporary vector D_local.
827  // 2. Create a temporary X_scaled to hold diag(D_local) * X_local.
828  // 3. Compute X_scaled := diag(D_loca) * X_local.
829  auto maxBS = this->maxBlockSize_;
830  auto bs = this->blockSizes_[blockIndex] * this->scalarsPerRow_;
831 
832  HostSubviewLocal D_local(weightedApplyScratch_, std::make_pair(0, bs), std::make_pair(0, 1));
833  mvgs.gatherViewToView (D_local, D, blockRows);
834  HostSubviewLocal X_scaled(weightedApplyScratch_, std::make_pair(maxBS, maxBS + bs), Kokkos::ALL());
835  for(size_t j = 0; j < numVecs; j++)
836  for(size_t i = 0; i < numRows; i++)
837  X_scaled(i, j) = X_localBlocks_[blockIndex](i, j) * D_local(i, 0);
838 
839  HostSubviewLocal Y_temp(weightedApplyScratch_, std::make_pair(maxBS * 2, maxBS * 2 + bs), Kokkos::ALL());
840  // Apply the local operator: Y_temp := M^{-1} * X_scaled
841  this->solveBlock (X_scaled, Y_temp, blockIndex, mode, STS::one(), STS::zero());
842  // Y_local := beta * Y_local + alpha * diag(D_local) * Y_temp.
843  //
844  // Note that we still use the permuted subset scaling D_local here,
845  // because Y_temp has the same permuted subset Map. That's good, in
846  // fact, because it's a subset: less data to read and multiply.
847  LISC a = alpha;
848  LISC b = beta;
849  for(size_t j = 0; j < numVecs; j++)
850  for(size_t i = 0; i < numRows; i++)
851  Y_localBlocks_[blockIndex](i, j) = b * Y_localBlocks_[blockIndex](i, j) + a * Y_temp(i, j) * D_local(i, 0);
852 
853  // Copy the permuted subset output vector Y_local into the original
854  // output multivector Y.
855  mvgs.scatterViewToView (Y, Y_localBlocks_[blockIndex], blockRows);
856 }
857 
858 template<class MatrixType, class LocalScalarType>
860  typename ContainerImpl<MatrixType, LocalScalarType>::SC,
861  typename ContainerImpl<MatrixType, LocalScalarType>::LO,
862  typename ContainerImpl<MatrixType, LocalScalarType>::GO,
863  typename ContainerImpl<MatrixType, LocalScalarType>::NO>
865 getInputRowView(LO row) const
866 {
867 
868  typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
869  typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
870 
871  typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
872  typedef typename MatrixType::values_host_view_type values_host_view_type;
873  using IST = typename row_matrix_type::impl_scalar_type;
874 
875  if(this->hasBlockCrs_)
876  {
877  using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
878  using h_vals_type = typename block_crs_matrix_type::values_host_view_type;
879  h_inds_type colinds;
880  h_vals_type values;
881  this->inputBlockMatrix_->getLocalRowView(row / this->bcrsBlockSize_, colinds, values);
882  size_t numEntries = colinds.size();
883  // CMS: Can't say I understand what this really does
884  //return StridedRowView(values + row % this->bcrsBlockSize_, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_);
885  h_vals_type subvals = Kokkos::subview(values,std::pair<size_t,size_t>(row % this->bcrsBlockSize_,values.size()));
886  return StridedRowView(subvals, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_);
887  }
888  else if(!this->inputMatrix_->supportsRowViews())
889  {
890  size_t maxEntries = this->inputMatrix_->getLocalMaxNumRowEntries();
891  Teuchos::Array<LO> inds(maxEntries);
892  Teuchos::Array<SC> vals(maxEntries);
893  nonconst_local_inds_host_view_type inds_v(inds.data(),maxEntries);
894  nonconst_values_host_view_type vals_v(reinterpret_cast<IST*>(vals.data()),maxEntries);
895  size_t numEntries;
896  this->inputMatrix_->getLocalRowCopy(row, inds_v, vals_v, numEntries);
897  vals.resize(numEntries); inds.resize(numEntries);
898  return StridedRowView(vals, inds);
899  }
900  else
901  {
902  // CMS - This is dangerous and might not work.
903  local_inds_host_view_type colinds;
904  values_host_view_type values;
905  this->inputMatrix_->getLocalRowView(row, colinds, values);
906  return StridedRowView(values, colinds, 1, colinds.size());
907  }
908 }
909 
910 template<class MatrixType, class LocalScalarType>
912 clearBlocks()
913 {
914  X_localBlocks_.clear();
915  Y_localBlocks_.clear();
916  X_local_ = HostViewLocal();
917  Y_local_ = HostViewLocal();
919 }
920 
921 namespace Details {
922 
923 //Implementation of Ifpack2::Details::StridedRowView
924 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
926 StridedRowView(h_vals_type vals_, h_inds_type inds_, int blockSize_, size_t nnz_)
927  : vals(vals_), inds(inds_), blockSize(blockSize_), nnz(nnz_)
928 {}
929 
930 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
933  : vals(), inds(), blockSize(1), nnz(vals_.size())
934 {
935  valsCopy.swap(vals_);
936  indsCopy.swap(inds_);
937 }
938 
939 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
941 val(size_t i) const
942 {
943  #ifdef HAVE_IFPACK2_DEBUG
944  TEUCHOS_TEST_FOR_EXCEPTION(i >= nnz, std::runtime_error,
945  "Out-of-bounds access into Ifpack2::Container::StridedRowView");
946  #endif
947  if(vals.size() > 0)
948  {
949  if(blockSize == 1)
950  return vals[i];
951  else
952  return vals[i * blockSize];
953  }
954  else
955  return valsCopy[i];
956 }
957 
958 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
959 LocalOrdinal StridedRowView<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
960 ind(size_t i) const
961 {
962  #ifdef HAVE_IFPACK2_DEBUG
963  TEUCHOS_TEST_FOR_EXCEPTION(i >= nnz, std::runtime_error,
964  "Out-of-bounds access into Ifpack2::Container::StridedRowView");
965  #endif
966  //inds is smaller than vals by a factor of the block size (dofs/node)
967  if(inds.size() > 0)
968  {
969  if(blockSize == 1)
970  return inds[i];
971  else
972  return inds[i / blockSize] * blockSize + i % blockSize;
973  }
974  else
975  return indsCopy[i];
976 }
977 
978 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
979 size_t StridedRowView<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
980 size() const
981 {
982  return nnz;
983 }
984 }
985 
986 }
987 
988 template <class MatrixType>
989 std::ostream& operator<<(std::ostream& os, const Ifpack2::Container<MatrixType>& obj)
990 {
991  return obj.print(os);
992 }
993 
994 #define IFPACK2_CONTAINER_INSTANT(S,LO,GO,N) \
995  template class Ifpack2::Container<Tpetra::RowMatrix<S, LO, GO, N>>; \
996  template class Ifpack2::ContainerImpl<Tpetra::RowMatrix<S, LO, GO, N>, S>; \
997  template class Ifpack2::Details::StridedRowView<S, LO, GO, N>; \
998  template std::ostream& operator<< <Tpetra::RowMatrix<S, LO, GO, N>>( \
999  std::ostream& os, const Ifpack2::Container<Tpetra::RowMatrix<S, LO, GO, N>>& obj);
1000 
1001 #endif
1002 
virtual void apply(ConstHostView X, HostView Y, int blockIndex, Teuchos::ETransp mode=Teuchos::NO_TRANS, SC alpha=Teuchos::ScalarTraits< SC >::one(), SC beta=Teuchos::ScalarTraits< SC >::zero()) const
Compute Y := alpha * M^{-1} X + beta*Y.
Definition: Ifpack2_Container_def.hpp:595
void weightedApplyMV(const mv_type &X, mv_type &Y, vector_type &W) const
Wrapper for weightedApply with MVs, used in unit tests (never called by BlockRelaxation) ...
Definition: Ifpack2_Container_def.hpp:520
bool isComputed() const
Whether the container has been successfully computed.
Definition: Ifpack2_Container_def.hpp:137
void DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid, SC dampingFactor, LO i) const
Do one step of Gauss-Seidel on block i (used by DoGaussSeidel and DoSGS)
Definition: Ifpack2_Container_def.hpp:253
void setBlockSizes(const Teuchos::Array< Teuchos::Array< LO > > &partitions)
Initialize arrays with information about block sizes.
Definition: Ifpack2_Container_def.hpp:92
Teuchos::Array< LO > blockSizes_
Number of rows in each block.
Definition: Ifpack2_Container_decl.hpp:263
typename Kokkos::ArithTraits< SC >::val_type ISC
Internal representation of Scalar in Kokkos::View.
Definition: Ifpack2_Container_decl.hpp:102
StridedRowView(h_vals_type vals_, h_inds_type inds_, int blockSize_, size_t nnz_)
Constructor for row views (preferred)
Definition: Ifpack2_Container_def.hpp:926
virtual void applyMV(const mv_type &X, mv_type &Y) const
Wrapper for apply with MultiVector.
Definition: Ifpack2_Container_def.hpp:143
bool pointIndexed_
(If hasBlockCrs_) Whether the blocks are described using sub-block row indices instead of full block ...
Definition: Ifpack2_Container_decl.hpp:281
int numBlocks_
The number of blocks (partitions) in the container.
Definition: Ifpack2_Container_decl.hpp:259
void swap(Array &x)
The implementation of the numerical features of Container (Jacobi, Gauss-Seidel, SGS). This class allows a custom scalar type (LocalScalarType) to be used for storing blocks and solving the block systems. Hiding this template parameter from the Container interface simplifies the BlockRelaxation and ContainerFactory classes.
Definition: Ifpack2_Container_decl.hpp:310
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
virtual void setParameters(const Teuchos::ParameterList &List)
Set parameters, if any.
Definition: Ifpack2_Container_def.hpp:497
bool isInitialized() const
Whether the container has been successfully initialized.
Definition: Ifpack2_Container_def.hpp:132
LO scalarsPerRow_
Definition: Ifpack2_Container_decl.hpp:284
virtual ~Container()
Destructor.
Definition: Ifpack2_Container_def.hpp:81
bool hasBlockCrs_
Whether the input matrix is a BlockCRS matrix.
Definition: Ifpack2_Container_decl.hpp:277
Details::StridedRowView< SC, LO, GO, NO > getInputRowView(LO row) const
View a row of the input matrix.
Definition: Ifpack2_Container_def.hpp:865
LocalScalarType LSC
The internal representation of LocalScalarType in Kokkos::View.
Definition: Ifpack2_Container_decl.hpp:330
Declaration and definition of the Ifpack2::Details::MultiVectorLocalGatherScatter class...
Container(const Teuchos::RCP< const row_matrix_type > &matrix, const Teuchos::Array< Teuchos::Array< LO > > &partitions, bool pointIndexed)
Constructor.
Definition: Ifpack2_Container_def.hpp:21
int bcrsBlockSize_
If hasBlockCrs_, the number of DOFs per vertex. Otherwise 1.
Definition: Ifpack2_Container_decl.hpp:279
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
LO translateRowToCol(LO row)
Definition: Ifpack2_Container_def.hpp:552
Teuchos::RCP< const block_crs_matrix_type > inputBlockMatrix_
The input matrix, dynamic cast to BlockCrsMatrix. May be null.
Definition: Ifpack2_Container_decl.hpp:256
Teuchos::RCP< const row_matrix_type > inputMatrix_
The input matrix to the constructor.
Definition: Ifpack2_Container_decl.hpp:250
virtual void weightedApplyMV(const mv_type &X, mv_type &Y, vector_type &W) const
Wrapper for weightedApply with MultiVector.
Definition: Ifpack2_Container_def.hpp:150
void applyMV(const mv_type &X, mv_type &Y) const
Wrapper for apply with MVs, used in unit tests (never called by BlockRelaxation)
Definition: Ifpack2_Container_def.hpp:511
virtual void applyInverseJacobi(const mv_type &, mv_type &, SC dampingFactor, bool, int) const
Compute Y := (1 - a) Y + a D^{-1} (X - R*Y).
Definition: Ifpack2_Container_def.hpp:501
typename mv_type::dual_view_type::t_host HostView
Definition: Ifpack2_Container_decl.hpp:106
virtual void weightedApply(ConstHostView X, HostView Y, ConstHostView D, int blockIndex, Teuchos::ETransp mode=Teuchos::NO_TRANS, SC alpha=Teuchos::ScalarTraits< SC >::one(), SC beta=Teuchos::ScalarTraits< SC >::zero()) const
Compute Y := alpha * diag(D) * M^{-1} (diag(D) * X) + beta*Y.
Definition: Ifpack2_Container_def.hpp:706
void resize(size_type new_size, const value_type &x=value_type())
virtual ~ContainerImpl()
Destructor.
Definition: Ifpack2_Container_def.hpp:493
bool IsParallel_
Whether the problem is distributed across multiple MPI processes.
Definition: Ifpack2_Container_decl.hpp:269
static std::string getName()
Definition: Ifpack2_Container_def.hpp:532
TypeTo as(const TypeFrom &t)
Interface for creating and solving a set of local linear problems.
Definition: Ifpack2_Container_decl.hpp:79
Teuchos::ArrayView< const LO > getBlockRows(int blockIndex) const
Local indices of the rows of the input matrix that belong to this block.
Definition: Ifpack2_Container_def.hpp:85
Structure for read-only views of general matrix rows.
Definition: Ifpack2_Container_decl.hpp:297
static std::string getName()
Definition: Ifpack2_Container_def.hpp:157
GO NumGlobalRows_
Number of global rows in input matrix.
Definition: Ifpack2_Container_decl.hpp:273
GO NumGlobalNonzeros_
Number of nonzeros in input matrix.
Definition: Ifpack2_Container_decl.hpp:275
LO NumLocalRows_
Number of local rows in input matrix.
Definition: Ifpack2_Container_decl.hpp:271
Implementation detail of Ifpack2::Container subclasses.
Definition: Ifpack2_Details_MultiVectorLocalGatherScatter.hpp:52
virtual void solveBlock(ConstHostSubviewLocal X, HostSubviewLocal Y, int blockIndex, Teuchos::ETransp mode, const LSC alpha, const LSC beta) const
Definition: Ifpack2_Container_def.hpp:539
std::string toString(const T &t)
virtual void DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid, SC dampingFactor, LO i) const
Do one step of Gauss-Seidel on block i (used by DoGaussSeidel and DoSGS)
Definition: Ifpack2_Container_def.hpp:163
bool is_null() const