Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_Relaxation_def.hpp
1 /*@HEADER
2 // ***********************************************************************
3 //
4 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ***********************************************************************
38 //@HEADER
39 */
40 
41 #ifndef IFPACK2_RELAXATION_DEF_HPP
42 #define IFPACK2_RELAXATION_DEF_HPP
43 
44 #include "Teuchos_StandardParameterEntryValidators.hpp"
45 #include "Teuchos_TimeMonitor.hpp"
46 #include "Tpetra_CrsMatrix.hpp"
47 #include "Tpetra_BlockCrsMatrix.hpp"
48 #include "Tpetra_BlockView.hpp"
49 #include "Ifpack2_Utilities.hpp"
50 #include "MatrixMarket_Tpetra.hpp"
51 #include "Tpetra_transform_MultiVector.hpp"
52 #include "Tpetra_withLocalAccess_MultiVector.hpp"
53 #include "Tpetra_Details_residual.hpp"
54 #include <cstdlib>
55 #include <memory>
56 #include <sstream>
57 #include "KokkosSparse_gauss_seidel.hpp"
58 
59 namespace {
60  // Validate that a given int is nonnegative.
61  class NonnegativeIntValidator : public Teuchos::ParameterEntryValidator {
62  public:
63  // Constructor (does nothing).
64  NonnegativeIntValidator () {}
65 
66  // ParameterEntryValidator wants this method.
68  return Teuchos::null;
69  }
70 
71  // Actually validate the parameter's value.
72  void
73  validate (const Teuchos::ParameterEntry& entry,
74  const std::string& paramName,
75  const std::string& sublistName) const
76  {
77  using std::endl;
78  Teuchos::any anyVal = entry.getAny (true);
79  const std::string entryName = entry.getAny (false).typeName ();
80 
82  anyVal.type () != typeid (int),
84  "Parameter \"" << paramName << "\" in sublist \"" << sublistName
85  << "\" has the wrong type." << endl << "Parameter: " << paramName
86  << endl << "Type specified: " << entryName << endl
87  << "Type required: int" << endl);
88 
89  const int val = Teuchos::any_cast<int> (anyVal);
92  "Parameter \"" << paramName << "\" in sublist \"" << sublistName
93  << "\" is negative." << endl << "Parameter: " << paramName
94  << endl << "Value specified: " << val << endl
95  << "Required range: [0, INT_MAX]" << endl);
96  }
97 
98  // ParameterEntryValidator wants this method.
99  const std::string getXMLTypeName () const {
100  return "NonnegativeIntValidator";
101  }
102 
103  // ParameterEntryValidator wants this method.
104  void
105  printDoc (const std::string& docString,
106  std::ostream &out) const
107  {
108  Teuchos::StrUtils::printLines (out, "# ", docString);
109  out << "#\tValidator Used: " << std::endl;
110  out << "#\t\tNonnegativeIntValidator" << std::endl;
111  }
112  };
113 
114  // A way to get a small positive number (eps() for floating-point
115  // types, or 1 for integer types) when Teuchos::ScalarTraits doesn't
116  // define it (for example, for integer values).
117  template<class Scalar, const bool isOrdinal=Teuchos::ScalarTraits<Scalar>::isOrdinal>
118  class SmallTraits {
119  public:
120  // Return eps if Scalar is a floating-point type, else return 1.
121  static const Scalar eps ();
122  };
123 
124  // Partial specialization for when Scalar is not a floating-point type.
125  template<class Scalar>
126  class SmallTraits<Scalar, true> {
127  public:
128  static const Scalar eps () {
130  }
131  };
132 
133  // Partial specialization for when Scalar is a floating-point type.
134  template<class Scalar>
135  class SmallTraits<Scalar, false> {
136  public:
137  static const Scalar eps () {
139  }
140  };
141 
142  // Work-around for GitHub Issue #5269.
143  template<class Scalar,
144  const bool isComplex = Teuchos::ScalarTraits<Scalar>::isComplex>
145  struct RealTraits {};
146 
147  template<class Scalar>
148  struct RealTraits<Scalar, false> {
149  using val_type = Scalar;
150  using mag_type = Scalar;
151  static KOKKOS_INLINE_FUNCTION mag_type real (const val_type& z) {
152  return z;
153  }
154  };
155 
156  template<class Scalar>
157  struct RealTraits<Scalar, true> {
158  using val_type = Scalar;
159  using mag_type = typename Teuchos::ScalarTraits<Scalar>::magnitudeType;
160  static KOKKOS_INLINE_FUNCTION mag_type real (const val_type& z) {
161  return Kokkos::ArithTraits<val_type>::real (z);
162  }
163  };
164 
165  template<class Scalar>
166  KOKKOS_INLINE_FUNCTION typename RealTraits<Scalar>::mag_type
167  getRealValue (const Scalar& z) {
168  return RealTraits<Scalar>::real (z);
169  }
170 
171 } // namespace (anonymous)
172 
173 namespace Ifpack2 {
174 
175 template<class MatrixType>
176 void
177 Relaxation<MatrixType>::
178 updateCachedMultiVector (const Teuchos::RCP<const Tpetra::Map<local_ordinal_type, global_ordinal_type, node_type>>& map,
179  size_t numVecs) const
180 {
181  // Allocate a multivector if the cached one isn't perfect. Checking
182  // for map pointer equality is much cheaper than Map::isSameAs.
183  if (cachedMV_.is_null () ||
184  map.get () != cachedMV_->getMap ().get () ||
185  cachedMV_->getNumVectors () != numVecs) {
186  using MV = Tpetra::MultiVector<scalar_type, local_ordinal_type,
187  global_ordinal_type, node_type>;
188  cachedMV_ = Teuchos::rcp (new MV (map, numVecs, false));
189  }
190 }
191 
192 template<class MatrixType>
195 {
196  if (A.getRawPtr() != A_.getRawPtr()) { // it's a different matrix
197  Importer_ = Teuchos::null;
198  pointImporter_ = Teuchos::null;
199  Diagonal_ = Teuchos::null; // ??? what if this comes from the user???
200  isInitialized_ = false;
201  IsComputed_ = false;
202  using device_type = typename node_type::device_type;
203  diagOffsets_ = Kokkos::View<size_t*, device_type>();
204  savedDiagOffsets_ = false;
205  hasBlockCrsMatrix_ = false;
206  if (! A.is_null ()) {
207  IsParallel_ = (A->getRowMap ()->getComm ()->getSize () > 1);
208  }
209  A_ = A;
210  }
211 }
212 
213 template<class MatrixType>
216 : A_ (A),
217  IsParallel_ ((A.is_null () || A->getRowMap ().is_null () || A->getRowMap ()->getComm ().is_null ()) ?
218  false : // a reasonable default if there's no communicator
219  A->getRowMap ()->getComm ()->getSize () > 1)
220 {
221  this->setObjectLabel ("Ifpack2::Relaxation");
222 }
223 
224 
225 template<class MatrixType>
228 {
229  using Teuchos::Array;
231  using Teuchos::parameterList;
232  using Teuchos::RCP;
233  using Teuchos::rcp;
234  using Teuchos::rcp_const_cast;
235  using Teuchos::rcp_implicit_cast;
236  using Teuchos::setStringToIntegralParameter;
238 
239  if (validParams_.is_null ()) {
240  RCP<ParameterList> pl = parameterList ("Ifpack2::Relaxation");
241 
242  // Set a validator that automatically converts from the valid
243  // string options to their enum values.
244  Array<std::string> precTypes (8);
245  precTypes[0] = "Jacobi";
246  precTypes[1] = "Gauss-Seidel";
247  precTypes[2] = "Symmetric Gauss-Seidel";
248  precTypes[3] = "MT Gauss-Seidel";
249  precTypes[4] = "MT Symmetric Gauss-Seidel";
250  precTypes[5] = "Richardson";
251  precTypes[6] = "Two-stage Gauss-Seidel";
252  precTypes[7] = "Two-stage Symmetric Gauss-Seidel";
253  Array<Details::RelaxationType> precTypeEnums (8);
254  precTypeEnums[0] = Details::JACOBI;
255  precTypeEnums[1] = Details::GS;
256  precTypeEnums[2] = Details::SGS;
257  precTypeEnums[3] = Details::MTGS;
258  precTypeEnums[4] = Details::MTSGS;
259  precTypeEnums[5] = Details::RICHARDSON;
260  precTypeEnums[6] = Details::GS2;
261  precTypeEnums[7] = Details::SGS2;
262  const std::string defaultPrecType ("Jacobi");
263  setStringToIntegralParameter<Details::RelaxationType> ("relaxation: type",
264  defaultPrecType, "Relaxation method", precTypes (), precTypeEnums (),
265  pl.getRawPtr ());
266 
267  const int numSweeps = 1;
268  RCP<PEV> numSweepsValidator =
269  rcp_implicit_cast<PEV> (rcp (new NonnegativeIntValidator));
270  pl->set ("relaxation: sweeps", numSweeps, "Number of relaxation sweeps",
271  rcp_const_cast<const PEV> (numSweepsValidator));
272 
273  // number of inner sweeps for two-stage GS
274  const int numInnerSweeps = 1;
275  RCP<PEV> numInnerSweepsValidator =
276  rcp_implicit_cast<PEV> (rcp (new NonnegativeIntValidator));
277  pl->set ("relaxation: inner sweeps", numInnerSweeps, "Number of inner relaxation sweeps",
278  rcp_const_cast<const PEV> (numInnerSweepsValidator));
279  // specify if using sptrsv instead of inner-iterations
280  const bool innerSpTrsv = false;
281  pl->set ("relaxation: inner sparse-triangular solve", innerSpTrsv);
282 
283  const scalar_type dampingFactor = STS::one ();
284  pl->set ("relaxation: damping factor", dampingFactor);
285 
286  const bool zeroStartingSolution = true;
287  pl->set ("relaxation: zero starting solution", zeroStartingSolution);
288 
289  const bool doBackwardGS = false;
290  pl->set ("relaxation: backward mode", doBackwardGS);
291 
292  const bool doL1Method = false;
293  pl->set ("relaxation: use l1", doL1Method);
294 
295  const magnitude_type l1eta = (STM::one() + STM::one() + STM::one()) /
296  (STM::one() + STM::one()); // 1.5
297  pl->set ("relaxation: l1 eta", l1eta);
298 
299  const scalar_type minDiagonalValue = STS::zero ();
300  pl->set ("relaxation: min diagonal value", minDiagonalValue);
301 
302  const bool fixTinyDiagEntries = false;
303  pl->set ("relaxation: fix tiny diagonal entries", fixTinyDiagEntries);
304 
305  const bool checkDiagEntries = false;
306  pl->set ("relaxation: check diagonal entries", checkDiagEntries);
307 
308  Teuchos::ArrayRCP<local_ordinal_type> localSmoothingIndices = Teuchos::null;
309  pl->set("relaxation: local smoothing indices", localSmoothingIndices);
310 
311  const bool is_matrix_structurally_symmetric = false;
312  pl->set("relaxation: symmetric matrix structure", is_matrix_structurally_symmetric);
313 
314  const bool ifpack2_dump_matrix = false;
315  pl->set("relaxation: ifpack2 dump matrix", ifpack2_dump_matrix);
316 
317  const int cluster_size = 1;
318  pl->set("relaxation: mtgs cluster size", cluster_size);
319 
320  validParams_ = rcp_const_cast<const ParameterList> (pl);
321  }
322  return validParams_;
323 }
324 
325 
326 template<class MatrixType>
328 {
329  using Teuchos::getIntegralValue;
331  using Teuchos::RCP;
332  typedef scalar_type ST; // just to make code below shorter
333 
334  if (pl.isType<double>("relaxation: damping factor")) {
335  // Make sure that ST=complex can run with a damping factor that is
336  // a double.
337  ST df = pl.get<double>("relaxation: damping factor");
338  pl.remove("relaxation: damping factor");
339  pl.set("relaxation: damping factor",df);
340  }
341 
343 
344  const Details::RelaxationType precType =
345  getIntegralValue<Details::RelaxationType> (pl, "relaxation: type");
346  const int numSweeps = pl.get<int> ("relaxation: sweeps");
347  const int numInnerSweeps = pl.get<int> ("relaxation: inner sweeps");
348  const bool innerSpTrsv = pl.get<bool> ("relaxation: inner sparse-triangular solve");
349  const ST dampingFactor = pl.get<ST> ("relaxation: damping factor");
350  const bool zeroStartSol = pl.get<bool> ("relaxation: zero starting solution");
351  const bool doBackwardGS = pl.get<bool> ("relaxation: backward mode");
352  const bool doL1Method = pl.get<bool> ("relaxation: use l1");
353  const magnitude_type l1Eta = pl.get<magnitude_type> ("relaxation: l1 eta");
354  const ST minDiagonalValue = pl.get<ST> ("relaxation: min diagonal value");
355  const bool fixTinyDiagEntries = pl.get<bool> ("relaxation: fix tiny diagonal entries");
356  const bool checkDiagEntries = pl.get<bool> ("relaxation: check diagonal entries");
357  const bool is_matrix_structurally_symmetric = pl.get<bool> ("relaxation: symmetric matrix structure");
358  const bool ifpack2_dump_matrix = pl.get<bool> ("relaxation: ifpack2 dump matrix");
359  int cluster_size = 1;
360  if(pl.isParameter ("relaxation: mtgs cluster size")) //optional parameter
361  cluster_size = pl.get<int> ("relaxation: mtgs cluster size");
362 
363  Teuchos::ArrayRCP<local_ordinal_type> localSmoothingIndices = pl.get<Teuchos::ArrayRCP<local_ordinal_type> >("relaxation: local smoothing indices");
364 
365 
366  // "Commit" the changes, now that we've validated everything.
367  PrecType_ = precType;
368  NumSweeps_ = numSweeps;
369  NumInnerSweeps_ = numInnerSweeps;
370  InnerSpTrsv_ = innerSpTrsv;
371  DampingFactor_ = dampingFactor;
372  ZeroStartingSolution_ = zeroStartSol;
373  DoBackwardGS_ = doBackwardGS;
374  DoL1Method_ = doL1Method;
375  L1Eta_ = l1Eta;
376  MinDiagonalValue_ = minDiagonalValue;
377  fixTinyDiagEntries_ = fixTinyDiagEntries;
378  checkDiagEntries_ = checkDiagEntries;
379  clusterSize_ = cluster_size;
380  is_matrix_structurally_symmetric_ = is_matrix_structurally_symmetric;
381  ifpack2_dump_matrix_ = ifpack2_dump_matrix;
382  localSmoothingIndices_ = localSmoothingIndices;
383 }
384 
385 
386 template<class MatrixType>
388 {
389  // FIXME (aprokop 18 Oct 2013) Casting away const is bad here.
390  // but otherwise, we will get [unused] in pl
391  this->setParametersImpl(const_cast<Teuchos::ParameterList&>(pl));
392 }
393 
394 
395 template<class MatrixType>
399  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::getComm: "
400  "The input matrix A is null. Please call setMatrix() with a nonnull "
401  "input matrix before calling this method.");
402  return A_->getRowMap ()->getComm ();
403 }
404 
405 
406 template<class MatrixType>
409  return A_;
410 }
411 
412 
413 template<class MatrixType>
414 Teuchos::RCP<const Tpetra::Map<typename MatrixType::local_ordinal_type,
415  typename MatrixType::global_ordinal_type,
416  typename MatrixType::node_type> >
419  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::getDomainMap: "
420  "The input matrix A is null. Please call setMatrix() with a nonnull "
421  "input matrix before calling this method.");
422  return A_->getDomainMap ();
423 }
424 
425 
426 template<class MatrixType>
427 Teuchos::RCP<const Tpetra::Map<typename MatrixType::local_ordinal_type,
428  typename MatrixType::global_ordinal_type,
429  typename MatrixType::node_type> >
432  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::getRangeMap: "
433  "The input matrix A is null. Please call setMatrix() with a nonnull "
434  "input matrix before calling this method.");
435  return A_->getRangeMap ();
436 }
437 
438 
439 template<class MatrixType>
441  return true;
442 }
443 
444 
445 template<class MatrixType>
447  return(NumInitialize_);
448 }
449 
450 
451 template<class MatrixType>
453  return(NumCompute_);
454 }
455 
456 
457 template<class MatrixType>
459  return(NumApply_);
460 }
461 
462 
463 template<class MatrixType>
465  return(InitializeTime_);
466 }
467 
468 
469 template<class MatrixType>
471  return(ComputeTime_);
472 }
473 
474 
475 template<class MatrixType>
477  return(ApplyTime_);
478 }
479 
480 
481 template<class MatrixType>
483  return(ComputeFlops_);
484 }
485 
486 
487 template<class MatrixType>
489  return(ApplyFlops_);
490 }
491 
492 
493 
494 template<class MatrixType>
497  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::getNodeSmootherComplexity: "
498  "The input matrix A is null. Please call setMatrix() with a nonnull "
499  "input matrix, then call compute(), before calling this method.");
500  // Relaxation methods cost roughly one apply + one diagonal inverse per iteration
501  return A_->getNodeNumRows() + A_->getNodeNumEntries();
502 }
503 
504 
505 template<class MatrixType>
506 void
508 apply (const Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& X,
509  Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& Y,
510  Teuchos::ETransp /* mode */,
511  scalar_type alpha,
512  scalar_type beta) const
513 {
514  using Teuchos::as;
515  using Teuchos::RCP;
516  using Teuchos::rcp;
517  using Teuchos::rcpFromRef;
518  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type,
521  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::apply: "
522  "The input matrix A is null. Please call setMatrix() with a nonnull "
523  "input matrix, then call compute(), before calling this method.");
525  ! isComputed (),
526  std::runtime_error,
527  "Ifpack2::Relaxation::apply: You must call compute() on this Ifpack2 "
528  "preconditioner instance before you may call apply(). You may call "
529  "isComputed() to find out if compute() has been called already.");
530  TEUCHOS_TEST_FOR_EXCEPTION(
531  X.getNumVectors() != Y.getNumVectors(),
532  std::runtime_error,
533  "Ifpack2::Relaxation::apply: X and Y have different numbers of columns. "
534  "X has " << X.getNumVectors() << " columns, but Y has "
535  << Y.getNumVectors() << " columns.");
536  TEUCHOS_TEST_FOR_EXCEPTION(
537  beta != STS::zero (), std::logic_error,
538  "Ifpack2::Relaxation::apply: beta = " << beta << " != 0 case not "
539  "implemented.");
540 
541  const std::string timerName ("Ifpack2::Relaxation::apply");
543  if (timer.is_null ()) {
544  timer = Teuchos::TimeMonitor::getNewCounter (timerName);
545  }
546 
547  {
548  Teuchos::TimeMonitor timeMon (*timer);
549  // Special case: alpha == 0.
550  if (alpha == STS::zero ()) {
551  // No floating-point operations, so no need to update a count.
552  Y.putScalar (STS::zero ());
553  }
554  else {
555  // If X and Y alias one another, then we need to create an
556  // auxiliary vector, Xcopy (a deep copy of X).
557  RCP<const MV> Xcopy;
558  // FIXME (mfh 12 Sep 2014) This test for aliasing is incomplete.
559  {
560  auto X_lcl_host = X.getLocalViewHost ();
561  auto Y_lcl_host = Y.getLocalViewHost ();
562 
563  if (X_lcl_host.data () == Y_lcl_host.data ()) {
564  Xcopy = rcp (new MV (X, Teuchos::Copy));
565  } else {
566  Xcopy = rcpFromRef (X);
567  }
568  }
569 
570  // Each of the following methods updates the flop count itself.
571  // All implementations handle zeroing out the starting solution
572  // (if necessary) themselves.
573  switch (PrecType_) {
574  case Ifpack2::Details::JACOBI:
575  ApplyInverseJacobi(*Xcopy,Y);
576  break;
577  case Ifpack2::Details::GS:
578  ApplyInverseGS(*Xcopy,Y);
579  break;
580  case Ifpack2::Details::SGS:
581  ApplyInverseSGS(*Xcopy,Y);
582  break;
583  case Ifpack2::Details::MTSGS:
584  case Ifpack2::Details::SGS2:
585  ApplyInverseMTSGS_CrsMatrix(*Xcopy,Y);
586  break;
587  case Ifpack2::Details::MTGS:
588  case Ifpack2::Details::GS2:
589  ApplyInverseMTGS_CrsMatrix(*Xcopy,Y);
590  break;
591  case Ifpack2::Details::RICHARDSON:
592  ApplyInverseRichardson(*Xcopy,Y);
593  break;
594 
595  default:
596  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
597  "Ifpack2::Relaxation::apply: Invalid preconditioner type enum value "
598  << PrecType_ << ". Please report this bug to the Ifpack2 developers.");
599  }
600  if (alpha != STS::one ()) {
601  Y.scale (alpha);
602  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
603  const double numVectors = as<double> (Y.getNumVectors ());
604  ApplyFlops_ += numGlobalRows * numVectors;
605  }
606  }
607  }
608  ApplyTime_ += timer->totalElapsedTime ();
609  ++NumApply_;
610 }
611 
612 
613 template<class MatrixType>
614 void
616 applyMat (const Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& X,
617  Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& Y,
618  Teuchos::ETransp mode) const
619 {
621  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::applyMat: "
622  "The input matrix A is null. Please call setMatrix() with a nonnull "
623  "input matrix, then call compute(), before calling this method.");
625  ! isComputed (), std::runtime_error, "Ifpack2::Relaxation::applyMat: "
626  "isComputed() must be true before you may call applyMat(). "
627  "Please call compute() before calling this method.");
628  TEUCHOS_TEST_FOR_EXCEPTION(
629  X.getNumVectors () != Y.getNumVectors (), std::invalid_argument,
630  "Ifpack2::Relaxation::applyMat: X.getNumVectors() = " << X.getNumVectors ()
631  << " != Y.getNumVectors() = " << Y.getNumVectors () << ".");
632  A_->apply (X, Y, mode);
633 }
634 
635 
636 template<class MatrixType>
638 {
639  const char methodName[] = "Ifpack2::Relaxation::initialize";
640 
642  (A_.is_null (), std::runtime_error, methodName << ": The "
643  "input matrix A is null. Please call setMatrix() with "
644  "a nonnull input matrix before calling this method.");
645 
648 
649  { // Timing of initialize starts here
650  Teuchos::TimeMonitor timeMon (*timer);
651  isInitialized_ = false;
652 
653  {
654  auto rowMap = A_->getRowMap ();
655  auto comm = rowMap.is_null () ? Teuchos::null : rowMap->getComm ();
656  IsParallel_ = ! comm.is_null () && comm->getSize () > 1;
657  }
658 
659  // mfh 21 Mar 2013, 07 May 2019: The Import object may be null,
660  // but in that case, the domain and column Maps are the same and
661  // we don't need to Import anyway.
662  //
663  // mfh 07 May 2019: A_->getGraph() might be an
664  // OverlappingRowGraph, which doesn't have an Import object.
665  // However, in that case, the comm should have size 1.
666  Importer_ = IsParallel_ ? A_->getGraph ()->getImporter () :
667  Teuchos::null;
668 
669  {
671  Teuchos::rcp_dynamic_cast<const block_crs_matrix_type> (A_);
672  hasBlockCrsMatrix_ = ! A_bcrs.is_null ();
673  }
674 
675  if (PrecType_ == Details::MTGS || PrecType_ == Details::MTSGS ||
676  PrecType_ == Details::GS2 || PrecType_ == Details::SGS2) {
677  const crs_matrix_type* crsMat =
678  dynamic_cast<const crs_matrix_type*> (A_.get());
680  (crsMat == nullptr, std::logic_error, methodName << ": "
681  "Multithreaded Gauss-Seidel methods currently only work "
682  "when the input matrix is a Tpetra::CrsMatrix.");
683 
684  // FIXME (mfh 27 May 2019) Dumping the matrix belongs in
685  // compute, not initialize, since users may change the matrix's
686  // values at any time before calling compute.
687  if (ifpack2_dump_matrix_) {
688  static int sequence_number = 0;
689  const std::string file_name = "Ifpack2_MT_GS_" +
690  std::to_string (sequence_number++) + ".mtx";
692  Teuchos::rcp_dynamic_cast<const crs_matrix_type> (A_);
693  if (! rcp_crs_mat.is_null ()) {
694  using writer_type = Tpetra::MatrixMarket::Writer<crs_matrix_type>;
695  writer_type::writeSparseFile (file_name, rcp_crs_mat);
696  }
697  }
698 
699  this->mtKernelHandle_ = Teuchos::rcp (new mt_kernel_handle_type ());
700  if (mtKernelHandle_->get_gs_handle () == nullptr) {
701  if (PrecType_ == Details::GS2 || PrecType_ == Details::SGS2)
702  mtKernelHandle_->create_gs_handle (KokkosSparse::GS_TWOSTAGE);
703  else if(this->clusterSize_ == 1)
704  mtKernelHandle_->create_gs_handle ();
705  else
706  mtKernelHandle_->create_gs_handle (KokkosSparse::CLUSTER_DEFAULT, this->clusterSize_);
707  }
708  local_matrix_type kcsr = crsMat->getLocalMatrix ();
709  if (PrecType_ == Details::GS2 || PrecType_ == Details::SGS2) {
710  // set parameters for two-stage GS
711  mtKernelHandle_->set_gs_set_num_inner_sweeps (NumInnerSweeps_);
712  mtKernelHandle_->set_gs_twostage (!InnerSpTrsv_, A_->getNodeNumRows ());
713  }
714 
715  using KokkosSparse::Experimental::gauss_seidel_symbolic;
716  gauss_seidel_symbolic<mt_kernel_handle_type,
717  lno_row_view_t,
718  lno_nonzero_view_t> (mtKernelHandle_.getRawPtr (),
719  A_->getNodeNumRows (),
720  A_->getNodeNumCols (),
721  kcsr.graph.row_map,
722  kcsr.graph.entries,
723  is_matrix_structurally_symmetric_);
724  }
725  } // timing of initialize stops here
726 
727  InitializeTime_ += timer->totalElapsedTime ();
728  ++NumInitialize_;
729  isInitialized_ = true;
730 }
731 
732 namespace Impl {
733 template <typename BlockDiagView>
734 struct InvertDiagBlocks {
735  typedef int value_type;
736  typedef typename BlockDiagView::size_type Size;
737 
738 private:
739  typedef Kokkos::MemoryTraits<Kokkos::Unmanaged> Unmanaged;
740  template <typename View>
741  using UnmanagedView = Kokkos::View<typename View::data_type, typename View::array_layout,
742  typename View::device_type, Unmanaged>;
743 
744  typedef typename BlockDiagView::non_const_value_type Scalar;
745  typedef typename BlockDiagView::device_type Device;
746  typedef Kokkos::View<Scalar**, Kokkos::LayoutRight, Device> RWrk;
747  typedef Kokkos::View<int**, Kokkos::LayoutRight, Device> IWrk;
748 
749  UnmanagedView<BlockDiagView> block_diag_;
750  // TODO Use thread team and scratch memory space. In this first
751  // pass, provide workspace for each block.
752  RWrk rwrk_buf_;
753  UnmanagedView<RWrk> rwrk_;
754  IWrk iwrk_buf_;
755  UnmanagedView<IWrk> iwrk_;
756 
757 public:
758  InvertDiagBlocks (BlockDiagView& block_diag)
759  : block_diag_(block_diag)
760  {
761  const auto blksz = block_diag.extent(1);
762  Kokkos::resize(rwrk_buf_, block_diag_.extent(0), blksz);
763  rwrk_ = rwrk_buf_;
764  Kokkos::resize(iwrk_buf_, block_diag_.extent(0), blksz);
765  iwrk_ = iwrk_buf_;
766  }
767 
768  KOKKOS_INLINE_FUNCTION
769  void operator() (const Size i, int& jinfo) const {
770  auto D_cur = Kokkos::subview(block_diag_, i, Kokkos::ALL(), Kokkos::ALL());
771  auto ipiv = Kokkos::subview(iwrk_, i, Kokkos::ALL());
772  auto work = Kokkos::subview(rwrk_, i, Kokkos::ALL());
773  int info = 0;
774  Tpetra::GETF2(D_cur, ipiv, info);
775  if (info) {
776  ++jinfo;
777  return;
778  }
779  Tpetra::GETRI(D_cur, ipiv, work, info);
780  if (info) ++jinfo;
781  }
782 
783  // Report the number of blocks with errors.
784  KOKKOS_INLINE_FUNCTION
785  void join (volatile value_type& dst, volatile value_type const& src) const { dst += src; }
786 };
787 }
788 
789 template<class MatrixType>
790 void Relaxation<MatrixType>::computeBlockCrs ()
791 {
792  using Kokkos::ALL;
793  using Teuchos::Array;
794  using Teuchos::ArrayRCP;
795  using Teuchos::ArrayView;
796  using Teuchos::as;
797  using Teuchos::Comm;
798  using Teuchos::RCP;
799  using Teuchos::rcp;
800  using Teuchos::REDUCE_MAX;
801  using Teuchos::REDUCE_MIN;
802  using Teuchos::REDUCE_SUM;
803  using Teuchos::rcp_dynamic_cast;
804  using Teuchos::reduceAll;
805  typedef local_ordinal_type LO;
806  typedef typename node_type::device_type device_type;
807 
808  const std::string timerName ("Ifpack2::Relaxation::computeBlockCrs");
810  if (timer.is_null ()) {
811  timer = Teuchos::TimeMonitor::getNewCounter (timerName);
812  }
813  {
814  Teuchos::TimeMonitor timeMon (*timer);
815 
817  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::"
818  "computeBlockCrs: The input matrix A is null. Please call setMatrix() "
819  "with a nonnull input matrix, then call initialize(), before calling "
820  "this method.");
821  const block_crs_matrix_type* blockCrsA =
822  dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr ());
824  blockCrsA == nullptr, std::logic_error, "Ifpack2::Relaxation::"
825  "computeBlockCrs: A_ is not a BlockCrsMatrix, but it should be if we "
826  "got this far. Please report this bug to the Ifpack2 developers.");
827 
828  const scalar_type one = STS::one ();
829 
830  // Reset state.
831  IsComputed_ = false;
832 
833  const LO lclNumMeshRows =
834  blockCrsA->getCrsGraph ().getNodeNumRows ();
835  const LO blockSize = blockCrsA->getBlockSize ();
836 
837  if (! savedDiagOffsets_) {
838  blockDiag_ = block_diag_type (); // clear it before reallocating
839  blockDiag_ = block_diag_type ("Ifpack2::Relaxation::blockDiag_",
840  lclNumMeshRows, blockSize, blockSize);
841  if (Teuchos::as<LO>(diagOffsets_.extent (0) ) < lclNumMeshRows) {
842  // Clear diagOffsets_ first (by assigning an empty View to it)
843  // to save memory, before reallocating.
844  diagOffsets_ = Kokkos::View<size_t*, device_type> ();
845  diagOffsets_ = Kokkos::View<size_t*, device_type> ("offsets", lclNumMeshRows);
846  }
847  blockCrsA->getCrsGraph ().getLocalDiagOffsets (diagOffsets_);
849  (static_cast<size_t> (diagOffsets_.extent (0)) !=
850  static_cast<size_t> (blockDiag_.extent (0)),
851  std::logic_error, "diagOffsets_.extent(0) = " <<
852  diagOffsets_.extent (0) << " != blockDiag_.extent(0) = "
853  << blockDiag_.extent (0) <<
854  ". Please report this bug to the Ifpack2 developers.");
855  savedDiagOffsets_ = true;
856  }
857  blockCrsA->getLocalDiagCopy (blockDiag_, diagOffsets_);
858 
859  // Use an unmanaged View in this method, so that when we take
860  // subviews of it (to get each diagonal block), we don't have to
861  // touch the reference count. Reference count updates are a
862  // thread scalability bottleneck and have a performance cost even
863  // without using threads.
864  unmanaged_block_diag_type blockDiag = blockDiag_;
865 
866  if (DoL1Method_ && IsParallel_) {
867  const scalar_type two = one + one;
868  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
869  Array<LO> indices (maxLength);
870  Array<scalar_type> values (maxLength * blockSize * blockSize);
871  size_t numEntries = 0;
872 
873  for (LO i = 0; i < lclNumMeshRows; ++i) {
874  // FIXME (mfh 16 Dec 2015) Get views instead of copies.
875  blockCrsA->getLocalRowCopy (i, indices (), values (), numEntries);
876 
877  auto diagBlock = Kokkos::subview (blockDiag, i, ALL (), ALL ());
878  for (LO subRow = 0; subRow < blockSize; ++subRow) {
879  magnitude_type diagonal_boost = STM::zero ();
880  for (size_t k = 0 ; k < numEntries ; ++k) {
881  if (indices[k] > lclNumMeshRows) {
882  const size_t offset = blockSize*blockSize*k + subRow*blockSize;
883  for (LO subCol = 0; subCol < blockSize; ++subCol) {
884  diagonal_boost += STS::magnitude (values[offset+subCol] / two);
885  }
886  }
887  }
888  if (STS::magnitude (diagBlock(subRow, subRow)) < L1Eta_ * diagonal_boost) {
889  diagBlock(subRow, subRow) += diagonal_boost;
890  }
891  }
892  }
893  }
894 
895  int info = 0;
896  {
897  Impl::InvertDiagBlocks<unmanaged_block_diag_type> idb(blockDiag);
898  typedef typename unmanaged_block_diag_type::execution_space exec_space;
899  typedef Kokkos::RangePolicy<exec_space, LO> range_type;
900 
901  Kokkos::parallel_reduce (range_type (0, lclNumMeshRows), idb, info);
902  // FIXME (mfh 19 May 2017) Different processes may not
903  // necessarily have this error all at the same time, so it would
904  // be better just to preserve a local error state and let users
905  // check.
907  (info > 0, std::runtime_error, "GETF2 or GETRI failed on = " << info
908  << " diagonal blocks.");
909  }
910 
911  // In a debug build, do an extra test to make sure that all the
912  // factorizations were computed correctly.
913 #ifdef HAVE_IFPACK2_DEBUG
914  const int numResults = 2;
915  // Use "max = -min" trick to get min and max in a single all-reduce.
916  int lclResults[2], gblResults[2];
917  lclResults[0] = info;
918  lclResults[1] = -info;
919  gblResults[0] = 0;
920  gblResults[1] = 0;
921  reduceAll<int, int> (* (A_->getGraph ()->getComm ()), REDUCE_MIN,
922  numResults, lclResults, gblResults);
924  (gblResults[0] != 0 || gblResults[1] != 0, std::runtime_error,
925  "Ifpack2::Relaxation::compute: When processing the input "
926  "Tpetra::BlockCrsMatrix, one or more diagonal block LU factorizations "
927  "failed on one or more (MPI) processes.");
928 #endif // HAVE_IFPACK2_DEBUG
929  } // end TimeMonitor scope
930 
931  ComputeTime_ += timer->totalElapsedTime ();
932  ++NumCompute_;
933  IsComputed_ = true;
934 }
935 
936 template<class MatrixType>
938 {
939  using Tpetra::readWrite;
940  using Teuchos::Array;
941  using Teuchos::ArrayRCP;
942  using Teuchos::ArrayView;
943  using Teuchos::as;
944  using Teuchos::Comm;
945  using Teuchos::RCP;
946  using Teuchos::rcp;
947  using Teuchos::REDUCE_MAX;
948  using Teuchos::REDUCE_MIN;
949  using Teuchos::REDUCE_SUM;
950  using Teuchos::rcp_dynamic_cast;
951  using Teuchos::reduceAll;
952  using LO = local_ordinal_type;
953  using vector_type = Tpetra::Vector<scalar_type, local_ordinal_type,
955  using device_type = typename vector_type::device_type;
956  using IST = typename vector_type::impl_scalar_type;
957  using KAT = Kokkos::ArithTraits<IST>;
958 
959  const char methodName[] = "Ifpack2::Relaxation::compute";
960  const scalar_type zero = STS::zero ();
961  const scalar_type one = STS::one ();
962 
963  // mfh 27 May 2019: Later on, we should introduce an IFPACK2_DEBUG
964  // environment variable to control this behavior at run time.
965 #ifdef HAVE_IFPACK2_DEBUG
966  constexpr bool debug = true;
967 #else
968  constexpr bool debug = false;
969 #endif // HAVE_IFPACK2_DEBUG
970 
972  (A_.is_null (), std::runtime_error, methodName << ": "
973  "The input matrix A is null. Please call setMatrix() with a nonnull "
974  "input matrix, then call initialize(), before calling this method.");
975 
976  // We don't count initialization in compute() time.
977  if (! isInitialized ()) {
978  initialize ();
979  }
980 
981  if (hasBlockCrsMatrix_) {
982  computeBlockCrs ();
983  return;
984  }
985 
988 
989  { // Timing of compute starts here.
990  Teuchos::TimeMonitor timeMon (*timer);
992  (NumSweeps_ < 0, std::logic_error, methodName
993  << ": NumSweeps_ = " << NumSweeps_ << " < 0. "
994  "Please report this bug to the Ifpack2 developers.");
995  IsComputed_ = false;
996 
997  Diagonal_ = Teuchos::null;
998  // A_->getLocalDiagCopy fills in all Vector entries, even if the
999  // matrix has no stored entries in the corresponding rows.
1000  Diagonal_ = rcp (new vector_type (A_->getRowMap (), false));
1001 
1002  // Extract the diagonal entries. The CrsMatrix static graph
1003  // version is faster for subsequent calls to compute(), since it
1004  // caches the diagonal offsets.
1005  //
1006  // isStaticGraph() == true means that the matrix was created with
1007  // a const graph. The only requirement is that the structure of
1008  // the matrix never changes, so isStaticGraph() == true is a bit
1009  // more conservative than we need. However, Tpetra doesn't (as of
1010  // 05 Apr 2013) have a way to tell if the graph hasn't changed
1011  // since the last time we used it.
1012  {
1013  // NOTE (mfh 07 Jul 2013): We must cast here to CrsMatrix
1014  // instead of MatrixType, because isStaticGraph is a CrsMatrix
1015  // method (not inherited from RowMatrix's interface). It's
1016  // perfectly valid to do relaxation on a RowMatrix which is not
1017  // a CrsMatrix.
1018  const crs_matrix_type* crsMat =
1019  dynamic_cast<const crs_matrix_type*> (A_.getRawPtr ());
1020  if (crsMat == nullptr || ! crsMat->isStaticGraph ()) {
1021  A_->getLocalDiagCopy (*Diagonal_); // slow path
1022  }
1023  else {
1024  if (! savedDiagOffsets_) { // we haven't precomputed offsets
1025  const size_t lclNumRows = A_->getRowMap ()->getNodeNumElements ();
1026  if (diagOffsets_.extent (0) < lclNumRows) {
1027  using Kokkos::view_alloc;
1028  using Kokkos::WithoutInitializing;
1029  using offsets_view_type = Kokkos::View<size_t*, device_type>;
1030 
1031  diagOffsets_ = offsets_view_type (); // clear 1st to save mem
1032  auto howAlloc = view_alloc ("offsets", WithoutInitializing);
1033  diagOffsets_ = offsets_view_type (howAlloc, lclNumRows);
1034  }
1035  crsMat->getCrsGraph ()->getLocalDiagOffsets (diagOffsets_);
1036  savedDiagOffsets_ = true;
1037  }
1038  crsMat->getLocalDiagCopy (*Diagonal_, diagOffsets_);
1039 
1040  if (debug) {
1041  // Validate the fast-path diagonal against the slow-path diagonal.
1042  vector_type D_copy (A_->getRowMap ());
1043  A_->getLocalDiagCopy (D_copy);
1044  D_copy.update (STS::one (), *Diagonal_, -STS::one ());
1045  const magnitude_type err = D_copy.normInf ();
1046  // The two diagonals should be exactly the same, so their
1047  // difference should be exactly zero.
1049  (err != STM::zero(), std::logic_error, methodName << ": "
1050  << "\"fast-path\" diagonal computation failed. "
1051  "\\|D1 - D2\\|_inf = " << err << ".");
1052  }
1053  }
1054  }
1055 
1056  // If we're checking the computed inverse diagonal, then keep a
1057  // copy of the original diagonal entries for later comparison.
1058  std::unique_ptr<vector_type> origDiag;
1059  if (checkDiagEntries_) {
1060  origDiag = std::unique_ptr<vector_type>
1061  (new vector_type (*Diagonal_, Teuchos::Copy));
1062  }
1063 
1064  const LO numMyRows = static_cast<LO> (A_->getNodeNumRows ());
1065 
1066  // Setup for L1 Methods.
1067  // Here we add half the value of the off-processor entries in the row,
1068  // but only if diagonal isn't sufficiently large.
1069  //
1070  // This follows from Equation (6.5) in: Baker, Falgout, Kolev and
1071  // Yang. "Multigrid Smoothers for Ultraparallel Computing." SIAM
1072  // J. Sci. Comput., Vol. 33, No. 5. (2011), pp. 2864-2887.
1073  //
1074  // FIXME (mfh 27 May 2019) Add device-parallel version for when
1075  // *A_ is a CrsMatrix. Furthermore, in that case, we should
1076  // combine diagonal extraction above with L1 modification into a
1077  // single parallel loop.
1078  if (DoL1Method_ && IsParallel_) {
1079  vector_type& gblDiag = *Diagonal_;
1080  using rw_type =
1081  decltype (readWrite (gblDiag).on (Kokkos::HostSpace ()));
1082  // Once we have C++14, we can get rid of this alias and use
1083  // "auto" in the lambda below.
1084  using lcl_vec_type =
1085  Tpetra::with_local_access_function_argument_type<rw_type>;
1086  const row_matrix_type& A_row = *A_;
1087  const magnitude_type L1_eta = L1Eta_;
1088  Tpetra::withLocalAccess
1089  ([&A_row, L1_eta, numMyRows] (const lcl_vec_type& diag) {
1090  const magnitude_type two = STM::one () + STM::one ();
1091  const size_t maxLength = A_row.getNodeMaxNumRowEntries ();
1092  Array<local_ordinal_type> indices (maxLength);
1093  Array<scalar_type> values (maxLength);
1094  size_t numEntries;
1095 
1096  for (LO i = 0; i < numMyRows; ++i) {
1097  A_row.getLocalRowCopy (i, indices (), values (), numEntries);
1098  magnitude_type diagonal_boost = STM::zero ();
1099  for (size_t k = 0 ; k < numEntries; ++k) {
1100  if (indices[k] > numMyRows) {
1101  diagonal_boost += STS::magnitude (values[k] / two);
1102  }
1103  }
1104  if (KAT::magnitude (diag[i]) < L1_eta * diagonal_boost) {
1105  diag[i] += diagonal_boost;
1106  }
1107  }
1108  }, readWrite (gblDiag).on (Kokkos::HostSpace ()));
1109  }
1110 
1111  //
1112  // Invert the matrix's diagonal entries (in Diagonal_).
1113  //
1114 
1115  // Precompute some quantities for "fixing" zero or tiny diagonal
1116  // entries. We'll only use them if this "fixing" is enabled.
1117  //
1118  // SmallTraits covers for the lack of eps() in
1119  // Teuchos::ScalarTraits when its template parameter is not a
1120  // floating-point type. (Ifpack2 sometimes gets instantiated for
1121  // integer Scalar types.)
1122  IST oneOverMinDiagVal = KAT::one () / static_cast<IST> (SmallTraits<scalar_type>::eps ());
1123  if ( MinDiagonalValue_ != zero)
1124  oneOverMinDiagVal = KAT::one () / static_cast<IST> (MinDiagonalValue_);
1125 
1126  // It's helpful not to have to recompute this magnitude each time.
1127  const magnitude_type minDiagValMag = STS::magnitude (MinDiagonalValue_);
1128 
1129  if (checkDiagEntries_) {
1130  //
1131  // Check diagonal elements, replace zero elements with the minimum
1132  // diagonal value, and store their inverses. Count the number of
1133  // "small" and zero diagonal entries while we're at it.
1134  //
1135  size_t numSmallDiagEntries = 0; // "small" includes zero
1136  size_t numZeroDiagEntries = 0; // # zero diagonal entries
1137  size_t numNegDiagEntries = 0; // # negative (real parts of) diagonal entries
1138  magnitude_type minMagDiagEntryMag = STM::zero ();
1139  magnitude_type maxMagDiagEntryMag = STM::zero ();
1140 
1141  vector_type& gblDiag = *Diagonal_;
1142  // Once we have C++14, we can get rid of these two aliases and
1143  // use "auto" in the lambda below.
1144  using rw_type =
1145  decltype (readWrite (gblDiag).on (Kokkos::HostSpace ()));
1146  using lcl_vec_type =
1147  Tpetra::with_local_access_function_argument_type<rw_type>;
1148  Tpetra::withLocalAccess
1149  ([&] (const lcl_vec_type& diag) {
1150  // As we go, keep track of the diagonal entries with the
1151  // least and greatest magnitude. We could use the trick of
1152  // starting min with +Inf and max with -Inf, but that
1153  // doesn't work if scalar_type is a built-in integer type.
1154  // Thus, we have to start by reading the first diagonal
1155  // entry redundantly.
1156  if (numMyRows != 0) {
1157  const magnitude_type d_0_mag = KAT::abs (diag[0]);
1158  minMagDiagEntryMag = d_0_mag;
1159  maxMagDiagEntryMag = d_0_mag;
1160  }
1161 
1162  // Go through all the diagonal entries. Compute counts of
1163  // small-magnitude, zero, and negative-real-part entries.
1164  // Invert the diagonal entries that aren't too small. For
1165  // those too small in magnitude, replace them with
1166  // 1/MinDiagonalValue_ (or 1/eps if MinDiagonalValue_
1167  // happens to be zero).
1168  for (LO i = 0; i < numMyRows; ++i) {
1169  const IST d_i = diag[i];
1170  const magnitude_type d_i_mag = KAT::abs (d_i);
1171  // Work-around for GitHub Issue #5269.
1172  //const magnitude_type d_i_real = KAT::real (d_i);
1173  const auto d_i_real = getRealValue (d_i);
1174 
1175  // We can't compare complex numbers, but we can compare their
1176  // real parts.
1177  if (d_i_real < STM::zero ()) {
1178  ++numNegDiagEntries;
1179  }
1180  if (d_i_mag < minMagDiagEntryMag) {
1181  minMagDiagEntryMag = d_i_mag;
1182  }
1183  if (d_i_mag > maxMagDiagEntryMag) {
1184  maxMagDiagEntryMag = d_i_mag;
1185  }
1186 
1187  if (fixTinyDiagEntries_) {
1188  // <= not <, in case minDiagValMag is zero.
1189  if (d_i_mag <= minDiagValMag) {
1190  ++numSmallDiagEntries;
1191  if (d_i_mag == STM::zero ()) {
1192  ++numZeroDiagEntries;
1193  }
1194  diag[i] = oneOverMinDiagVal;
1195  }
1196  else {
1197  diag[i] = KAT::one () / d_i;
1198  }
1199  }
1200  else { // Don't fix zero or tiny diagonal entries.
1201  // <= not <, in case minDiagValMag is zero.
1202  if (d_i_mag <= minDiagValMag) {
1203  ++numSmallDiagEntries;
1204  if (d_i_mag == STM::zero ()) {
1205  ++numZeroDiagEntries;
1206  }
1207  }
1208  diag[i] = KAT::one () / d_i;
1209  }
1210  }
1211  }, readWrite (gblDiag).on (Kokkos::HostSpace ()));
1212 
1213  // Count floating-point operations of computing the inverse diagonal.
1214  //
1215  // FIXME (mfh 30 Mar 2013) Shouldn't counts be global, not local?
1216  if (STS::isComplex) { // magnitude: at least 3 flops per diagonal entry
1217  ComputeFlops_ += 4.0 * numMyRows;
1218  }
1219  else {
1220  ComputeFlops_ += numMyRows;
1221  }
1222 
1223  // Collect global data about the diagonal entries. Only do this
1224  // (which involves O(1) all-reduces) if the user asked us to do
1225  // the extra work.
1226  //
1227  // FIXME (mfh 28 Mar 2013) This is wrong if some processes have
1228  // zero rows. Fixing this requires one of two options:
1229  //
1230  // 1. Use a custom reduction operation that ignores processes
1231  // with zero diagonal entries.
1232  // 2. Split the communicator, compute all-reduces using the
1233  // subcommunicator over processes that have a nonzero number
1234  // of diagonal entries, and then broadcast from one of those
1235  // processes (if there is one) to the processes in the other
1236  // subcommunicator.
1237  const Comm<int>& comm = * (A_->getRowMap ()->getComm ());
1238 
1239  // Compute global min and max magnitude of entries.
1240  Array<magnitude_type> localVals (2);
1241  localVals[0] = minMagDiagEntryMag;
1242  // (- (min (- x))) is the same as (max x).
1243  localVals[1] = -maxMagDiagEntryMag;
1244  Array<magnitude_type> globalVals (2);
1245  reduceAll<int, magnitude_type> (comm, REDUCE_MIN, 2,
1246  localVals.getRawPtr (),
1247  globalVals.getRawPtr ());
1248  globalMinMagDiagEntryMag_ = globalVals[0];
1249  globalMaxMagDiagEntryMag_ = -globalVals[1];
1250 
1251  Array<size_t> localCounts (3);
1252  localCounts[0] = numSmallDiagEntries;
1253  localCounts[1] = numZeroDiagEntries;
1254  localCounts[2] = numNegDiagEntries;
1255  Array<size_t> globalCounts (3);
1256  reduceAll<int, size_t> (comm, REDUCE_SUM, 3,
1257  localCounts.getRawPtr (),
1258  globalCounts.getRawPtr ());
1259  globalNumSmallDiagEntries_ = globalCounts[0];
1260  globalNumZeroDiagEntries_ = globalCounts[1];
1261  globalNumNegDiagEntries_ = globalCounts[2];
1262 
1263  // Compute and save the difference between the computed inverse
1264  // diagonal, and the original diagonal's inverse.
1265  vector_type diff (A_->getRowMap ());
1266  diff.reciprocal (*origDiag);
1267  if (Diagonal_->need_sync_device ()) {
1268  Diagonal_->sync_device ();
1269  }
1270  diff.update (-one, *Diagonal_, one);
1271  globalDiagNormDiff_ = diff.norm2 ();
1272  }
1273  else { // don't check diagonal elements
1274  if (fixTinyDiagEntries_) {
1275  // Go through all the diagonal entries. Invert those that
1276  // aren't too small in magnitude. For those that are too
1277  // small in magnitude, replace them with oneOverMinDiagVal.
1278  vector_type& gblDiag = *Diagonal_;
1279  Tpetra::transform
1280  ("Ifpack2::Relaxation::compute: Invert & fix diagonal",
1281  gblDiag, gblDiag,
1282  KOKKOS_LAMBDA (const IST& d_i) {
1283  const magnitude_type d_i_mag = KAT::magnitude (d_i);
1284 
1285  // <= not <, in case minDiagValMag is zero.
1286  if (d_i_mag <= minDiagValMag) {
1287  return oneOverMinDiagVal;
1288  }
1289  else {
1290  // For Stokhos types, operator/ returns an expression
1291  // type. Explicitly convert to IST before returning.
1292  return IST (KAT::one () / d_i);
1293  }
1294  });
1295  }
1296  else { // don't fix tiny or zero diagonal entries
1297  if (Diagonal_->need_sync_device ()) {
1298  Diagonal_->sync_device ();
1299  }
1300  Diagonal_->reciprocal (*Diagonal_);
1301  }
1302 
1303  if (STS::isComplex) { // magnitude: at least 3 flops per diagonal entry
1304  ComputeFlops_ += 4.0 * numMyRows;
1305  }
1306  else {
1307  ComputeFlops_ += numMyRows;
1308  }
1309  }
1310 
1311  if (Diagonal_->need_sync_device ()) {
1312  Diagonal_->sync_device ();
1313  }
1314 
1315  if (PrecType_ == Ifpack2::Details::MTGS ||
1316  PrecType_ == Ifpack2::Details::MTSGS ||
1317  PrecType_ == Ifpack2::Details::GS2 ||
1318  PrecType_ == Ifpack2::Details::SGS2) {
1319 
1320  //KokkosKernels GaussSeidel Initialization.
1321 
1322  const crs_matrix_type* crsMat =
1323  dynamic_cast<const crs_matrix_type*> (A_.get());
1325  (crsMat == nullptr, std::logic_error, methodName << ": "
1326  "Multithreaded Gauss-Seidel methods currently only work "
1327  "when the input matrix is a Tpetra::CrsMatrix.");
1328  local_matrix_type kcsr = crsMat->getLocalMatrix ();
1329 
1330  auto diagView_2d = Diagonal_->getLocalViewDevice ();
1331  auto diagView_1d = Kokkos::subview (diagView_2d, Kokkos::ALL (), 0);
1332  using KokkosSparse::Experimental::gauss_seidel_numeric;
1333  gauss_seidel_numeric<mt_kernel_handle_type,
1334  lno_row_view_t,
1335  lno_nonzero_view_t,
1336  scalar_nonzero_view_t> (mtKernelHandle_.getRawPtr (),
1337  A_->getNodeNumRows (),
1338  A_->getNodeNumCols (),
1339  kcsr.graph.row_map,
1340  kcsr.graph.entries,
1341  kcsr.values,
1342  diagView_1d,
1343  is_matrix_structurally_symmetric_);
1344  }
1345  } // end TimeMonitor scope
1346 
1347  ComputeTime_ += timer->totalElapsedTime ();
1348  ++NumCompute_;
1349  IsComputed_ = true;
1350 }
1351 
1352 
1353 template<class MatrixType>
1354 void
1356 ApplyInverseRichardson (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1357  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1358 {
1359  using Teuchos::as;
1360  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1361  const double numVectors = as<double> (X.getNumVectors ());
1362  if (ZeroStartingSolution_) {
1363  // For the first Richardson sweep, if we are allowed to assume that
1364  // the initial guess is zero, then Richardson is just alpha times the RHS
1365  // Compute it as Y(i,j) = DampingFactor_ * X(i,j).
1366  Y.scale(DampingFactor_,X);
1367 
1368  // Count (global) floating-point operations. Ifpack2 represents
1369  // this as a floating-point number rather than an integer, so that
1370  // overflow (for a very large number of calls, or a very large
1371  // problem) is approximate instead of catastrophic.
1372  double flopUpdate = 0.0;
1373  if (DampingFactor_ == STS::one ()) {
1374  // Y(i,j) = X(i,j): one multiply for each entry of Y.
1375  flopUpdate = numGlobalRows * numVectors;
1376  } else {
1377  // Y(i,j) = DampingFactor_ * X(i,j):
1378  // One multiplies per entry of Y.
1379  flopUpdate = numGlobalRows * numVectors;
1380  }
1381  ApplyFlops_ += flopUpdate;
1382  if (NumSweeps_ == 1) {
1383  return;
1384  }
1385  }
1386  // If we were allowed to assume that the starting guess was zero,
1387  // then we have already done the first sweep above.
1388  const int startSweep = ZeroStartingSolution_ ? 1 : 0;
1389 
1390  // Allocate a multivector if the cached one isn't perfect
1391  updateCachedMultiVector(Y.getMap(),as<size_t>(numVectors));
1392 
1393  for (int j = startSweep; j < NumSweeps_; ++j) {
1394  // Each iteration: Y = Y + \omega D^{-1} (X - A*Y)
1395  Tpetra::Details::residual(*A_,Y,X,*cachedMV_);
1396  Y.update(DampingFactor_,*cachedMV_,STS::one());
1397  }
1398 
1399  // For each column of output, for each pass over the matrix:
1400  //
1401  // - One + and one * for each matrix entry
1402  // - One / and one + for each row of the matrix
1403  // - If the damping factor is not one: one * for each row of the
1404  // matrix. (It's not fair to count this if the damping factor is
1405  // one, since the implementation could skip it. Whether it does
1406  // or not is the implementation's choice.)
1407 
1408  // Floating-point operations due to the damping factor, per matrix
1409  // row, per direction, per columm of output.
1410  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1411  const double dampingFlops = (DampingFactor_ == STS::one ()) ? 0.0 : 1.0;
1412  ApplyFlops_ += as<double> (NumSweeps_ - startSweep) * numVectors *
1413  (2.0 * numGlobalNonzeros + dampingFlops);
1414 }
1415 
1416 
1417 template<class MatrixType>
1418 void
1419 Relaxation<MatrixType>::
1420 ApplyInverseJacobi (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1421  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1422 {
1423  using Teuchos::as;
1424  if (hasBlockCrsMatrix_) {
1425  ApplyInverseJacobi_BlockCrsMatrix (X, Y);
1426  return;
1427  }
1428 
1429  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1430  const double numVectors = as<double> (X.getNumVectors ());
1431  if (ZeroStartingSolution_) {
1432  // For the first Jacobi sweep, if we are allowed to assume that
1433  // the initial guess is zero, then Jacobi is just diagonal
1434  // scaling. (A_ij * x_j = 0 for i != j, since x_j = 0.)
1435  // Compute it as Y(i,j) = DampingFactor_ * X(i,j) * D(i).
1436  Y.elementWiseMultiply (DampingFactor_, *Diagonal_, X, STS::zero ());
1437 
1438  // Count (global) floating-point operations. Ifpack2 represents
1439  // this as a floating-point number rather than an integer, so that
1440  // overflow (for a very large number of calls, or a very large
1441  // problem) is approximate instead of catastrophic.
1442  double flopUpdate = 0.0;
1443  if (DampingFactor_ == STS::one ()) {
1444  // Y(i,j) = X(i,j) * D(i): one multiply for each entry of Y.
1445  flopUpdate = numGlobalRows * numVectors;
1446  } else {
1447  // Y(i,j) = DampingFactor_ * X(i,j) * D(i):
1448  // Two multiplies per entry of Y.
1449  flopUpdate = 2.0 * numGlobalRows * numVectors;
1450  }
1451  ApplyFlops_ += flopUpdate;
1452  if (NumSweeps_ == 1) {
1453  return;
1454  }
1455  }
1456  // If we were allowed to assume that the starting guess was zero,
1457  // then we have already done the first sweep above.
1458  const int startSweep = ZeroStartingSolution_ ? 1 : 0;
1459 
1460  // Allocate a multivector if the cached one isn't perfect
1461  updateCachedMultiVector(Y.getMap(),as<size_t>(numVectors));
1462 
1463  for (int j = startSweep; j < NumSweeps_; ++j) {
1464  // Each iteration: Y = Y + \omega D^{-1} (X - A*Y)
1465  Tpetra::Details::residual(*A_,Y,X,*cachedMV_);
1466  Y.elementWiseMultiply (DampingFactor_, *Diagonal_, *cachedMV_, STS::one ());
1467  }
1468 
1469  // For each column of output, for each pass over the matrix:
1470  //
1471  // - One + and one * for each matrix entry
1472  // - One / and one + for each row of the matrix
1473  // - If the damping factor is not one: one * for each row of the
1474  // matrix. (It's not fair to count this if the damping factor is
1475  // one, since the implementation could skip it. Whether it does
1476  // or not is the implementation's choice.)
1477 
1478  // Floating-point operations due to the damping factor, per matrix
1479  // row, per direction, per columm of output.
1480  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1481  const double dampingFlops = (DampingFactor_ == STS::one ()) ? 0.0 : 1.0;
1482  ApplyFlops_ += as<double> (NumSweeps_ - startSweep) * numVectors *
1483  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1484 }
1485 
1486 template<class MatrixType>
1487 void
1488 Relaxation<MatrixType>::
1489 ApplyInverseJacobi_BlockCrsMatrix (const Tpetra::MultiVector<scalar_type,
1490  local_ordinal_type,
1491  global_ordinal_type,
1492  node_type>& X,
1493  Tpetra::MultiVector<scalar_type,
1494  local_ordinal_type,
1495  global_ordinal_type,
1496  node_type>& Y) const
1497 {
1498  using Tpetra::BlockMultiVector;
1499  using BMV = BlockMultiVector<scalar_type, local_ordinal_type,
1500  global_ordinal_type, node_type>;
1501 
1502  const block_crs_matrix_type* blockMatConst =
1503  dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr ());
1505  (blockMatConst == nullptr, std::logic_error, "This method should "
1506  "never be called if the matrix A_ is not a BlockCrsMatrix. "
1507  "Please report this bug to the Ifpack2 developers.");
1508  // mfh 23 Jan 2016: Unfortunately, the const cast is necessary.
1509  // This is because applyBlock() is nonconst (more accurate), while
1510  // apply() is const (required by Tpetra::Operator interface, but a
1511  // lie, because it possibly allocates temporary buffers).
1512  block_crs_matrix_type* blockMat =
1513  const_cast<block_crs_matrix_type*> (blockMatConst);
1514 
1515  auto meshRowMap = blockMat->getRowMap ();
1516  auto meshColMap = blockMat->getColMap ();
1517  const local_ordinal_type blockSize = blockMat->getBlockSize ();
1518 
1519  BMV X_blk (X, *meshColMap, blockSize);
1520  BMV Y_blk (Y, *meshRowMap, blockSize);
1521 
1522  if (ZeroStartingSolution_) {
1523  // For the first sweep, if we are allowed to assume that the
1524  // initial guess is zero, then block Jacobi is just block diagonal
1525  // scaling. (A_ij * x_j = 0 for i != j, since x_j = 0.)
1526  Y_blk.blockWiseMultiply (DampingFactor_, blockDiag_, X_blk);
1527  if (NumSweeps_ == 1) {
1528  return;
1529  }
1530  }
1531 
1532  auto pointRowMap = Y.getMap ();
1533  const size_t numVecs = X.getNumVectors ();
1534 
1535  // We don't need to initialize the result MV, since the sparse
1536  // mat-vec will clobber its contents anyway.
1537  BMV A_times_Y (*meshRowMap, *pointRowMap, blockSize, numVecs);
1538 
1539  // If we were allowed to assume that the starting guess was zero,
1540  // then we have already done the first sweep above.
1541  const int startSweep = ZeroStartingSolution_ ? 1 : 0;
1542 
1543  for (int j = startSweep; j < NumSweeps_; ++j) {
1544  blockMat->applyBlock (Y_blk, A_times_Y);
1545  // Y := Y + \omega D^{-1} (X - A*Y). Use A_times_Y as scratch.
1546  Y_blk.blockJacobiUpdate (DampingFactor_, blockDiag_,
1547  X_blk, A_times_Y, STS::one ());
1548  }
1549 }
1550 
1551 template<class MatrixType>
1552 void
1553 Relaxation<MatrixType>::
1554 ApplyInverseGS (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1555  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1556 {
1557  using this_type = Relaxation<MatrixType>;
1558  // The CrsMatrix version is faster, because it can access the sparse
1559  // matrix data directly, rather than by copying out each row's data
1560  // in turn. Thus, we check whether the RowMatrix is really a
1561  // CrsMatrix.
1562  //
1563  // FIXME (mfh 07 Jul 2013) See note on crs_matrix_type typedef
1564  // declaration in Ifpack2_Relaxation_decl.hpp header file. The code
1565  // will still be correct if the cast fails, but it will use an
1566  // unoptimized kernel.
1567  const block_crs_matrix_type* blockCrsMat =
1568  dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr ());
1569  const crs_matrix_type* crsMat =
1570  dynamic_cast<const crs_matrix_type*> (A_.getRawPtr ());
1571  if (blockCrsMat != nullptr) {
1572  const_cast<this_type&> (*this).ApplyInverseGS_BlockCrsMatrix (*blockCrsMat, X, Y);
1573  }
1574  else if (crsMat != nullptr) {
1575  ApplyInverseGS_CrsMatrix (*crsMat, X, Y);
1576  }
1577  else {
1578  ApplyInverseGS_RowMatrix (X, Y);
1579  }
1580 }
1581 
1582 
1583 template<class MatrixType>
1584 void
1585 Relaxation<MatrixType>::
1586 ApplyInverseGS_RowMatrix (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1587  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1588 {
1589  using Teuchos::Array;
1590  using Teuchos::ArrayRCP;
1591  using Teuchos::ArrayView;
1592  using Teuchos::as;
1593  using Teuchos::RCP;
1594  using Teuchos::rcp;
1595  using Teuchos::rcpFromRef;
1596  typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> MV;
1597 
1598  // Tpetra's GS implementation for CrsMatrix handles zeroing out the
1599  // starting multivector itself. The generic RowMatrix version here
1600  // does not, so we have to zero out Y here.
1601  if (ZeroStartingSolution_) {
1602  Y.putScalar (STS::zero ());
1603  }
1604 
1605  const size_t NumVectors = X.getNumVectors ();
1606  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
1607  Array<local_ordinal_type> Indices (maxLength);
1608  Array<scalar_type> Values (maxLength);
1609 
1610  // Local smoothing stuff
1611  const size_t numMyRows = A_->getNodeNumRows();
1612  const local_ordinal_type* rowInd = 0;
1613  size_t numActive = numMyRows;
1614  bool do_local = ! localSmoothingIndices_.is_null ();
1615  if (do_local) {
1616  rowInd = localSmoothingIndices_.getRawPtr ();
1617  numActive = localSmoothingIndices_.size ();
1618  }
1619 
1620  RCP<MV> Y2;
1621  if (IsParallel_) {
1622  if (Importer_.is_null ()) { // domain and column Maps are the same.
1623  updateCachedMultiVector (Y.getMap (), NumVectors);
1624  }
1625  else {
1626  updateCachedMultiVector (Importer_->getTargetMap (), NumVectors);
1627  }
1628  Y2 = cachedMV_;
1629  }
1630  else {
1631  Y2 = rcpFromRef (Y);
1632  }
1633 
1634  // Diagonal
1635  ArrayRCP<const scalar_type> d_rcp = Diagonal_->get1dView ();
1636  ArrayView<const scalar_type> d_ptr = d_rcp();
1637 
1638  // Constant stride check
1639  bool constant_stride = X.isConstantStride() && Y2->isConstantStride();
1640 
1641  if (constant_stride) {
1642  // extract 1D RCPs
1643  size_t x_stride = X.getStride();
1644  size_t y2_stride = Y2->getStride();
1645  ArrayRCP<scalar_type> y2_rcp = Y2->get1dViewNonConst();
1646  ArrayRCP<const scalar_type> x_rcp = X.get1dView();
1647  ArrayView<scalar_type> y2_ptr = y2_rcp();
1648  ArrayView<const scalar_type> x_ptr = x_rcp();
1649  Array<scalar_type> dtemp(NumVectors,STS::zero());
1650 
1651  for (int j = 0; j < NumSweeps_; ++j) {
1652  // data exchange is here, once per sweep
1653  if (IsParallel_) {
1654  if (Importer_.is_null ()) {
1655  // FIXME (mfh 27 May 2019) This doesn't deep copy -- not
1656  // clear if this is correct. Reevaluate at some point.
1657 
1658  *Y2 = Y; // just copy, since domain and column Maps are the same
1659  } else {
1660  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1661  }
1662  }
1663 
1664  if (! DoBackwardGS_) { // Forward sweep
1665  for (size_t ii = 0; ii < numActive; ++ii) {
1666  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1667  size_t NumEntries;
1668  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1669  dtemp.assign(NumVectors,STS::zero());
1670 
1671  for (size_t k = 0; k < NumEntries; ++k) {
1672  const local_ordinal_type col = Indices[k];
1673  for (size_t m = 0; m < NumVectors; ++m) {
1674  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
1675  }
1676  }
1677 
1678  for (size_t m = 0; m < NumVectors; ++m) {
1679  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
1680  }
1681  }
1682  }
1683  else { // Backward sweep
1684  // ptrdiff_t is the same size as size_t, but is signed. Being
1685  // signed is important so that i >= 0 is not trivially true.
1686  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
1687  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1688  size_t NumEntries;
1689  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1690  dtemp.assign (NumVectors, STS::zero ());
1691 
1692  for (size_t k = 0; k < NumEntries; ++k) {
1693  const local_ordinal_type col = Indices[k];
1694  for (size_t m = 0; m < NumVectors; ++m) {
1695  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
1696  }
1697  }
1698 
1699  for (size_t m = 0; m < NumVectors; ++m) {
1700  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
1701  }
1702  }
1703  }
1704  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
1705  if (IsParallel_) {
1706  Tpetra::deep_copy (Y, *Y2);
1707  }
1708  }
1709  }
1710  else {
1711  // extract 2D RCPS
1712  ArrayRCP<ArrayRCP<scalar_type> > y2_ptr = Y2->get2dViewNonConst ();
1713  ArrayRCP<ArrayRCP<const scalar_type> > x_ptr = X.get2dView ();
1714 
1715  for (int j = 0; j < NumSweeps_; ++j) {
1716  // data exchange is here, once per sweep
1717  if (IsParallel_) {
1718  if (Importer_.is_null ()) {
1719  *Y2 = Y; // just copy, since domain and column Maps are the same
1720  } else {
1721  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1722  }
1723  }
1724 
1725  if (! DoBackwardGS_) { // Forward sweep
1726  for (size_t ii = 0; ii < numActive; ++ii) {
1727  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1728  size_t NumEntries;
1729  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1730 
1731  for (size_t m = 0; m < NumVectors; ++m) {
1732  scalar_type dtemp = STS::zero ();
1733  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
1734  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
1735 
1736  for (size_t k = 0; k < NumEntries; ++k) {
1737  const local_ordinal_type col = Indices[k];
1738  dtemp += Values[k] * y2_local[col];
1739  }
1740  y2_local[i] += DampingFactor_ * d_ptr[i] * (x_local[i] - dtemp);
1741  }
1742  }
1743  }
1744  else { // Backward sweep
1745  // ptrdiff_t is the same size as size_t, but is signed. Being
1746  // signed is important so that i >= 0 is not trivially true.
1747  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
1748  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1749 
1750  size_t NumEntries;
1751  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1752 
1753  for (size_t m = 0; m < NumVectors; ++m) {
1754  scalar_type dtemp = STS::zero ();
1755  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
1756  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
1757 
1758  for (size_t k = 0; k < NumEntries; ++k) {
1759  const local_ordinal_type col = Indices[k];
1760  dtemp += Values[k] * y2_local[col];
1761  }
1762  y2_local[i] += DampingFactor_ * d_ptr[i] * (x_local[i] - dtemp);
1763  }
1764  }
1765  }
1766 
1767  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
1768  if (IsParallel_) {
1769  Tpetra::deep_copy (Y, *Y2);
1770  }
1771  }
1772  }
1773 
1774 
1775  // See flop count discussion in implementation of ApplyInverseGS_CrsMatrix().
1776  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
1777  const double numVectors = as<double> (X.getNumVectors ());
1778  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1779  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1780  ApplyFlops_ += 2.0 * NumSweeps_ * numVectors *
1781  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1782 }
1783 
1784 
1785 template<class MatrixType>
1786 void
1787 Relaxation<MatrixType>::
1788 ApplyInverseGS_CrsMatrix (const crs_matrix_type& A,
1789  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1790  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1791 {
1792  using Teuchos::as;
1793  const Tpetra::ESweepDirection direction =
1794  DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward;
1795  if (localSmoothingIndices_.is_null ()) {
1796  A.gaussSeidelCopy (Y, X, *Diagonal_, DampingFactor_, direction,
1797  NumSweeps_, ZeroStartingSolution_);
1798  }
1799  else {
1800  A.reorderedGaussSeidelCopy (Y, X, *Diagonal_, localSmoothingIndices_ (),
1801  DampingFactor_, direction,
1802  NumSweeps_, ZeroStartingSolution_);
1803  }
1804 
1805  // For each column of output, for each sweep over the matrix:
1806  //
1807  // - One + and one * for each matrix entry
1808  // - One / and one + for each row of the matrix
1809  // - If the damping factor is not one: one * for each row of the
1810  // matrix. (It's not fair to count this if the damping factor is
1811  // one, since the implementation could skip it. Whether it does
1812  // or not is the implementation's choice.)
1813 
1814  // Floating-point operations due to the damping factor, per matrix
1815  // row, per direction, per columm of output.
1816  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
1817  const double numVectors = as<double> (X.getNumVectors ());
1818  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1819  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1820  ApplyFlops_ += NumSweeps_ * numVectors *
1821  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1822 }
1823 
1824 template<class MatrixType>
1825 void
1826 Relaxation<MatrixType>::
1827 ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A,
1828  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1829  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y)
1830 {
1831  using Tpetra::INSERT;
1832  using Teuchos::RCP;
1833  using Teuchos::rcp;
1834  using Teuchos::rcpFromRef;
1835  using BMV = Tpetra::BlockMultiVector<scalar_type,
1836  local_ordinal_type, global_ordinal_type, node_type>;
1837  using MV = Tpetra::MultiVector<scalar_type,
1838  local_ordinal_type, global_ordinal_type, node_type>;
1839  using map_type = Tpetra::Map<local_ordinal_type,
1840  global_ordinal_type, node_type>;
1841  using import_type = Tpetra::Import<local_ordinal_type,
1842  global_ordinal_type, node_type>;
1843 
1844  //FIXME: (tcf) 8/21/2014 -- may be problematic for multiple right hand sides
1845  //
1846  // NOTE (mfh 12 Sep 2014) I don't think it should be a problem for
1847  // multiple right-hand sides, unless the input or output MultiVector
1848  // does not have constant stride. We should check for that case
1849  // here, in case it doesn't work in localGaussSeidel (which is
1850  // entirely possible).
1851  BMV yBlock(Y, *(A.getGraph ()->getDomainMap()), A.getBlockSize());
1852  const BMV xBlock(X, *(A.getColMap ()), A.getBlockSize());
1853 
1854  bool performImport = false;
1855  RCP<BMV> yBlockCol;
1856  if (Importer_.is_null()) {
1857  yBlockCol = rcpFromRef(yBlock);
1858  }
1859  else {
1860  if (yBlockColumnPointMap_.is_null() ||
1861  yBlockColumnPointMap_->getNumVectors() != yBlock.getNumVectors() ||
1862  yBlockColumnPointMap_->getBlockSize() != yBlock.getBlockSize()) {
1863  yBlockColumnPointMap_ =
1864  rcp(new BMV(*(A.getColMap()), A.getBlockSize(),
1865  static_cast<local_ordinal_type>(yBlock.getNumVectors())));
1866  }
1867  yBlockCol = yBlockColumnPointMap_;
1868  if (pointImporter_.is_null()) {
1869  auto srcMap = rcp(new map_type(yBlock.getPointMap()));
1870  auto tgtMap = rcp(new map_type(yBlockCol->getPointMap()));
1871  pointImporter_ = rcp(new import_type(srcMap, tgtMap));
1872  }
1873  performImport = true;
1874  }
1875 
1876  MV yBlock_mv;
1877  MV yBlockCol_mv;
1878  RCP<const MV> yBlockColPointDomain;
1879  if (performImport) { // create views (shallow copies)
1880  yBlock_mv = yBlock.getMultiVectorView();
1881  yBlockCol_mv = yBlockCol->getMultiVectorView();
1882  yBlockColPointDomain =
1883  yBlockCol_mv.offsetView(A.getDomainMap(), 0);
1884  }
1885 
1886  if (ZeroStartingSolution_) {
1887  yBlockCol->putScalar(STS::zero ());
1888  }
1889  else if (performImport) {
1890  yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT);
1891  }
1892 
1893  const Tpetra::ESweepDirection direction =
1894  DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward;
1895 
1896  for (int sweep = 0; sweep < NumSweeps_; ++sweep) {
1897  if (performImport && sweep > 0) {
1898  yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT);
1899  }
1900  A.localGaussSeidel(xBlock, *yBlockCol, blockDiag_,
1901  DampingFactor_, direction);
1902  if (performImport) {
1903  Tpetra::deep_copy(Y, *yBlockColPointDomain);
1904  }
1905  }
1906 }
1907 
1908 template<class MatrixType>
1909 void
1910 Relaxation<MatrixType>::
1911 MTGaussSeidel (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& B,
1912  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1913  const Tpetra::ESweepDirection direction) const
1914 {
1915  using Teuchos::null;
1916  using Teuchos::RCP;
1917  using Teuchos::rcp;
1918  using Teuchos::rcpFromRef;
1919  using Teuchos::rcp_const_cast;
1920  using Teuchos::as;
1921 
1922  typedef scalar_type Scalar;
1923  typedef local_ordinal_type LocalOrdinal;
1924  typedef global_ordinal_type GlobalOrdinal;
1925  typedef node_type Node;
1926 
1927  const char prefix[] = "Ifpack2::Relaxation::(reordered)MTGaussSeidel: ";
1928  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
1929 
1930  const crs_matrix_type* crsMat = dynamic_cast<const crs_matrix_type*> (A_.get());
1932  (crsMat == nullptr, std::logic_error, "Ifpack2::Relaxation::apply: "
1933  "Multithreaded Gauss-Seidel methods currently only work when the "
1934  "input matrix is a Tpetra::CrsMatrix.");
1935 
1936  //Teuchos::ArrayView<local_ordinal_type> rowIndices; // unused, as of 04 Jan 2017
1938  (! localSmoothingIndices_.is_null (), std::logic_error,
1939  "Ifpack2's implementation of Multithreaded Gauss-Seidel does not "
1940  "implement the case where the user supplies an iteration order. "
1941  "This error used to appear as \"MT GaussSeidel ignores the given "
1942  "order\". "
1943  "I tried to add more explanation, but I didn't implement \"MT "
1944  "GaussSeidel\" [sic]. "
1945  "You'll have to ask the person who did.");
1946 
1948  (crsMat == nullptr, std::logic_error, prefix << "The matrix is null."
1949  " This should never happen. Please report this bug to the Ifpack2 "
1950  "developers.");
1952  (! crsMat->isFillComplete (), std::runtime_error, prefix << "The "
1953  "input CrsMatrix is not fill complete. Please call fillComplete "
1954  "on the matrix, then do setup again, before calling apply(). "
1955  "\"Do setup\" means that if only the matrix's values have changed "
1956  "since last setup, you need only call compute(). If the matrix's "
1957  "structure may also have changed, you must first call initialize(), "
1958  "then call compute(). If you have not set up this preconditioner "
1959  "for this matrix before, you must first call initialize(), then "
1960  "call compute().");
1962  (NumSweeps_ < 0, std::logic_error, prefix << ": NumSweeps_ = "
1963  << NumSweeps_ << " < 0. Please report this bug to the Ifpack2 "
1964  "developers.");
1965  if (NumSweeps_ == 0) {
1966  return;
1967  }
1968 
1969  typedef typename Tpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> MV;
1970  typedef typename crs_matrix_type::import_type import_type;
1971  typedef typename crs_matrix_type::export_type export_type;
1972  typedef typename crs_matrix_type::map_type map_type;
1973 
1974  RCP<const import_type> importer = crsMat->getGraph ()->getImporter ();
1975  RCP<const export_type> exporter = crsMat->getGraph ()->getExporter ();
1977  ! exporter.is_null (), std::runtime_error,
1978  "This method's implementation currently requires that the matrix's row, "
1979  "domain, and range Maps be the same. This cannot be the case, because "
1980  "the matrix has a nontrivial Export object.");
1981 
1982  RCP<const map_type> domainMap = crsMat->getDomainMap ();
1983  RCP<const map_type> rangeMap = crsMat->getRangeMap ();
1984  RCP<const map_type> rowMap = crsMat->getGraph ()->getRowMap ();
1985  RCP<const map_type> colMap = crsMat->getGraph ()->getColMap ();
1986 
1987 #ifdef HAVE_IFPACK2_DEBUG
1988  {
1989  // The relation 'isSameAs' is transitive. It's also a
1990  // collective, so we don't have to do a "shared" test for
1991  // exception (i.e., a global reduction on the test value).
1993  ! X.getMap ()->isSameAs (*domainMap), std::runtime_error,
1994  "Ifpack2::Relaxation::MTGaussSeidel requires that the input "
1995  "multivector X be in the domain Map of the matrix.");
1997  ! B.getMap ()->isSameAs (*rangeMap), std::runtime_error,
1998  "Ifpack2::Relaxation::MTGaussSeidel requires that the input "
1999  "B be in the range Map of the matrix.");
2001  ! D.getMap ()->isSameAs (*rowMap), std::runtime_error,
2002  "Ifpack2::Relaxation::MTGaussSeidel requires that the input "
2003  "D be in the row Map of the matrix.");
2005  ! rowMap->isSameAs (*rangeMap), std::runtime_error,
2006  "Ifpack2::Relaxation::MTGaussSeidel requires that the row Map and the "
2007  "range Map be the same (in the sense of Tpetra::Map::isSameAs).");
2009  ! domainMap->isSameAs (*rangeMap), std::runtime_error,
2010  "Ifpack2::Relaxation::MTGaussSeidel requires that the domain Map and "
2011  "the range Map of the matrix be the same.");
2012  }
2013 #else
2014  // Forestall any compiler warnings for unused variables.
2015  (void) rangeMap;
2016  (void) rowMap;
2017 #endif // HAVE_IFPACK2_DEBUG
2018 
2019  // Fetch a (possibly cached) temporary column Map multivector
2020  // X_colMap, and a domain Map view X_domainMap of it. Both have
2021  // constant stride by construction. We know that the domain Map
2022  // must include the column Map, because our Gauss-Seidel kernel
2023  // requires that the row Map, domain Map, and range Map are all
2024  // the same, and that each process owns all of its own diagonal
2025  // entries of the matrix.
2026 
2027  RCP<MV> X_colMap;
2028  RCP<MV> X_domainMap;
2029  bool copyBackOutput = false;
2030  if (importer.is_null ()) {
2031  if (X.isConstantStride ()) {
2032  X_colMap = rcpFromRef (X);
2033  X_domainMap = rcpFromRef (X);
2034 
2035  // Column Map and domain Map are the same, so there are no
2036  // remote entries. Thus, if we are not setting the initial
2037  // guess to zero, we don't have to worry about setting remote
2038  // entries to zero, even though we are not doing an Import in
2039  // this case.
2040  if (ZeroStartingSolution_) {
2041  X_colMap->putScalar (ZERO);
2042  }
2043  // No need to copy back to X at end.
2044  }
2045  else {
2046  // We must copy X into a constant stride multivector.
2047  // Just use the cached column Map multivector for that.
2048  // force=true means fill with zeros, so no need to fill
2049  // remote entries (not in domain Map) with zeros.
2050  updateCachedMultiVector(colMap,as<size_t>(X.getNumVectors()));
2051  X_colMap = cachedMV_;
2052  // X_domainMap is always a domain Map view of the column Map
2053  // multivector. In this case, the domain and column Maps are
2054  // the same, so X_domainMap _is_ X_colMap.
2055  X_domainMap = X_colMap;
2056  if (! ZeroStartingSolution_) { // Don't copy if zero initial guess
2057  try {
2058  deep_copy (*X_domainMap , X); // Copy X into constant stride MV
2059  } catch (std::exception& e) {
2060  std::ostringstream os;
2061  os << "Ifpack2::Relaxation::MTGaussSeidel: "
2062  "deep_copy(*X_domainMap, X) threw an exception: "
2063  << e.what () << ".";
2064  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ());
2065  }
2066  }
2067  copyBackOutput = true; // Don't forget to copy back at end.
2068  /*
2069  TPETRA_EFFICIENCY_WARNING(
2070  ! X.isConstantStride (),
2071  std::runtime_error,
2072  "MTGaussSeidel: The current implementation of the Gauss-Seidel "
2073  "kernel requires that X and B both have constant stride. Since X "
2074  "does not have constant stride, we had to make a copy. This is a "
2075  "limitation of the current implementation and not your fault, but we "
2076  "still report it as an efficiency warning for your information.");
2077  */
2078  }
2079  }
2080  else { // Column Map and domain Map are _not_ the same.
2081  updateCachedMultiVector(colMap,as<size_t>(X.getNumVectors()));
2082  X_colMap = cachedMV_;
2083 
2084  X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0);
2085 
2086 #ifdef HAVE_IFPACK2_DEBUG
2087  auto X_colMap_host_view = X_colMap->template getLocalView<Kokkos::HostSpace> ();
2088  auto X_domainMap_host_view = X_domainMap->template getLocalView<Kokkos::HostSpace> ();
2089 
2090  if (X_colMap->getLocalLength () != 0 && X_domainMap->getLocalLength ()) {
2092  X_colMap_host_view.data () != X_domainMap_host_view.data (),
2093  std::logic_error, "Ifpack2::Relaxation::MTGaussSeidel: "
2094  "Pointer to start of column Map view of X is not equal to pointer to "
2095  "start of (domain Map view of) X. This may mean that "
2096  "Tpetra::MultiVector::offsetViewNonConst is broken. "
2097  "Please report this bug to the Tpetra developers.");
2098  }
2099 
2101  X_colMap_host_view.extent (0) < X_domainMap_host_view.extent (0) ||
2102  X_colMap->getLocalLength () < X_domainMap->getLocalLength (),
2103  std::logic_error, "Ifpack2::Relaxation::MTGaussSeidel: "
2104  "X_colMap has fewer local rows than X_domainMap. "
2105  "X_colMap_host_view.extent(0) = " << X_colMap_host_view.extent (0)
2106  << ", X_domainMap_host_view.extent(0) = "
2107  << X_domainMap_host_view.extent (0)
2108  << ", X_colMap->getLocalLength() = " << X_colMap->getLocalLength ()
2109  << ", and X_domainMap->getLocalLength() = "
2110  << X_domainMap->getLocalLength ()
2111  << ". This means that Tpetra::MultiVector::offsetViewNonConst "
2112  "is broken. Please report this bug to the Tpetra developers.");
2113 
2115  X_colMap->getNumVectors () != X_domainMap->getNumVectors (),
2116  std::logic_error, "Ifpack2::Relaxation::MTGaussSeidel: "
2117  "X_colMap has a different number of columns than X_domainMap. "
2118  "X_colMap->getNumVectors() = " << X_colMap->getNumVectors ()
2119  << " != X_domainMap->getNumVectors() = "
2120  << X_domainMap->getNumVectors ()
2121  << ". This means that Tpetra::MultiVector::offsetViewNonConst "
2122  "is broken. Please report this bug to the Tpetra developers.");
2123 #endif // HAVE_IFPACK2_DEBUG
2124 
2125  if (ZeroStartingSolution_) {
2126  // No need for an Import, since we're filling with zeros.
2127  X_colMap->putScalar (ZERO);
2128  } else {
2129  // We could just copy X into X_domainMap. However, that
2130  // wastes a copy, because the Import also does a copy (plus
2131  // communication). Since the typical use case for
2132  // Gauss-Seidel is a small number of sweeps (2 is typical), we
2133  // don't want to waste that copy. Thus, we do the Import
2134  // here, and skip the first Import in the first sweep.
2135  // Importing directly from X effects the copy into X_domainMap
2136  // (which is a view of X_colMap).
2137  X_colMap->doImport (X, *importer, Tpetra::CombineMode::INSERT);
2138  }
2139  copyBackOutput = true; // Don't forget to copy back at end.
2140  } // if column and domain Maps are (not) the same
2141 
2142  // The Gauss-Seidel / SOR kernel expects multivectors of constant
2143  // stride. X_colMap is by construction, but B might not be. If
2144  // it's not, we have to make a copy.
2145  RCP<const MV> B_in;
2146  if (B.isConstantStride ()) {
2147  B_in = rcpFromRef (B);
2148  }
2149  else {
2150  // Range Map and row Map are the same in this case, so we can
2151  // use the cached row Map multivector to store a constant stride
2152  // copy of B.
2153  //RCP<MV> B_in_nonconst = crsMat->getRowMapMultiVector (B, true);
2154  RCP<MV> B_in_nonconst = rcp (new MV (rowMap, B.getNumVectors()));
2155  try {
2156  deep_copy (*B_in_nonconst, B);
2157  } catch (std::exception& e) {
2158  std::ostringstream os;
2159  os << "Ifpack2::Relaxation::MTGaussSeidel: "
2160  "deep_copy(*B_in_nonconst, B) threw an exception: "
2161  << e.what () << ".";
2162  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ());
2163  }
2164  B_in = rcp_const_cast<const MV> (B_in_nonconst);
2165 
2166  /*
2167  TPETRA_EFFICIENCY_WARNING(
2168  ! B.isConstantStride (),
2169  std::runtime_error,
2170  "MTGaussSeidel: The current implementation requires that B have "
2171  "constant stride. Since B does not have constant stride, we had to "
2172  "copy it into a separate constant-stride multivector. This is a "
2173  "limitation of the current implementation and not your fault, but we "
2174  "still report it as an efficiency warning for your information.");
2175  */
2176  }
2177 
2178  local_matrix_type kcsr = crsMat->getLocalMatrix ();
2179 
2180  bool update_y_vector = true;
2181  //false as it was done up already, and we dont want to zero it in each sweep.
2182  bool zero_x_vector = false;
2183 
2184  for (int sweep = 0; sweep < NumSweeps_; ++sweep) {
2185  if (! importer.is_null () && sweep > 0) {
2186  // We already did the first Import for the zeroth sweep above,
2187  // if it was necessary.
2188  X_colMap->doImport (*X_domainMap, *importer, Tpetra::CombineMode::INSERT);
2189  }
2190 
2191  if (direction == Tpetra::Symmetric) {
2192  KokkosSparse::Experimental::symmetric_gauss_seidel_apply
2193  (mtKernelHandle_.getRawPtr(), A_->getNodeNumRows(), A_->getNodeNumCols(),
2194  kcsr.graph.row_map, kcsr.graph.entries, kcsr.values,
2195  X_colMap->getLocalViewDevice(),
2196  B_in->getLocalViewDevice(),
2197  zero_x_vector, update_y_vector, DampingFactor_, 1);
2198  }
2199  else if (direction == Tpetra::Forward) {
2200  KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply
2201  (mtKernelHandle_.getRawPtr(), A_->getNodeNumRows(), A_->getNodeNumCols(),
2202  kcsr.graph.row_map,kcsr.graph.entries, kcsr.values,
2203  X_colMap->getLocalViewDevice (),
2204  B_in->getLocalViewDevice(),
2205  zero_x_vector, update_y_vector, DampingFactor_, 1);
2206  }
2207  else if (direction == Tpetra::Backward) {
2208  KokkosSparse::Experimental::backward_sweep_gauss_seidel_apply
2209  (mtKernelHandle_.getRawPtr(), A_->getNodeNumRows(), A_->getNodeNumCols(),
2210  kcsr.graph.row_map,kcsr.graph.entries, kcsr.values,
2211  X_colMap->getLocalViewDevice(),
2212  B_in->getLocalViewDevice(),
2213  zero_x_vector, update_y_vector, DampingFactor_, 1);
2214  }
2215  else {
2217  true, std::invalid_argument,
2218  prefix << "The 'direction' enum does not have any of its valid "
2219  "values: Forward, Backward, or Symmetric.");
2220  }
2221  update_y_vector = false;
2222  }
2223 
2224  if (copyBackOutput) {
2225  try {
2226  deep_copy (X , *X_domainMap); // Copy result back into X.
2227  } catch (std::exception& e) {
2229  true, std::runtime_error, prefix << "deep_copy(X, *X_domainMap) "
2230  "threw an exception: " << e.what ());
2231  }
2232  }
2233 
2234  const double dampingFlops = (DampingFactor_ == STS::one ()) ? 0.0 : 1.0;
2235  const double numVectors = as<double> (X.getNumVectors ());
2236  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
2237  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
2238  double ApplyFlops = NumSweeps_ * numVectors *
2239  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
2240  if (direction == Tpetra::Symmetric)
2241  ApplyFlops *= 2.0;
2242  ApplyFlops_ += ApplyFlops;
2243 
2244 }
2245 
2246 template<class MatrixType>
2247 void
2248 Relaxation<MatrixType>::
2249 ApplyInverseMTSGS_CrsMatrix (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& B,
2250  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X) const
2251 {
2252  const Tpetra::ESweepDirection direction = Tpetra::Symmetric;
2253  this->MTGaussSeidel (B, X, direction);
2254 }
2255 
2256 
2257 template<class MatrixType>
2258 void Relaxation<MatrixType>::ApplyInverseMTGS_CrsMatrix (
2259  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& B,
2260  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X) const {
2261 
2262  const Tpetra::ESweepDirection direction =
2263  DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward;
2264  this->MTGaussSeidel (B, X, direction);
2265 }
2266 
2267 template<class MatrixType>
2268 void
2269 Relaxation<MatrixType>::
2270 ApplyInverseSGS (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
2271  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
2272 {
2273  using this_type = Relaxation<MatrixType>;
2274  // The CrsMatrix version is faster, because it can access the sparse
2275  // matrix data directly, rather than by copying out each row's data
2276  // in turn. Thus, we check whether the RowMatrix is really a
2277  // CrsMatrix.
2278  //
2279  // FIXME (mfh 07 Jul 2013) See note on crs_matrix_type typedef
2280  // declaration in Ifpack2_Relaxation_decl.hpp header file. The code
2281  // will still be correct if the cast fails, but it will use an
2282  // unoptimized kernel.
2283  const block_crs_matrix_type* blockCrsMat =
2284  dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr ());
2285  const crs_matrix_type* crsMat =
2286  dynamic_cast<const crs_matrix_type*> (A_.getRawPtr ());
2287  if (blockCrsMat != nullptr) {
2288  const_cast<this_type&> (*this).ApplyInverseSGS_BlockCrsMatrix(*blockCrsMat, X, Y);
2289  }
2290  else if (crsMat != nullptr) {
2291  ApplyInverseSGS_CrsMatrix (*crsMat, X, Y);
2292  }
2293  else {
2294  ApplyInverseSGS_RowMatrix (X, Y);
2295  }
2296 }
2297 
2298 
2299 template<class MatrixType>
2300 void
2301 Relaxation<MatrixType>::
2302 ApplyInverseSGS_RowMatrix (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
2303  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
2304 {
2305  using Teuchos::Array;
2306  using Teuchos::ArrayRCP;
2307  using Teuchos::ArrayView;
2308  using Teuchos::as;
2309  using Teuchos::RCP;
2310  using Teuchos::rcp;
2311  using Teuchos::rcpFromRef;
2312  using MV = Tpetra::MultiVector<scalar_type, local_ordinal_type,
2313  global_ordinal_type, node_type>;
2314 
2315  // Tpetra's GS implementation for CrsMatrix handles zeroing out the
2316  // starting multivector itself. The generic RowMatrix version here
2317  // does not, so we have to zero out Y here.
2318  if (ZeroStartingSolution_) {
2319  Y.putScalar (STS::zero ());
2320  }
2321 
2322  const size_t NumVectors = X.getNumVectors ();
2323  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
2324  Array<local_ordinal_type> Indices (maxLength);
2325  Array<scalar_type> Values (maxLength);
2326 
2327  // Local smoothing stuff
2328  const size_t numMyRows = A_->getNodeNumRows();
2329  const local_ordinal_type * rowInd = 0;
2330  size_t numActive = numMyRows;
2331  bool do_local = !localSmoothingIndices_.is_null();
2332  if(do_local) {
2333  rowInd = localSmoothingIndices_.getRawPtr();
2334  numActive = localSmoothingIndices_.size();
2335  }
2336 
2337 
2338  RCP<MV> Y2;
2339  if (IsParallel_) {
2340  if (Importer_.is_null ()) { // domain and column Maps are the same.
2341  updateCachedMultiVector (Y.getMap (), NumVectors);
2342  }
2343  else {
2344  updateCachedMultiVector (Importer_->getTargetMap (), NumVectors);
2345  }
2346  Y2 = cachedMV_;
2347  }
2348  else {
2349  Y2 = rcpFromRef (Y);
2350  }
2351 
2352  // Diagonal
2353  ArrayRCP<const scalar_type> d_rcp = Diagonal_->get1dView();
2354  ArrayView<const scalar_type> d_ptr = d_rcp();
2355 
2356  // Constant stride check
2357  bool constant_stride = X.isConstantStride() && Y2->isConstantStride();
2358 
2359  if(constant_stride) {
2360  // extract 1D RCPs
2361  size_t x_stride = X.getStride();
2362  size_t y2_stride = Y2->getStride();
2363  ArrayRCP<scalar_type> y2_rcp = Y2->get1dViewNonConst();
2364  ArrayRCP<const scalar_type> x_rcp = X.get1dView();
2365  ArrayView<scalar_type> y2_ptr = y2_rcp();
2366  ArrayView<const scalar_type> x_ptr = x_rcp();
2367  Array<scalar_type> dtemp(NumVectors,STS::zero());
2368 
2369  for (int j = 0; j < NumSweeps_; j++) {
2370  // data exchange is here, once per sweep
2371  if (IsParallel_) {
2372  if (Importer_.is_null ()) {
2373  // just copy, since domain and column Maps are the same
2374  Tpetra::deep_copy (*Y2, Y);
2375  } else {
2376  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
2377  }
2378  }
2379  for (size_t ii = 0; ii < numActive; ++ii) {
2380  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
2381  size_t NumEntries;
2382  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
2383  dtemp.assign(NumVectors,STS::zero());
2384 
2385  for (size_t k = 0; k < NumEntries; ++k) {
2386  const local_ordinal_type col = Indices[k];
2387  for (size_t m = 0; m < NumVectors; ++m) {
2388  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
2389  }
2390  }
2391 
2392  for (size_t m = 0; m < NumVectors; ++m) {
2393  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
2394  }
2395  }
2396 
2397  // ptrdiff_t is the same size as size_t, but is signed. Being
2398  // signed is important so that i >= 0 is not trivially true.
2399  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
2400  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
2401  size_t NumEntries;
2402  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
2403  dtemp.assign(NumVectors,STS::zero());
2404 
2405  for (size_t k = 0; k < NumEntries; ++k) {
2406  const local_ordinal_type col = Indices[k];
2407  for (size_t m = 0; m < NumVectors; ++m) {
2408  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
2409  }
2410  }
2411 
2412  for (size_t m = 0; m < NumVectors; ++m) {
2413  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
2414  }
2415  }
2416 
2417  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
2418  if (IsParallel_) {
2419  Tpetra::deep_copy (Y, *Y2);
2420  }
2421  }
2422  }
2423  else {
2424  // extract 2D RCPs
2425  ArrayRCP<ArrayRCP<scalar_type> > y2_ptr = Y2->get2dViewNonConst ();
2426  ArrayRCP<ArrayRCP<const scalar_type> > x_ptr = X.get2dView ();
2427 
2428  for (int iter = 0; iter < NumSweeps_; ++iter) {
2429  // only one data exchange per sweep
2430  if (IsParallel_) {
2431  if (Importer_.is_null ()) {
2432  // just copy, since domain and column Maps are the same
2433  Tpetra::deep_copy (*Y2, Y);
2434  } else {
2435  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
2436  }
2437  }
2438 
2439  for (size_t ii = 0; ii < numActive; ++ii) {
2440  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
2441  const scalar_type diag = d_ptr[i];
2442  size_t NumEntries;
2443  A_->getLocalRowCopy (as<local_ordinal_type> (i), Indices (), Values (), NumEntries);
2444 
2445  for (size_t m = 0; m < NumVectors; ++m) {
2446  scalar_type dtemp = STS::zero ();
2447  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
2448  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
2449 
2450  for (size_t k = 0; k < NumEntries; ++k) {
2451  const local_ordinal_type col = Indices[k];
2452  dtemp += Values[k] * y2_local[col];
2453  }
2454  y2_local[i] += DampingFactor_ * (x_local[i] - dtemp) * diag;
2455  }
2456  }
2457 
2458  // ptrdiff_t is the same size as size_t, but is signed. Being
2459  // signed is important so that i >= 0 is not trivially true.
2460  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
2461  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
2462  const scalar_type diag = d_ptr[i];
2463  size_t NumEntries;
2464  A_->getLocalRowCopy (as<local_ordinal_type> (i), Indices (), Values (), NumEntries);
2465 
2466  for (size_t m = 0; m < NumVectors; ++m) {
2467  scalar_type dtemp = STS::zero ();
2468  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
2469  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
2470 
2471  for (size_t k = 0; k < NumEntries; ++k) {
2472  const local_ordinal_type col = Indices[k];
2473  dtemp += Values[k] * y2_local[col];
2474  }
2475  y2_local[i] += DampingFactor_ * (x_local[i] - dtemp) * diag;
2476  }
2477  }
2478 
2479  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
2480  if (IsParallel_) {
2481  Tpetra::deep_copy (Y, *Y2);
2482  }
2483  }
2484  }
2485 
2486  // See flop count discussion in implementation of ApplyInverseSGS_CrsMatrix().
2487  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
2488  const double numVectors = as<double> (X.getNumVectors ());
2489  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
2490  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
2491  ApplyFlops_ += 2.0 * NumSweeps_ * numVectors *
2492  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
2493 }
2494 
2495 
2496 template<class MatrixType>
2497 void
2498 Relaxation<MatrixType>::
2499 ApplyInverseSGS_CrsMatrix (const crs_matrix_type& A,
2500  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
2501  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
2502 {
2503  using Teuchos::as;
2504  const Tpetra::ESweepDirection direction = Tpetra::Symmetric;
2505  if (localSmoothingIndices_.is_null ()) {
2506  A.gaussSeidelCopy (Y, X, *Diagonal_, DampingFactor_, direction,
2507  NumSweeps_, ZeroStartingSolution_);
2508  }
2509  else {
2510  A.reorderedGaussSeidelCopy (Y, X, *Diagonal_, localSmoothingIndices_ (),
2511  DampingFactor_, direction,
2512  NumSweeps_, ZeroStartingSolution_);
2513  }
2514 
2515  // For each column of output, for each sweep over the matrix:
2516  //
2517  // - One + and one * for each matrix entry
2518  // - One / and one + for each row of the matrix
2519  // - If the damping factor is not one: one * for each row of the
2520  // matrix. (It's not fair to count this if the damping factor is
2521  // one, since the implementation could skip it. Whether it does
2522  // or not is the implementation's choice.)
2523  //
2524  // Each sweep of symmetric Gauss-Seidel / SOR counts as two sweeps,
2525  // one forward and one backward.
2526 
2527  // Floating-point operations due to the damping factor, per matrix
2528  // row, per direction, per columm of output.
2529  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
2530  const double numVectors = as<double> (X.getNumVectors ());
2531  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
2532  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
2533  ApplyFlops_ += 2.0 * NumSweeps_ * numVectors *
2534  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
2535 }
2536 
2537 template<class MatrixType>
2538 void
2539 Relaxation<MatrixType>::
2540 ApplyInverseSGS_BlockCrsMatrix (const block_crs_matrix_type& A,
2541  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
2542  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y)
2543 {
2544  using Tpetra::INSERT;
2545  using Teuchos::RCP;
2546  using Teuchos::rcp;
2547  using Teuchos::rcpFromRef;
2548  using BMV = Tpetra::BlockMultiVector<scalar_type,
2549  local_ordinal_type, global_ordinal_type, node_type>;
2550  using MV = Tpetra::MultiVector<scalar_type,
2551  local_ordinal_type, global_ordinal_type, node_type>;
2552  using map_type = Tpetra::Map<local_ordinal_type,
2553  global_ordinal_type, node_type>;
2554  using import_type = Tpetra::Import<local_ordinal_type,
2555  global_ordinal_type, node_type>;
2556 
2557  //FIXME: (tcf) 8/21/2014 -- may be problematic for multiple right hand sides
2558  //
2559  // NOTE (mfh 12 Sep 2014) I don't think it should be a problem for
2560  // multiple right-hand sides, unless the input or output MultiVector
2561  // does not have constant stride. We should check for that case
2562  // here, in case it doesn't work in localGaussSeidel (which is
2563  // entirely possible).
2564  BMV yBlock (Y, * (A.getGraph ()->getDomainMap ()), A.getBlockSize ());
2565  const BMV xBlock (X, * (A.getColMap ()), A.getBlockSize ());
2566 
2567  bool performImport = false;
2568  RCP<BMV> yBlockCol;
2569  if (Importer_.is_null ()) {
2570  yBlockCol = Teuchos::rcpFromRef (yBlock);
2571  }
2572  else {
2573  if (yBlockColumnPointMap_.is_null () ||
2574  yBlockColumnPointMap_->getNumVectors () != yBlock.getNumVectors () ||
2575  yBlockColumnPointMap_->getBlockSize () != yBlock.getBlockSize ()) {
2576  yBlockColumnPointMap_ =
2577  rcp (new BMV (* (A.getColMap ()), A.getBlockSize (),
2578  static_cast<local_ordinal_type> (yBlock.getNumVectors ())));
2579  }
2580  yBlockCol = yBlockColumnPointMap_;
2581  if (pointImporter_.is_null()) {
2582  auto srcMap = rcp(new map_type(yBlock.getPointMap()));
2583  auto tgtMap = rcp(new map_type(yBlockCol->getPointMap()));
2584  pointImporter_ = rcp(new import_type(srcMap, tgtMap));
2585  }
2586  performImport = true;
2587  }
2588 
2589  MV yBlock_mv;
2590  MV yBlockCol_mv;
2591  RCP<const MV> yBlockColPointDomain;
2592  if (performImport) { // create views (shallow copies)
2593  yBlock_mv = yBlock.getMultiVectorView();
2594  yBlockCol_mv = yBlockCol->getMultiVectorView();
2595  yBlockColPointDomain =
2596  yBlockCol_mv.offsetView(A.getDomainMap(), 0);
2597  }
2598 
2599  if (ZeroStartingSolution_) {
2600  yBlockCol->putScalar(STS::zero ());
2601  }
2602  else if (performImport) {
2603  yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT);
2604  }
2605 
2606  // FIXME (mfh 12 Sep 2014) Shouldn't this come from the user's parameter?
2607  const Tpetra::ESweepDirection direction = Tpetra::Symmetric;
2608 
2609  for (int sweep = 0; sweep < NumSweeps_; ++sweep) {
2610  if (performImport && sweep > 0) {
2611  yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT);
2612  }
2613  A.localGaussSeidel(xBlock, *yBlockCol, blockDiag_,
2614  DampingFactor_, direction);
2615  if (performImport) {
2616  Tpetra::deep_copy(yBlock_mv, *yBlockColPointDomain);
2617  }
2618  }
2619 }
2620 
2621 template<class MatrixType>
2623 {
2624  std::ostringstream os;
2625 
2626  // Output is a valid YAML dictionary in flow style. If you don't
2627  // like everything on a single line, you should call describe()
2628  // instead.
2629  os << "\"Ifpack2::Relaxation\": {";
2630 
2631  os << "Initialized: " << (isInitialized () ? "true" : "false") << ", "
2632  << "Computed: " << (isComputed () ? "true" : "false") << ", ";
2633 
2634  // It's useful to print this instance's relaxation method (Jacobi,
2635  // Gauss-Seidel, or symmetric Gauss-Seidel). If you want more info
2636  // than that, call describe() instead.
2637  os << "Type: ";
2638  if (PrecType_ == Ifpack2::Details::JACOBI) {
2639  os << "Jacobi";
2640  } else if (PrecType_ == Ifpack2::Details::GS) {
2641  os << "Gauss-Seidel";
2642  } else if (PrecType_ == Ifpack2::Details::SGS) {
2643  os << "Symmetric Gauss-Seidel";
2644  } else if (PrecType_ == Ifpack2::Details::MTGS) {
2645  os << "MT Gauss-Seidel";
2646  } else if (PrecType_ == Ifpack2::Details::MTSGS) {
2647  os << "MT Symmetric Gauss-Seidel";
2648  } else if (PrecType_ == Ifpack2::Details::GS2) {
2649  os << "Two-stage Gauss-Seidel";
2650  } else if (PrecType_ == Ifpack2::Details::SGS2) {
2651  os << "Two-stage Symmetric Gauss-Seidel";
2652  }
2653  else {
2654  os << "INVALID";
2655  }
2656 
2657  os << ", " << "sweeps: " << NumSweeps_ << ", "
2658  << "damping factor: " << DampingFactor_ << ", ";
2659  if (DoL1Method_) {
2660  os << "use l1: " << DoL1Method_ << ", "
2661  << "l1 eta: " << L1Eta_ << ", ";
2662  }
2663 
2664  if (A_.is_null ()) {
2665  os << "Matrix: null";
2666  }
2667  else {
2668  os << "Global matrix dimensions: ["
2669  << A_->getGlobalNumRows () << ", " << A_->getGlobalNumCols () << "]"
2670  << ", Global nnz: " << A_->getGlobalNumEntries();
2671  }
2672 
2673  os << "}";
2674  return os.str ();
2675 }
2676 
2677 
2678 template<class MatrixType>
2679 void
2682  const Teuchos::EVerbosityLevel verbLevel) const
2683 {
2684  using Teuchos::OSTab;
2686  using Teuchos::VERB_DEFAULT;
2687  using Teuchos::VERB_NONE;
2688  using Teuchos::VERB_LOW;
2689  using Teuchos::VERB_MEDIUM;
2690  using Teuchos::VERB_HIGH;
2691  using Teuchos::VERB_EXTREME;
2692  using std::endl;
2693 
2694  const Teuchos::EVerbosityLevel vl =
2695  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
2696 
2697  const int myRank = this->getComm ()->getRank ();
2698 
2699  // none: print nothing
2700  // low: print O(1) info from Proc 0
2701  // medium:
2702  // high:
2703  // extreme:
2704 
2705  if (vl != VERB_NONE && myRank == 0) {
2706  // Describable interface asks each implementation to start with a tab.
2707  OSTab tab1 (out);
2708  // Output is valid YAML; hence the quotes, to protect the colons.
2709  out << "\"Ifpack2::Relaxation\":" << endl;
2710  OSTab tab2 (out);
2711  out << "MatrixType: \"" << TypeNameTraits<MatrixType>::name () << "\""
2712  << endl;
2713  if (this->getObjectLabel () != "") {
2714  out << "Label: " << this->getObjectLabel () << endl;
2715  }
2716  out << "Initialized: " << (isInitialized () ? "true" : "false") << endl
2717  << "Computed: " << (isComputed () ? "true" : "false") << endl
2718  << "Parameters: " << endl;
2719  {
2720  OSTab tab3 (out);
2721  out << "\"relaxation: type\": ";
2722  if (PrecType_ == Ifpack2::Details::JACOBI) {
2723  out << "Jacobi";
2724  } else if (PrecType_ == Ifpack2::Details::GS) {
2725  out << "Gauss-Seidel";
2726  } else if (PrecType_ == Ifpack2::Details::SGS) {
2727  out << "Symmetric Gauss-Seidel";
2728  } else if (PrecType_ == Ifpack2::Details::MTGS) {
2729  out << "MT Gauss-Seidel";
2730  } else if (PrecType_ == Ifpack2::Details::MTSGS) {
2731  out << "MT Symmetric Gauss-Seidel";
2732  } else if (PrecType_ == Ifpack2::Details::GS2) {
2733  out << "Two-stage Gauss-Seidel";
2734  } else if (PrecType_ == Ifpack2::Details::SGS2) {
2735  out << "Two-stage Symmetric Gauss-Seidel";
2736  } else {
2737  out << "INVALID";
2738  }
2739  // We quote these parameter names because they contain colons.
2740  // YAML uses the colon to distinguish key from value.
2741  out << endl
2742  << "\"relaxation: sweeps\": " << NumSweeps_ << endl
2743  << "\"relaxation: damping factor\": " << DampingFactor_ << endl
2744  << "\"relaxation: min diagonal value\": " << MinDiagonalValue_ << endl
2745  << "\"relaxation: zero starting solution\": " << ZeroStartingSolution_ << endl
2746  << "\"relaxation: backward mode\": " << DoBackwardGS_ << endl
2747  << "\"relaxation: use l1\": " << DoL1Method_ << endl
2748  << "\"relaxation: l1 eta\": " << L1Eta_ << endl;
2749  }
2750  out << "Computed quantities:" << endl;
2751  {
2752  OSTab tab3 (out);
2753  out << "Global number of rows: " << A_->getGlobalNumRows () << endl
2754  << "Global number of columns: " << A_->getGlobalNumCols () << endl;
2755  }
2756  if (checkDiagEntries_ && isComputed ()) {
2757  out << "Properties of input diagonal entries:" << endl;
2758  {
2759  OSTab tab3 (out);
2760  out << "Magnitude of minimum-magnitude diagonal entry: "
2761  << globalMinMagDiagEntryMag_ << endl
2762  << "Magnitude of maximum-magnitude diagonal entry: "
2763  << globalMaxMagDiagEntryMag_ << endl
2764  << "Number of diagonal entries with small magnitude: "
2765  << globalNumSmallDiagEntries_ << endl
2766  << "Number of zero diagonal entries: "
2767  << globalNumZeroDiagEntries_ << endl
2768  << "Number of diagonal entries with negative real part: "
2769  << globalNumNegDiagEntries_ << endl
2770  << "Abs 2-norm diff between computed and actual inverse "
2771  << "diagonal: " << globalDiagNormDiff_ << endl;
2772  }
2773  }
2774  if (isComputed ()) {
2775  out << "Saved diagonal offsets: "
2776  << (savedDiagOffsets_ ? "true" : "false") << endl;
2777  }
2778  out << "Call counts and total times (in seconds): " << endl;
2779  {
2780  OSTab tab3 (out);
2781  out << "initialize: " << endl;
2782  {
2783  OSTab tab4 (out);
2784  out << "Call count: " << NumInitialize_ << endl;
2785  out << "Total time: " << InitializeTime_ << endl;
2786  }
2787  out << "compute: " << endl;
2788  {
2789  OSTab tab4 (out);
2790  out << "Call count: " << NumCompute_ << endl;
2791  out << "Total time: " << ComputeTime_ << endl;
2792  }
2793  out << "apply: " << endl;
2794  {
2795  OSTab tab4 (out);
2796  out << "Call count: " << NumApply_ << endl;
2797  out << "Total time: " << ApplyTime_ << endl;
2798  }
2799  }
2800  }
2801 }
2802 
2803 } // namespace Ifpack2
2804 
2805 #define IFPACK2_RELAXATION_INSTANT(S,LO,GO,N) \
2806  template class Ifpack2::Relaxation< Tpetra::RowMatrix<S, LO, GO, N> >;
2807 
2808 #endif // IFPACK2_RELAXATION_DEF_HPP
bool hasTransposeApply() const
Whether apply() and applyMat() let you apply the transpose or conjugate transpose.
Definition: Ifpack2_Relaxation_def.hpp:440
double getComputeFlops() const
Total number of floating-point operations over all calls to compute().
Definition: Ifpack2_Relaxation_def.hpp:482
double getApplyFlops() const
Total number of floating-point operations over all calls to apply().
Definition: Ifpack2_Relaxation_def.hpp:488
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object&#39;s attributes to the given output stream.
Definition: Ifpack2_Relaxation_def.hpp:2681
bool is_null(const boost::shared_ptr< T > &p)
basic_OSTab< char > OSTab
static magnitudeType eps()
Relaxation(const Teuchos::RCP< const row_matrix_type > &A)
Constructor.
Definition: Ifpack2_Relaxation_def.hpp:215
T & get(const std::string &name, T def_value)
void compute()
Compute the preconditioner (&quot;numeric setup&quot;);.
Definition: Ifpack2_Relaxation_def.hpp:937
ParameterList & set(std::string const &name, T const &value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
static RCP< Time > getNewCounter(const std::string &name)
static RCP< Time > lookupCounter(const std::string &name)
virtual void printDoc(std::string const &docString, std::ostream &out) const =0
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
size_t getNodeSmootherComplexity() const
Get a rough estimate of cost per iteration.
Definition: Ifpack2_Relaxation_def.hpp:495
virtual const std::string getXMLTypeName() const =0
static std::ostream & printLines(std::ostream &os, const std::string &linePrefix, const std::string &lines)
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which the matrix and vectors are distributed.
Definition: Ifpack2_Relaxation_def.hpp:397
virtual void setMatrix(const Teuchos::RCP< const row_matrix_type > &A)
Change the matrix to be preconditioned.
Definition: Ifpack2_Relaxation_def.hpp:194
int getNumApply() const
Total number of calls to apply().
Definition: Ifpack2_Relaxation_def.hpp:458
T * getRawPtr() const
bool remove(std::string const &name, bool throwIfNotExists=true)
double getComputeTime() const
Total time in seconds spent in all calls to compute().
Definition: Ifpack2_Relaxation_def.hpp:470
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
MatrixType::node_type node_type
The Node type used by the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:262
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getDomainMap() const
Returns the Tpetra::Map object associated with the domain of this operator.
Definition: Ifpack2_Relaxation_def.hpp:417
void setParameters(const Teuchos::ParameterList &params)
Set the relaxation / preconditioner parameters.
Definition: Ifpack2_Relaxation_def.hpp:387
File for utility functions.
std::string description() const
A simple one-line description of this object.
Definition: Ifpack2_Relaxation_def.hpp:2622
int getNumInitialize() const
Total number of calls to initialize().
Definition: Ifpack2_Relaxation_def.hpp:446
void validateParametersAndSetDefaults(ParameterList const &validParamList, int const depth=1000)
virtual ValidStringsList validStringValues() const =0
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getRangeMap() const
Returns the Tpetra::Map object associated with the range of this operator.
Definition: Ifpack2_Relaxation_def.hpp:430
MatrixType::global_ordinal_type global_ordinal_type
The type of global indices in the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:259
double getInitializeTime() const
Total time in seconds spent in all calls to initialize().
Definition: Ifpack2_Relaxation_def.hpp:464
any & getAny(bool activeQry=true)
void applyMat(const Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &X, Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS) const
Apply the input matrix to X, returning the result in Y.
Definition: Ifpack2_Relaxation_def.hpp:616
Teuchos::RCP< const row_matrix_type > getMatrix() const
The matrix to be preconditioned.
Definition: Ifpack2_Relaxation_def.hpp:408
TypeTo as(const TypeFrom &t)
MatrixType::local_ordinal_type local_ordinal_type
The type of local indices in the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:256
bool isType(const std::string &name) const
MatrixType::scalar_type scalar_type
The type of the entries of the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:253
double totalElapsedTime(bool readCurrentTime=false) const
std::string typeName() const
const std::type_info & type() const
int getNumCompute() const
Total number of calls to compute().
Definition: Ifpack2_Relaxation_def.hpp:452
Relaxation preconditioners for Tpetra::RowMatrix and Tpetra::CrsMatrix sparse matrices.
Definition: Ifpack2_Relaxation_decl.hpp:236
double getApplyTime() const
Total time in seconds spent in all calls to apply().
Definition: Ifpack2_Relaxation_def.hpp:476
void getValidParameters(Teuchos::ParameterList &params)
Fills a list which contains all the parameters possibly used by Ifpack2.
Definition: Ifpack2_Parameters.cpp:51
T * get() const
void initialize()
Initialize the preconditioner (&quot;symbolic setup&quot;).
Definition: Ifpack2_Relaxation_def.hpp:637
Tpetra::RowMatrix< scalar_type, local_ordinal_type, global_ordinal_type, node_type > row_matrix_type
Tpetra::RowMatrix specialization used by this class.
Definition: Ifpack2_Relaxation_decl.hpp:269
void apply(const Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &X, Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, scalar_type alpha=Teuchos::ScalarTraits< scalar_type >::one(), scalar_type beta=Teuchos::ScalarTraits< scalar_type >::zero()) const
Apply the preconditioner to X, returning the result in Y.
Definition: Ifpack2_Relaxation_def.hpp:508
Teuchos::RCP< const Teuchos::ParameterList > getValidParameters() const
Return a list of all the parameters that this class accepts.
Definition: Ifpack2_Relaxation_def.hpp:227
Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type
The type of the magnitude (absolute value) of a matrix entry.
Definition: Ifpack2_Relaxation_decl.hpp:265
virtual void validate(ParameterEntry const &entry, std::string const &paramName, std::string const &sublistName) const =0
bool is_null() const