Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_CrsMatrixMultiplyOp.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_CRSMATRIXMULTIPLYOP_HPP
43 #define TPETRA_CRSMATRIXMULTIPLYOP_HPP
44 
49 
51 #include "Tpetra_CrsMatrix.hpp"
52 #include "Tpetra_Util.hpp"
55 #include "Tpetra_LocalCrsMatrixOperator.hpp"
56 
57 namespace Tpetra {
58 
95  template <class Scalar,
96  class MatScalar,
97  class LocalOrdinal,
98  class GlobalOrdinal,
99  class Node>
101  public Operator<Scalar, LocalOrdinal, GlobalOrdinal, Node>
102  {
103  public:
105  using crs_matrix_type =
109 
110  private:
111  using local_matrix_type =
113 
114  public:
116 
117 
122  CrsMatrixMultiplyOp (const Teuchos::RCP<const crs_matrix_type>& A) :
123  matrix_ (A),
124  localMultiply_ (std::make_shared<local_matrix_type> (A->getLocalMatrix ()))
125  {}
126 
128  ~CrsMatrixMultiplyOp () override = default;
129 
131 
133 
136  void
139  Teuchos::ETransp mode = Teuchos::NO_TRANS,
140  Scalar alpha = Teuchos::ScalarTraits<Scalar>::one (),
141  Scalar beta = Teuchos::ScalarTraits<Scalar>::zero ()) const override
142  {
143  TEUCHOS_TEST_FOR_EXCEPTION
144  (! matrix_->isFillComplete (), std::runtime_error,
145  Teuchos::typeName (*this) << "::apply(): underlying matrix is not fill-complete.");
146  TEUCHOS_TEST_FOR_EXCEPTION
147  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
148  Teuchos::typeName (*this) << "::apply(X,Y): X and Y must have the same number of vectors.");
149  TEUCHOS_TEST_FOR_EXCEPTION
150  (Teuchos::ScalarTraits<Scalar>::isComplex && mode == Teuchos::TRANS, std::logic_error,
151  Teuchos::typeName (*this) << "::apply() does not currently support transposed multiplications for complex scalar types.");
152  if (mode == Teuchos::NO_TRANS) {
153  applyNonTranspose (X, Y, alpha, beta);
154  }
155  else {
156  applyTranspose (X, Y, mode, alpha, beta);
157  }
158  }
159 
199  void
203  const Scalar& dampingFactor,
204  const ESweepDirection direction,
205  const int numSweeps) const
206  {
207  using Teuchos::null;
208  using Teuchos::RCP;
209  using Teuchos::rcp;
210  using Teuchos::rcpFromRef;
211  using Teuchos::rcp_const_cast;
212  typedef Scalar OS;
213  typedef Export<LocalOrdinal, GlobalOrdinal, Node> export_type;
214  typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
216 
217  TEUCHOS_TEST_FOR_EXCEPTION
218  (numSweeps < 0, std::invalid_argument,
219  "gaussSeidel: The number of sweeps must be nonnegative, "
220  "but you provided numSweeps = " << numSweeps << " < 0.");
221 
222  // Translate from global to local sweep direction.
223  // While doing this, validate the input.
224  ESweepDirection localDirection;
225  if (direction == Forward) {
226  localDirection = Forward;
227  }
228  else if (direction == Backward) {
229  localDirection = Backward;
230  }
231  else if (direction == Symmetric) {
232  // We'll control local sweep direction manually.
233  localDirection = Forward;
234  }
235  else {
236  TEUCHOS_TEST_FOR_EXCEPTION
237  (true, std::invalid_argument,
238  "gaussSeidel: The 'direction' enum does not have any of its valid "
239  "values: Forward, Backward, or Symmetric.");
240  }
241 
242  if (numSweeps == 0) {
243  return; // Nothing to do.
244  }
245 
246  // We don't need the Export object because this method assumes
247  // that the row, domain, and range Maps are the same. We do need
248  // the Import object, if there is one, though.
249  RCP<const import_type> importer = matrix_->getGraph()->getImporter();
250  RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
251  TEUCHOS_TEST_FOR_EXCEPTION
252  (! exporter.is_null (), std::runtime_error,
253  "Tpetra's gaussSeidel implementation requires that the row, domain, "
254  "and range Maps be the same. This cannot be the case, because the "
255  "matrix has a nontrivial Export object.");
256 
257  RCP<const map_type> domainMap = matrix_->getDomainMap ();
258  RCP<const map_type> rangeMap = matrix_->getRangeMap ();
259  RCP<const map_type> rowMap = matrix_->getGraph ()->getRowMap ();
260  RCP<const map_type> colMap = matrix_->getGraph ()->getColMap ();
261 
262  const bool debug = ::Tpetra::Details::Behavior::debug ();
263  if (debug) {
264  // The relation 'isSameAs' is transitive. It's also a
265  // collective, so we don't have to do a "shared" test for
266  // exception (i.e., a global reduction on the test value).
267  TEUCHOS_TEST_FOR_EXCEPTION
268  (! X.getMap ()->isSameAs (*domainMap), std::runtime_error,
269  "Tpetra::CrsMatrix::gaussSeidel requires that the input "
270  "multivector X be in the domain Map of the matrix.");
271  TEUCHOS_TEST_FOR_EXCEPTION
272  (! B.getMap ()->isSameAs (*rangeMap), std::runtime_error,
273  "Tpetra::CrsMatrix::gaussSeidel requires that the input "
274  "B be in the range Map of the matrix.");
275  TEUCHOS_TEST_FOR_EXCEPTION
276  (! D.getMap ()->isSameAs (*rowMap), std::runtime_error,
277  "Tpetra::CrsMatrix::gaussSeidel requires that the input "
278  "D be in the row Map of the matrix.");
279  TEUCHOS_TEST_FOR_EXCEPTION
280  (! rowMap->isSameAs (*rangeMap), std::runtime_error,
281  "Tpetra::CrsMatrix::gaussSeidel requires that the row Map and the "
282  "range Map be the same (in the sense of Tpetra::Map::isSameAs).");
283  TEUCHOS_TEST_FOR_EXCEPTION
284  (! domainMap->isSameAs (*rangeMap), std::runtime_error,
285  "Tpetra::CrsMatrix::gaussSeidel requires that the domain Map and "
286  "the range Map of the matrix be the same.");
287  }
288 
289  // If B is not constant stride, copy it into a constant stride
290  // multivector. We'l handle the right-hand side B first and deal
291  // with X right before the sweeps, to improve locality of the
292  // first sweep. (If the problem is small enough, then that will
293  // hopefully keep more of the entries of X in cache. This
294  // optimizes for the typical case of a small number of sweeps.)
295  RCP<const OSMV> B_in;
296  if (B.isConstantStride()) {
297  B_in = rcpFromRef (B);
298  }
299  else {
300  // The range Map and row Map are the same in this case, so we
301  // can use the (possibly cached) row Map multivector to store a
302  // constant stride copy of B. We don't have to copy back, since
303  // Gauss-Seidel won't modify B.
304  RCP<OSMV> B_in_nonconst = getRowMapMultiVector (B, true);
305  deep_copy (*B_in_nonconst, B);
306  B_in = rcp_const_cast<const OSMV> (B_in_nonconst);
307 
309  (! B.isConstantStride (), std::runtime_error,
310  "gaussSeidel: The current implementation of the Gauss-Seidel kernel "
311  "requires that X and B both have constant stride. Since B does not "
312  "have constant stride, we had to make a copy. This is a limitation of "
313  "the current implementation and not your fault, but we still report it "
314  "as an efficiency warning for your information.");
315  }
316 
317  // If X is not constant stride, copy it into a constant stride
318  // multivector. Also, make the column Map multivector X_colMap,
319  // and its domain Map view X_domainMap. (X actually must be a
320  // domain Map view of a column Map multivector; exploit this, if X
321  // has constant stride.)
322 
323  RCP<OSMV> X_domainMap;
324  RCP<OSMV> X_colMap;
325  bool copiedInput = false;
326 
327  if (importer.is_null ()) { // Domain and column Maps are the same.
328  if (X.isConstantStride ()) {
329  X_domainMap = rcpFromRef (X);
330  X_colMap = X_domainMap;
331  copiedInput = false;
332  }
333  else {
334  // Get a temporary column Map multivector, make a domain Map
335  // view of it, and copy X into the domain Map view. We have
336  // to copy here because we won't be doing Import operations.
337  X_colMap = getColumnMapMultiVector (X, true);
338  X_domainMap = X_colMap; // Domain and column Maps are the same.
339  deep_copy (*X_domainMap, X); // Copy X into the domain Map view.
340  copiedInput = true;
342  (! X.isConstantStride (), std::runtime_error,
343  "gaussSeidel: The current implementation of the Gauss-Seidel kernel "
344  "requires that X and B both have constant stride. Since X does not "
345  "have constant stride, we had to make a copy. This is a limitation of "
346  "the current implementation and not your fault, but we still report it "
347  "as an efficiency warning for your information.");
348  }
349  }
350  else { // We will be doing Import operations in the sweeps.
351  if (X.isConstantStride ()) {
352  X_domainMap = rcpFromRef (X);
353  // This kernel assumes that X is a domain Map view of a
354  // column Map multivector. We will only check if this is
355  // valid in debug mode.
356  X_colMap = X_domainMap->offsetViewNonConst (colMap, 0);
357 
358  // Do the first Import for the first sweep. This simplifies
359  // the logic in the sweeps.
360  X_colMap->doImport (X, *importer, INSERT);
361  copiedInput = false;
362  }
363  else {
364  // Get a temporary column Map multivector X_colMap, and make a
365  // domain Map view X_domainMap of it. Instead of copying, we
366  // do an Import from X into X_domainMap. This saves us a
367  // copy, since the Import has to copy the data anyway.
368  X_colMap = getColumnMapMultiVector (X, true);
369  X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0);
370  X_colMap->doImport (X, *importer, INSERT);
371  copiedInput = true;
373  (! X.isConstantStride (), std::runtime_error,
374  "gaussSeidel: The current implementation of the Gauss-Seidel kernel "
375  "requires that X and B both have constant stride. Since X does not "
376  "have constant stride, we had to make a copy. This is a limitation of "
377  "the current implementation and not your fault, but we still report it "
378  "as an efficiency warning for your information.");
379  }
380  }
381 
382  for (int sweep = 0; sweep < numSweeps; ++sweep) {
383  if (! importer.is_null () && sweep > 0) {
384  // We already did the first Import for the zeroth sweep.
385  X_colMap->doImport (*X_domainMap, *importer, INSERT);
386  }
387 
388  // Do local Gauss-Seidel.
389  if (direction != Symmetric) {
390  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
391  dampingFactor,
392  localDirection);
393  }
394  else { // direction == Symmetric
395  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
396  dampingFactor,
397  Forward);
398  // Communicate again before the Backward sweep.
399  if (! importer.is_null ()) {
400  X_colMap->doImport (*X_domainMap, *importer, INSERT);
401  }
402  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
403  dampingFactor,
404  Backward);
405  }
406  }
407 
408  if (copiedInput) {
409  deep_copy (X, *X_domainMap); // Copy back: X_domainMap -> X.
410  }
411  }
412 
437  void
441  const Scalar& dampingFactor,
442  const ESweepDirection direction,
443  const int numSweeps) const
444  {
445  using Teuchos::null;
446  using Teuchos::RCP;
447  using Teuchos::rcp;
448  using Teuchos::rcpFromRef;
449  using Teuchos::rcp_const_cast;
450  typedef Scalar OS;
451  typedef Export<LocalOrdinal, GlobalOrdinal, Node> export_type;
452  typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
454 
455  TEUCHOS_TEST_FOR_EXCEPTION
456  (numSweeps < 0, std::invalid_argument,
457  "gaussSeidelCopy: The number of sweeps must be nonnegative, "
458  "but you provided numSweeps = " << numSweeps << " < 0.");
459 
460  // Translate from global to local sweep direction.
461  // While doing this, validate the input.
462  ESweepDirection localDirection;
463  if (direction == Forward) {
464  localDirection = Forward;
465  }
466  else if (direction == Backward) {
467  localDirection = Backward;
468  }
469  else if (direction == Symmetric) {
470  // We'll control local sweep direction manually.
471  localDirection = Forward;
472  }
473  else {
474  TEUCHOS_TEST_FOR_EXCEPTION
475  (true, std::invalid_argument,
476  "gaussSeidelCopy: The 'direction' enum does not have any of its "
477  "valid values: Forward, Backward, or Symmetric.");
478  }
479 
480  if (numSweeps == 0) {
481  return;
482  }
483 
484  RCP<const import_type> importer = matrix_->getGraph()->getImporter();
485  RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
486  TEUCHOS_TEST_FOR_EXCEPTION
487  (! exporter.is_null (),
488  std::runtime_error,
489  "Tpetra's gaussSeidelCopy implementation requires that the row, domain, "
490  "and range Maps be the same. This cannot be the case, because the "
491  "matrix has a nontrivial Export object.");
492 
493  RCP<const map_type> domainMap = matrix_->getDomainMap ();
494  RCP<const map_type> rangeMap = matrix_->getRangeMap ();
495  RCP<const map_type> rowMap = matrix_->getGraph ()->getRowMap ();
496  RCP<const map_type> colMap = matrix_->getGraph ()->getColMap ();
497 
498  const bool debug = ::Tpetra::Details::Behavior::debug ();
499  if (debug) {
500  // The relation 'isSameAs' is transitive. It's also a
501  // collective, so we don't have to do a "shared" test for
502  // exception (i.e., a global reduction on the test value).
503  TEUCHOS_TEST_FOR_EXCEPTION
504  (! X.getMap ()->isSameAs (*domainMap), std::runtime_error,
505  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
506  "multivector X be in the domain Map of the matrix.");
507  TEUCHOS_TEST_FOR_EXCEPTION
508  (! B.getMap ()->isSameAs (*rangeMap), std::runtime_error,
509  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
510  "B be in the range Map of the matrix.");
511  TEUCHOS_TEST_FOR_EXCEPTION
512  (! D.getMap ()->isSameAs (*rowMap), std::runtime_error,
513  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
514  "D be in the row Map of the matrix.");
515  TEUCHOS_TEST_FOR_EXCEPTION
516  (! rowMap->isSameAs (*rangeMap), std::runtime_error,
517  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the row Map and the "
518  "range Map be the same (in the sense of Tpetra::Map::isSameAs).");
519  TEUCHOS_TEST_FOR_EXCEPTION
520  (! domainMap->isSameAs (*rangeMap), std::runtime_error,
521  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the domain Map and "
522  "the range Map of the matrix be the same.");
523  }
524 
525  // Fetch a (possibly cached) temporary column Map multivector
526  // X_colMap, and a domain Map view X_domainMap of it. Both have
527  // constant stride by construction. We know that the domain Map
528  // must include the column Map, because our Gauss-Seidel kernel
529  // requires that the row Map, domain Map, and range Map are all
530  // the same, and that each process owns all of its own diagonal
531  // entries of the matrix.
532 
533  RCP<OSMV> X_colMap;
534  RCP<OSMV> X_domainMap;
535  bool copyBackOutput = false;
536  if (importer.is_null ()) {
537  if (X.isConstantStride ()) {
538  X_colMap = rcpFromRef (X);
539  X_domainMap = rcpFromRef (X);
540  // No need to copy back to X at end.
541  }
542  else { // We must copy X into a constant stride multivector.
543  // Just use the cached column Map multivector for that.
544  X_colMap = getColumnMapMultiVector (X, true);
545  // X_domainMap is always a domain Map view of the column Map
546  // multivector. In this case, the domain and column Maps are
547  // the same, so X_domainMap _is_ X_colMap.
548  X_domainMap = X_colMap;
549  deep_copy (*X_domainMap, X); // Copy X into constant stride multivector
550  copyBackOutput = true; // Don't forget to copy back at end.
552  (! X.isConstantStride (), std::runtime_error,
553  "gaussSeidelCopy: The current implementation of the Gauss-Seidel "
554  "kernel requires that X and B both have constant stride. Since X "
555  "does not have constant stride, we had to make a copy. This is a "
556  "limitation of the current implementation and not your fault, but we "
557  "still report it as an efficiency warning for your information.");
558  }
559  }
560  else { // Column Map and domain Map are _not_ the same.
561  X_colMap = getColumnMapMultiVector (X);
562  X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0);
563 
564  // We could just copy X into X_domainMap. However, that wastes
565  // a copy, because the Import also does a copy (plus
566  // communication). Since the typical use case for Gauss-Seidel
567  // is a small number of sweeps (2 is typical), we don't want to
568  // waste that copy. Thus, we do the Import here, and skip the
569  // first Import in the first sweep. Importing directly from X
570  // effects the copy into X_domainMap (which is a view of
571  // X_colMap).
572  X_colMap->doImport (X, *importer, INSERT);
573 
574  copyBackOutput = true; // Don't forget to copy back at end.
575  }
576 
577  // The Gauss-Seidel / SOR kernel expects multivectors of constant
578  // stride. X_colMap is by construction, but B might not be. If
579  // it's not, we have to make a copy.
580  RCP<const OSMV> B_in;
581  if (B.isConstantStride ()) {
582  B_in = rcpFromRef (B);
583  }
584  else {
585  // Range Map and row Map are the same in this case, so we can
586  // use the cached row Map multivector to store a constant stride
587  // copy of B.
588  RCP<OSMV> B_in_nonconst = getRowMapMultiVector (B, true);
589  *B_in_nonconst = B;
590  B_in = rcp_const_cast<const OSMV> (B_in_nonconst);
591 
593  (! B.isConstantStride (), std::runtime_error,
594  "gaussSeidelCopy: The current implementation requires that B have "
595  "constant stride. Since B does not have constant stride, we had to "
596  "copy it into a separate constant-stride multivector. This is a "
597  "limitation of the current implementation and not your fault, but we "
598  "still report it as an efficiency warning for your information.");
599  }
600 
601  for (int sweep = 0; sweep < numSweeps; ++sweep) {
602  if (! importer.is_null () && sweep > 0) {
603  // We already did the first Import for the zeroth sweep above.
604  X_colMap->doImport (*X_domainMap, *importer, INSERT);
605  }
606 
607  // Do local Gauss-Seidel.
608  if (direction != Symmetric) {
609  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
610  dampingFactor,
611  localDirection);
612  }
613  else { // direction == Symmetric
614  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
615  dampingFactor,
616  Forward);
617  // Communicate again before the Backward sweep, if necessary.
618  if (! importer.is_null ()) {
619  X_colMap->doImport (*X_domainMap, *importer, INSERT);
620  }
621  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
622  dampingFactor,
623  Backward);
624  }
625  }
626 
627  if (copyBackOutput) {
628  deep_copy (X, *X_domainMap); // Copy result back into X.
629  }
630  }
631 
637  bool hasTransposeApply() const override {
638  return true;
639  }
640 
642  Teuchos::RCP<const map_type> getDomainMap () const override {
643  return matrix_->getDomainMap ();
644  }
645 
647  Teuchos::RCP<const map_type> getRangeMap () const override {
648  return matrix_->getRangeMap ();
649  }
650 
652 
653  protected:
655 
657  const Teuchos::RCP<const crs_matrix_type> matrix_;
658 
660  LocalCrsMatrixOperator<Scalar, MatScalar, typename
662 
675  mutable Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > importMV_;
676 
689  mutable Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > exportMV_;
690 
693  void
696  Teuchos::ETransp mode,
697  Scalar alpha,
698  Scalar beta) const
699  {
700  using Teuchos::null;
701  using Teuchos::RCP;
702  using Teuchos::rcp;
703  using export_type = Export<LocalOrdinal, GlobalOrdinal, Node>;
704  using import_type = Import<LocalOrdinal, GlobalOrdinal, Node>;
705  using STS = Teuchos::ScalarTraits<Scalar>;
706 
707  const size_t numVectors = X_in.getNumVectors();
708  // because of Views, it is difficult to determine if X and Y point to the same data.
709  // however, if they reference the exact same object, we will do the user the favor of copying X into new storage (with a warning)
710  // we ony need to do this if we have trivial importers; otherwise, we don't actually apply the operator from X into Y
711  RCP<const import_type> importer = matrix_->getGraph()->getImporter();
712  RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
713 
714  // some parameters for below
715  const bool Y_is_replicated = ! Y_in.isDistributed ();
716  const bool Y_is_overwritten = (beta == STS::zero ());
717  const int myRank = matrix_->getComm ()->getRank ();
718  if (Y_is_replicated && myRank != 0) {
719  beta = STS::zero ();
720  }
721 
722  // access X indirectly, in case we need to create temporary storage
723  RCP<const MV> X;
724  // currently, cannot multiply from multivector of non-constant stride
725  if (! X_in.isConstantStride () && importer == null) {
726  // generate a strided copy of X_in
727  X = Teuchos::rcp (new MV (X_in, Teuchos::Copy));
728  }
729  else {
730  // just temporary, so this non-owning RCP is okay
731  X = Teuchos::rcpFromRef (X_in);
732  }
733 
734  // set up import/export temporary multivectors
735  if (importer != null) {
736  if (importMV_ != null && importMV_->getNumVectors () != numVectors) {
737  importMV_ = null;
738  }
739  if (importMV_ == null) {
740  importMV_ = rcp (new MV (matrix_->getColMap (), numVectors));
741  }
742  }
743  if (exporter != null) {
744  if (exportMV_ != null && exportMV_->getNumVectors () != numVectors) {
745  exportMV_ = null;
746  }
747  if (exportMV_ == null) {
748  exportMV_ = rcp (new MV (matrix_->getRowMap (), numVectors));
749  }
750  }
751 
752  // If we have a non-trivial exporter, we must import elements that are permuted or are on other processors
753  if (exporter != null) {
754  exportMV_->doImport(X_in,*exporter,INSERT);
755  X = exportMV_; // multiply out of exportMV_
756  }
757 
758  auto X_lcl = X->getLocalViewDevice ();
759 
760  // If we have a non-trivial importer, we must export elements that are permuted or belong to other processors
761  // We will compute solution into the to-be-exported MV; get a view
762  if (importer != null) {
763  auto Y_lcl = importMV_->getLocalViewDevice ();
764  importMV_->modify_device ();
765 
766  localMultiply_.apply (X_lcl, Y_lcl, mode, alpha, STS::zero ());
767  if (Y_is_overwritten) {
768  Y_in.putScalar (STS::zero ());
769  }
770  else {
771  Y_in.scale (beta);
772  }
773  Y_in.doExport (*importMV_, *importer, ADD);
774  }
775  // otherwise, multiply into Y
776  else {
777  // can't multiply in-situ; can't multiply into non-strided multivector
778  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
779  // generate a strided copy of Y
780  MV Y (Y_in, Teuchos::Copy);
781  auto Y_lcl = Y.getLocalViewDevice ();
782  Y.modify_device ();
783 
784  localMultiply_.apply (X_lcl, Y_lcl, mode, alpha, beta);
785  Tpetra::deep_copy (Y_in, Y);
786  }
787  else {
788  auto Y_lcl = Y_in.getLocalViewDevice ();
789  Y_in.modify_device ();
790 
791  localMultiply_.apply (X_lcl, Y_lcl, mode, alpha, beta);
792  }
793  }
794  // Handle case of rangemap being a local replicated map: in this case, sum contributions from each processor
795  if (Y_is_replicated) {
796  Y_in.reduce();
797  }
798  }
799 
801  void
804  Scalar alpha,
805  Scalar beta) const
806  {
808  using Teuchos::NO_TRANS;
809  using Teuchos::RCP;
810  using Teuchos::rcp;
811  using Teuchos::rcp_const_cast;
812  using Teuchos::rcpFromRef;
813  typedef Export<LocalOrdinal,GlobalOrdinal,Node> export_type;
814  typedef Import<LocalOrdinal,GlobalOrdinal,Node> import_type;
815  typedef Teuchos::ScalarTraits<Scalar> STS;
816 
817  if (alpha == STS::zero ()) {
818  if (beta == STS::zero ()) {
819  Y_in.putScalar (STS::zero ());
820  } else if (beta != STS::one ()) {
821  Y_in.scale (beta);
822  }
823  return;
824  }
825 
826  // It's possible that X is a view of Y or vice versa. We don't
827  // allow this (apply() requires that X and Y not alias one
828  // another), but it's helpful to detect and work around this
829  // case. We don't try to to detect the more subtle cases (e.g.,
830  // one is a subview of the other, but their initial pointers
831  // differ). We only need to do this if this matrix's Import is
832  // trivial; otherwise, we don't actually apply the operator from
833  // X into Y.
834 
835  RCP<const import_type> importer = matrix_->getGraph()->getImporter();
836  RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
837 
838  // If beta == 0, then the output MV will be overwritten; none of
839  // its entries should be read. (Sparse BLAS semantics say that we
840  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
841  // This matters if we need to do an Export operation; see below.
842  const bool Y_is_overwritten = (beta == STS::zero());
843 
844  // We treat the case of a replicated MV output specially.
845  const bool Y_is_replicated =
846  (! Y_in.isDistributed () && matrix_->getComm ()->getSize () != 1);
847 
848  // This is part of the special case for replicated MV output.
849  // We'll let each process do its thing, but do an all-reduce at
850  // the end to sum up the results. Setting beta=0 on all
851  // processes but Proc 0 makes the math work out for the
852  // all-reduce. (This assumes that the replicated data is
853  // correctly replicated, so that the data are the same on all
854  // processes.)
855  if (Y_is_replicated && matrix_->getComm ()->getRank () > 0) {
856  beta = STS::zero();
857  }
858 
859  // Temporary MV for Import operation. After the block of code
860  // below, this will be an (Imported if necessary) column Map MV
861  // ready to give to localMultiply_.apply(...).
862  RCP<const MV> X_colMap;
863  if (importer.is_null ()) {
864  if (! X_in.isConstantStride ()) {
865  // Not all sparse mat-vec kernels can handle an input MV with
866  // nonconstant stride correctly, so we have to copy it in that
867  // case into a constant stride MV. To make a constant stride
868  // copy of X_in, we force creation of the column (== domain)
869  // Map MV (if it hasn't already been created, else fetch the
870  // cached copy). This avoids creating a new MV each time.
871  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
872  Tpetra::deep_copy (*X_colMapNonConst, X_in);
873  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
874  }
875  else {
876  // The domain and column Maps are the same, so do the local
877  // multiply using the domain Map input MV X_in.
878  X_colMap = rcpFromRef (X_in);
879  }
880  }
881  else { // need to Import source (multi)vector
882  ProfilingRegion regionImport ("Tpetra::CrsMatrixMultiplyOp::apply: Import");
883 
884  // We're doing an Import anyway, which will copy the relevant
885  // elements of the domain Map MV X_in into a separate column Map
886  // MV. Thus, we don't have to worry whether X_in is constant
887  // stride.
888  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
889 
890  // Import from the domain Map MV to the column Map MV.
891  X_colMapNonConst->doImport (X_in, *importer, INSERT);
892  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
893  }
894 
895  // Temporary MV for doExport (if needed), or for copying a
896  // nonconstant stride output MV into a constant stride MV. This
897  // is null if we don't need the temporary MV, that is, if the
898  // Export is trivial (null).
899  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
900 
901  auto X_lcl = X_colMap->getLocalViewDevice ();
902 
903  // If we have a nontrivial Export object, we must perform an
904  // Export. In that case, the local multiply result will go into
905  // the row Map multivector. We don't have to make a
906  // constant-stride version of Y_in in this case, because we had to
907  // make a constant stride Y_rowMap MV and do an Export anyway.
908  if (! exporter.is_null ()) {
909  auto Y_lcl = Y_rowMap->getLocalViewDevice ();
910  Y_rowMap->modify_device ();
911  localMultiply_.apply (X_lcl, Y_lcl, NO_TRANS,
912  alpha, STS::zero ());
913  {
914  ProfilingRegion regionExport ("Tpetra::CrsMatrixMultiplyOp::apply: Export");
915 
916  // If we're overwriting the output MV Y_in completely (beta
917  // == 0), then make sure that it is filled with zeros before
918  // we do the Export. Otherwise, the ADD combine mode will
919  // use data in Y_in, which is supposed to be zero.
920  if (Y_is_overwritten) {
921  Y_in.putScalar (STS::zero());
922  }
923  else {
924  // Scale the output MV by beta, so that the Export sums in
925  // the mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
926  Y_in.scale (beta);
927  }
928  // Do the Export operation.
929  Y_in.doExport (*Y_rowMap, *exporter, ADD);
930  }
931  }
932  else { // Don't do an Export: row Map and range Map are the same.
933  //
934  // If Y_in does not have constant stride, or if the column Map
935  // MV aliases Y_in, then we can't let the kernel write directly
936  // to Y_in. Instead, we have to use the cached row (== range)
937  // Map MV as temporary storage.
938  //
939  // FIXME (mfh 05 Jun 2014, mfh 07 Dec 2018) This test for
940  // aliasing only tests if the user passed in the same
941  // MultiVector for both X and Y. It won't detect whether one
942  // MultiVector views the other. We should also check the
943  // MultiVectors' raw data pointers.
944  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
945  // Force creating the MV if it hasn't been created already.
946  // This will reuse a previously created cached MV.
947  Y_rowMap = getRowMapMultiVector (Y_in, true);
948 
949  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
950  // since we're overwriting it anyway.
951  if (beta != STS::zero ()) {
952  Tpetra::deep_copy (*Y_rowMap, Y_in);
953  }
954 
955  auto Y_lcl = Y_rowMap->getLocalViewDevice ();
956  Y_rowMap->modify_device ();
957  localMultiply_.apply (X_lcl, Y_lcl, NO_TRANS, alpha, beta);
958  Tpetra::deep_copy (Y_in, *Y_rowMap);
959  }
960  else {
961  auto Y_lcl = Y_in.getLocalViewDevice ();
962  Y_in.modify_device ();
963  localMultiply_.apply (X_lcl, Y_lcl, NO_TRANS, alpha, beta);
964  }
965  }
966 
967  // If the range Map is a locally replicated Map, sum up
968  // contributions from each process. We set beta = 0 on all
969  // processes but Proc 0 initially, so this will handle the scaling
970  // factor beta correctly.
971  if (Y_is_replicated) {
972  ProfilingRegion regionReduce ("Tpetra::CrsMatrixMultiplyOp::apply: Reduce Y");
973  Y_in.reduce ();
974  }
975  }
976 
977  private:
997  Teuchos::RCP<MV>
998  getColumnMapMultiVector (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& X_domainMap,
999  const bool force = false) const
1000  {
1001  using Teuchos::null;
1002  using Teuchos::RCP;
1003  using Teuchos::rcp;
1004  typedef Import<LocalOrdinal,GlobalOrdinal,Node> import_type;
1005 
1006  const size_t numVecs = X_domainMap.getNumVectors ();
1007  RCP<const import_type> importer = matrix_->getGraph ()->getImporter ();
1008  RCP<const map_type> colMap = matrix_->getColMap ();
1009 
1010  RCP<MV> X_colMap; // null by default
1011 
1012  // If the Import object is trivial (null), then we don't need a
1013  // separate column Map multivector. Just return null in that
1014  // case. The caller is responsible for knowing not to use the
1015  // returned null pointer.
1016  //
1017  // If the Import is nontrivial, then we do need a separate
1018  // column Map multivector for the Import operation. Check in
1019  // that case if we have to (re)create the column Map
1020  // multivector.
1021  if (! importer.is_null () || force) {
1022  if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
1023  X_colMap = rcp (new MV (colMap, numVecs));
1024 
1025  // Cache the newly created multivector for later reuse.
1026  importMV_ = X_colMap;
1027  }
1028  else { // Yay, we can reuse the cached multivector!
1029  X_colMap = importMV_;
1030  // mfh 09 Jan 2013: We don't have to fill with zeros first,
1031  // because the Import uses INSERT combine mode, which overwrites
1032  // existing entries.
1033  //
1034  //X_colMap->putScalar (STS::zero ());
1035  }
1036  }
1037  return X_colMap;
1038  }
1039 
1061  Teuchos::RCP<MV>
1062  getRowMapMultiVector (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Y_rangeMap,
1063  const bool force = false) const
1064  {
1065  using Teuchos::null;
1066  using Teuchos::RCP;
1067  using Teuchos::rcp;
1068  typedef Export<LocalOrdinal,GlobalOrdinal,Node> export_type;
1069 
1070  const size_t numVecs = Y_rangeMap.getNumVectors ();
1071  RCP<const export_type> exporter = matrix_->getGraph ()->getExporter ();
1072  RCP<const map_type> rowMap = matrix_->getRowMap ();
1073 
1074  RCP<MV> Y_rowMap; // null by default
1075 
1076  // If the Export object is trivial (null), then we don't need a
1077  // separate row Map multivector. Just return null in that case.
1078  // The caller is responsible for knowing not to use the returned
1079  // null pointer.
1080  //
1081  // If the Export is nontrivial, then we do need a separate row
1082  // Map multivector for the Export operation. Check in that case
1083  // if we have to (re)create the row Map multivector.
1084  if (! exporter.is_null () || force) {
1085  if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
1086  Y_rowMap = rcp (new MV (rowMap, numVecs));
1087  exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse
1088  }
1089  else { // Yay, we can reuse the cached multivector!
1090  Y_rowMap = exportMV_;
1091  }
1092  }
1093  return Y_rowMap;
1094  }
1095  };
1096 
1104  template <class OpScalar,
1105  class MatScalar,
1106  class LocalOrdinal,
1107  class GlobalOrdinal,
1108  class Node>
1109  Teuchos::RCP<
1110  CrsMatrixMultiplyOp<OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node> >
1111  createCrsMatrixMultiplyOp (const Teuchos::RCP<
1113  {
1114  typedef CrsMatrixMultiplyOp<OpScalar, MatScalar, LocalOrdinal,
1115  GlobalOrdinal, Node> op_type;
1116  return Teuchos::rcp (new op_type (A));
1117  }
1118 
1119 } // end of namespace Tpetra
1120 
1121 #endif // TPETRA_CRSMATRIXMULTIPLYOP_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Abstract interface for local operators (e.g., matrices and preconditioners).
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this Operator.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
size_t getNumVectors() const
Number of columns in the multivector.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > exportMV_
Row Map MultiVector used in apply().
bool isConstantStride() const
Whether this multivector has constant stride between columns.
One or more distributed dense vectors.
bool isDistributed() const
Whether this is a globally distributed object.
static bool debug()
Whether Tpetra is in debug mode.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this Operator.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute Y = beta*Y + alpha*Op(A)*X, where Op(A) is either A, , or .
bool hasTransposeApply() const override
Whether this Operator&#39;s apply() method can apply the transpose or conjugate transpose.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > importMV_
Column Map MultiVector used in apply().
~CrsMatrixMultiplyOp() override=default
Destructor (virtual for memory safety of derived classes).
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CrsMatrixMultiplyOp(const Teuchos::RCP< const crs_matrix_type > &A)
Constructor.
LocalCrsMatrixOperator< Scalar, MatScalar, typename crs_matrix_type::device_type > localMultiply_
Implementation of local sparse matrix-vector multiply.
Insert new values that don&#39;t currently exist.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Abstract interface for operators (e.g., matrices and preconditioners).
const Teuchos::RCP< const crs_matrix_type > matrix_
The underlying CrsMatrix object.
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
void applyTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Apply the transpose or conjugate transpose of the matrix to X_in, producing Y_in. ...
Sum new values into existing values.
void modify_device()
Mark data as modified on the device side.
typename Node::device_type device_type
The Kokkos device type.
#define TPETRA_EFFICIENCY_WARNING(throw_exception_test, Exception, msg)
Print or throw an efficency warning.
Forward declaration of Tpetra::CrsMatrixMultiplyOp.
void applyNonTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Scalar alpha, Scalar beta) const
Apply the matrix (not its transpose) to X_in, producing Y_in.
A parallel distribution of indices over processes.
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object (&quot;forward mode&quot;).
A class for wrapping a CrsMatrix multiply in a Operator.
Stand-alone utility functions and macros.
void reduce()
Sum values of a locally replicated multivector across all processes.
Teuchos::RCP< CrsMatrixMultiplyOp< OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node > > createCrsMatrixMultiplyOp(const Teuchos::RCP< const CrsMatrix< MatScalar, LocalOrdinal, GlobalOrdinal, Node > > &A)
Non-member function to create a CrsMatrixMultiplyOp.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
dual_view_type::t_dev getLocalViewDevice() const
A local Kokkos::View of device memory.
void gaussSeidel(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps) const
&quot;Hybrid&quot; Jacobi + (Gauss-Seidel or SOR) on .
void gaussSeidelCopy(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps) const
Version of gaussSeidel(), with fewer requirements on X.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.