Tpetra_CrsMatrixMultiplyOp.hpp
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_CRSMATRIXMULTIPLYOP_HPP
#define TPETRA_CRSMATRIXMULTIPLYOP_HPP

#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_Util.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "Tpetra_LocalCrsMatrixOperator.hpp"

namespace Tpetra {
template <class Scalar,
          class MatScalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
class CrsMatrixMultiplyOp : public Operator<Scalar, LocalOrdinal, GlobalOrdinal, Node> {
 public:
  //! The specialization of CrsMatrix which this class wraps.
  using crs_matrix_type =
      CrsMatrix<MatScalar, LocalOrdinal, GlobalOrdinal, Node>;
  //! The specialization of Map which this class uses.
  using map_type = Map<LocalOrdinal, GlobalOrdinal, Node>;

 private:
  //! The type of the local sparse matrix on each MPI process.
  using local_matrix_device_type =
      typename crs_matrix_type::local_matrix_device_type;
  //! Shorthand for the MultiVector specialization this class uses.
  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
  //! Nonconst device view type returned by MV::getLocalViewDevice().
  using nonconst_view_type = typename MV::dual_view_type::t_dev;

 public:
  //! Constructor.
  CrsMatrixMultiplyOp(const Teuchos::RCP<const crs_matrix_type>& A)
      : matrix_(A)
      , localMultiply_(std::make_shared<local_matrix_device_type>(
            A->getLocalMatrixDevice())) {}

  //! Destructor (virtual for memory safety of derived classes).
  ~CrsMatrixMultiplyOp() override = default;

  //! Compute Y = beta*Y + alpha*Op(A)*X, where Op(A) is A, its transpose, or its conjugate transpose.
  void
  apply(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X,
        MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Y,
        Teuchos::ETransp mode = Teuchos::NO_TRANS,
        Scalar alpha = Teuchos::ScalarTraits<Scalar>::one(),
        Scalar beta = Teuchos::ScalarTraits<Scalar>::zero()) const override {
    TEUCHOS_TEST_FOR_EXCEPTION(!matrix_->isFillComplete(), std::runtime_error,
                               Teuchos::typeName(*this) << "::apply(): underlying matrix is not fill-complete.");
    TEUCHOS_TEST_FOR_EXCEPTION(X.getNumVectors() != Y.getNumVectors(), std::runtime_error,
                               Teuchos::typeName(*this) << "::apply(X,Y): X and Y must have the same number of vectors.");
    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::ScalarTraits<Scalar>::isComplex && mode == Teuchos::TRANS, std::logic_error,
                               Teuchos::typeName(*this) << "::apply() does not currently support transposed multiplications for complex scalar types.");
    if (mode == Teuchos::NO_TRANS) {
      applyNonTranspose(X, Y, alpha, beta);
    } else {
      applyTranspose(X, Y, mode, alpha, beta);
    }
  }

  //! Whether this Operator's apply() can apply the transpose or conjugate transpose.
  bool hasTransposeApply() const override {
    return true;
  }

  //! The domain Map of this Operator.
  Teuchos::RCP<const map_type> getDomainMap() const override {
    return matrix_->getDomainMap();
  }

  //! The range Map of this Operator.
  Teuchos::RCP<const map_type> getRangeMap() const override {
    return matrix_->getRangeMap();
  }

 protected:
  //! The underlying CrsMatrix object.
  const Teuchos::RCP<const crs_matrix_type> matrix_;

  //! Implementation of the local sparse matrix-vector multiply.
  LocalCrsMatrixOperator<Scalar, MatScalar, typename crs_matrix_type::device_type> localMultiply_;

  //! Column Map MultiVector used in apply(), cached to avoid reallocation.
  mutable Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > importMV_;

  //! Row Map MultiVector used in apply(), cached to avoid reallocation.
  mutable Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > exportMV_;

  //! Apply the transpose or conjugate transpose of the matrix to X_in, producing Y_in.
  void
  applyTranspose(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X_in,
                 MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Y_in,
                 Teuchos::ETransp mode,
                 Scalar alpha,
                 Scalar beta) const {
    using Teuchos::null;
    using Teuchos::RCP;
    using Teuchos::rcp;
    using export_type = Export<LocalOrdinal, GlobalOrdinal, Node>;
    using import_type = Import<LocalOrdinal, GlobalOrdinal, Node>;
    using STS = Teuchos::ScalarTraits<Scalar>;

    const size_t numVectors = X_in.getNumVectors();
    // Because of views, it is difficult to determine whether X and Y point
    // to the same data.  However, if they reference the exact same object,
    // we do the user the favor of copying X into new storage.  We only need
    // to do this if the Import is trivial; otherwise, we don't actually
    // apply the operator from X directly into Y.
    RCP<const import_type> importer = matrix_->getGraph()->getImporter();
    RCP<const export_type> exporter = matrix_->getGraph()->getExporter();

    // Some parameters for below.
    const bool Y_is_replicated = !Y_in.isDistributed();
    const bool Y_is_overwritten = (beta == STS::zero());
    const int myRank = matrix_->getComm()->getRank();
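    // Treat a locally replicated Y specially: each process does its local
    // multiply, and the reduce() at the end of this method sums the results.
    // Zeroing beta on every process except Proc 0 keeps the existing
    // contents of Y from being counted more than once in that sum.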
    if (Y_is_replicated && myRank != 0) {
      beta = STS::zero();
    }

    // access X indirectly, in case we need to create temporary storage
    RCP<const MV> X;
    // currently, cannot multiply from a multivector of non-constant stride
    if (!X_in.isConstantStride() && importer == null) {
      // generate a strided copy of X_in
      X = Teuchos::rcp(new MV(X_in, Teuchos::Copy));
    } else {
      // just temporary, so this non-owning RCP is okay
      X = Teuchos::rcpFromRef(X_in);
    }

    // set up import/export temporary multivectors
    if (importer != null) {
      if (importMV_ != null && importMV_->getNumVectors() != numVectors) {
        importMV_ = null;
      }
      if (importMV_ == null) {
        importMV_ = rcp(new MV(matrix_->getColMap(), numVectors));
      }
    }
    if (exporter != null) {
      if (exportMV_ != null && exportMV_->getNumVectors() != numVectors) {
        exportMV_ = null;
      }
      if (exportMV_ == null) {
        exportMV_ = rcp(new MV(matrix_->getRowMap(), numVectors));
      }
    }

    // If we have a non-trivial exporter, we must import elements that are
    // permuted or are on other processes.  (We are applying the transpose,
    // so the roles of the Import and Export reverse: the Export is used
    // here in reverse mode to bring in the remote entries of X.)
    if (exporter != null) {
      exportMV_->doImport(X_in, *exporter, INSERT);
      X = exportMV_;  // multiply out of exportMV_
    }

    auto X_lcl = X->getLocalViewDevice(Access::ReadOnly);

    // If we have a non-trivial importer, we must export elements that are
    // permuted or belong to other processes.
    // We will compute the solution into the to-be-exported MV; get a view.
    if (importer != null) {
      // Beta is zero here, so we clobber Y_lcl.
      auto Y_lcl = importMV_->getLocalViewDevice(Access::OverwriteAll);

      localMultiply_.apply(X_lcl, Y_lcl, mode, alpha, STS::zero());
      if (Y_is_overwritten) {
        Y_in.putScalar(STS::zero());
      } else {
        Y_in.scale(beta);
      }
      Y_in.doExport(*importMV_, *importer, ADD_ASSIGN);
    }
    // otherwise, multiply into Y
    else {
      // can't multiply in place; can't multiply into a non-strided multivector
      if (!Y_in.isConstantStride() || X.getRawPtr() == &Y_in) {
        // generate a strided copy of Y
        MV Y(Y_in, Teuchos::Copy);
        nonconst_view_type Y_lcl;
        if (Y_is_overwritten)
          Y_lcl = Y.getLocalViewDevice(Access::OverwriteAll);
        else
          Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);

        localMultiply_.apply(X_lcl, Y_lcl, mode, alpha, beta);
        Tpetra::deep_copy(Y_in, Y);
      } else {
        nonconst_view_type Y_lcl;
        if (Y_is_overwritten)
          Y_lcl = Y_in.getLocalViewDevice(Access::OverwriteAll);
        else
          Y_lcl = Y_in.getLocalViewDevice(Access::ReadWrite);

        localMultiply_.apply(X_lcl, Y_lcl, mode, alpha, beta);
      }
    }
    // Handle the case of the range Map being a locally replicated Map: sum
    // contributions from each process.
    if (Y_is_replicated) {
      Y_in.reduce();
    }
  }

  //! Apply the matrix (not its transpose) to X_in, producing Y_in.
  void
  applyNonTranspose(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X_in,
                    MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Y_in,
                    Scalar alpha,
                    Scalar beta) const {
    using Teuchos::NO_TRANS;
    using Teuchos::RCP;
    using Teuchos::rcp;
    using Teuchos::rcp_const_cast;
    using Teuchos::rcpFromRef;
    using ProfilingRegion = Details::ProfilingRegion;
    typedef Export<LocalOrdinal, GlobalOrdinal, Node> export_type;
    typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
    typedef Teuchos::ScalarTraits<Scalar> STS;

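    // Follow the usual sparse BLAS convention for alpha == 0: scale Y by
    // beta (or overwrite it with zeros) without ever reading the entries of
    // A or X, then return early.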
    if (alpha == STS::zero()) {
      if (beta == STS::zero()) {
        Y_in.putScalar(STS::zero());
      } else if (beta != STS::one()) {
        Y_in.scale(beta);
      }
      return;
    }

    // It's possible that X is a view of Y or vice versa.  We don't allow
    // this (apply() requires that X and Y not alias one another), but it's
    // helpful to detect and work around this case.  We don't try to detect
    // the more subtle cases (e.g., one is a subview of the other, but their
    // initial pointers differ).  We only need to do this if this matrix's
    // Import is trivial; otherwise, we don't actually apply the operator
    // from X into Y.

    RCP<const import_type> importer = matrix_->getGraph()->getImporter();
    RCP<const export_type> exporter = matrix_->getGraph()->getExporter();

    // If beta == 0, then the output MV will be overwritten; none of its
    // entries should be read.  (Sparse BLAS semantics say that we must
    // ignore any Inf or NaN entries in Y_in, if beta is zero.)  This
    // matters if we need to do an Export operation; see below.
    const bool Y_is_overwritten = (beta == STS::zero());

    // We treat the case of a replicated MV output specially.
    const bool Y_is_replicated =
        (!Y_in.isDistributed() && matrix_->getComm()->getSize() != 1);

    // This is part of the special case for replicated MV output.  We'll let
    // each process do its thing, but do an all-reduce at the end to sum up
    // the results.  Setting beta = 0 on all processes but Proc 0 makes the
    // math work out for the all-reduce.  (This assumes that the replicated
    // data is correctly replicated, so that the data are the same on all
    // processes.)
    if (Y_is_replicated && matrix_->getComm()->getRank() > 0) {
      beta = STS::zero();
    }

    // Temporary MV for the Import operation.  After the block of code
    // below, this will be an (Imported if necessary) column Map MV ready to
    // give to localMultiply_.apply(...).
    RCP<const MV> X_colMap;
    if (importer.is_null()) {
      if (!X_in.isConstantStride()) {
        // Not all sparse mat-vec kernels can handle an input MV with
        // nonconstant stride correctly, so we have to copy it in that case
        // into a constant stride MV.  To make a constant stride copy of
        // X_in, we force creation of the column (== domain) Map MV (if it
        // hasn't already been created, else fetch the cached copy).  This
        // avoids creating a new MV each time.
        RCP<MV> X_colMapNonConst = getColumnMapMultiVector(X_in, true);
        Tpetra::deep_copy(*X_colMapNonConst, X_in);
        X_colMap = rcp_const_cast<const MV>(X_colMapNonConst);
      } else {
        // The domain and column Maps are the same, so do the local multiply
        // using the domain Map input MV X_in.
        X_colMap = rcpFromRef(X_in);
      }
    } else {  // need to Import the source (multi)vector
      ProfilingRegion regionImport("Tpetra::CrsMatrixMultiplyOp::apply: Import");

      // We're doing an Import anyway, which will copy the relevant elements
      // of the domain Map MV X_in into a separate column Map MV.  Thus, we
      // don't have to worry whether X_in is constant stride.
      RCP<MV> X_colMapNonConst = getColumnMapMultiVector(X_in);

      // Import from the domain Map MV to the column Map MV.
      X_colMapNonConst->doImport(X_in, *importer, INSERT);
      X_colMap = rcp_const_cast<const MV>(X_colMapNonConst);
    }

    // Temporary MV for doExport (if needed), or for copying a nonconstant
    // stride output MV into a constant stride MV.  This is null if we don't
    // need the temporary MV, that is, if the Export is trivial (null).
    RCP<MV> Y_rowMap = getRowMapMultiVector(Y_in);

    auto X_lcl = X_colMap->getLocalViewDevice(Access::ReadOnly);

    // If we have a nontrivial Export object, we must perform an Export.  In
    // that case, the local multiply result will go into the row Map
    // multivector.  We don't have to make a constant-stride version of Y_in
    // in this case, because we had to make a constant stride Y_rowMap MV
    // and do an Export anyway.
    if (!exporter.is_null()) {
      auto Y_lcl = Y_rowMap->getLocalViewDevice(Access::OverwriteAll);

      localMultiply_.apply(X_lcl, Y_lcl, NO_TRANS, alpha, STS::zero());
      {
        ProfilingRegion regionExport("Tpetra::CrsMatrixMultiplyOp::apply: Export");

        // If we're overwriting the output MV Y_in completely (beta == 0),
        // then make sure that it is filled with zeros before we do the
        // Export.  Otherwise, the ADD combine mode will use data in Y_in,
        // which is supposed to be zero.
        if (Y_is_overwritten) {
          Y_in.putScalar(STS::zero());
        } else {
          // Scale the output MV by beta, so that the Export sums in the
          // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
          Y_in.scale(beta);
        }
        // Do the Export operation.
        Y_in.doExport(*Y_rowMap, *exporter, ADD_ASSIGN);
      }
    } else {  // Don't do an Export: row Map and range Map are the same.
      //
      // If Y_in does not have constant stride, or if the column Map MV
      // aliases Y_in, then we can't let the kernel write directly to Y_in.
      // Instead, we have to use the cached row (== range) Map MV as
      // temporary storage.
      //
      // FIXME (mfh 05 Jun 2014, mfh 07 Dec 2018) This test for aliasing
      // only tests if the user passed in the same MultiVector for both X
      // and Y.  It won't detect whether one MultiVector views the other.
      // We should also check the MultiVectors' raw data pointers.
      if (!Y_in.isConstantStride() || X_colMap.getRawPtr() == &Y_in) {
        // Force creating the MV if it hasn't been created already.  This
        // will reuse a previously created cached MV.
        Y_rowMap = getRowMapMultiVector(Y_in, true);

        // If beta == 0, we don't need to copy Y_in into Y_rowMap, since
        // we're overwriting it anyway.
        if (beta != STS::zero()) {
          Tpetra::deep_copy(*Y_rowMap, Y_in);
        }
        nonconst_view_type Y_lcl;
        if (Y_is_overwritten)
          Y_lcl = Y_rowMap->getLocalViewDevice(Access::OverwriteAll);
        else
          Y_lcl = Y_rowMap->getLocalViewDevice(Access::ReadWrite);

        localMultiply_.apply(X_lcl, Y_lcl, NO_TRANS, alpha, beta);
        Tpetra::deep_copy(Y_in, *Y_rowMap);
      } else {
        nonconst_view_type Y_lcl;
        if (Y_is_overwritten)
          Y_lcl = Y_in.getLocalViewDevice(Access::OverwriteAll);
        else
          Y_lcl = Y_in.getLocalViewDevice(Access::ReadWrite);

        localMultiply_.apply(X_lcl, Y_lcl, NO_TRANS, alpha, beta);
      }
    }

    // If the range Map is a locally replicated Map, sum up contributions
    // from each process.  We set beta = 0 on all processes but Proc 0
    // initially, so this will handle the scaling factor beta correctly.
    if (Y_is_replicated) {
      ProfilingRegion regionReduce("Tpetra::CrsMatrixMultiplyOp::apply: Reduce Y");
      Y_in.reduce();
    }
  }

 private:
  //! Return the cached column Map MultiVector used for the Import.  If the
  //! Import is trivial (null) and force is false, return null; otherwise,
  //! create (or reuse) a column Map MultiVector with the same number of
  //! columns as X_domainMap.
  Teuchos::RCP<MV>
  getColumnMapMultiVector(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X_domainMap,
                          const bool force = false) const {
    using Teuchos::null;
    using Teuchos::RCP;
    using Teuchos::rcp;
    typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;

    const size_t numVecs = X_domainMap.getNumVectors();
    RCP<const import_type> importer = matrix_->getGraph()->getImporter();
    RCP<const map_type> colMap = matrix_->getColMap();

    RCP<MV> X_colMap;  // null by default

    // If the Import object is trivial (null), then we don't need a separate
    // column Map multivector.  Just return null in that case.  The caller
    // is responsible for knowing not to use the returned null pointer.
    //
    // If the Import is nontrivial, then we do need a separate column Map
    // multivector for the Import operation.  Check in that case if we have
    // to (re)create the column Map multivector.
    if (!importer.is_null() || force) {
      if (importMV_.is_null() || importMV_->getNumVectors() != numVecs) {
        X_colMap = rcp(new MV(colMap, numVecs));

        // Cache the newly created multivector for later reuse.
        importMV_ = X_colMap;
      } else {  // Yay, we can reuse the cached multivector!
        X_colMap = importMV_;
        // mfh 09 Jan 2013: We don't have to fill with zeros first, because
        // the Import uses INSERT combine mode, which overwrites existing
        // entries.
        //
        // X_colMap->putScalar (STS::zero ());
      }
    }
    return X_colMap;
  }

  //! Return the cached row Map MultiVector used for the Export.  If the
  //! Export is trivial (null) and force is false, return null; otherwise,
  //! create (or reuse) a row Map MultiVector with the same number of
  //! columns as Y_rangeMap.
  Teuchos::RCP<MV>
  getRowMapMultiVector(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Y_rangeMap,
                       const bool force = false) const {
    using Teuchos::null;
    using Teuchos::RCP;
    using Teuchos::rcp;
    typedef Export<LocalOrdinal, GlobalOrdinal, Node> export_type;

    const size_t numVecs = Y_rangeMap.getNumVectors();
    RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
    RCP<const map_type> rowMap = matrix_->getRowMap();

    RCP<MV> Y_rowMap;  // null by default

    // If the Export object is trivial (null), then we don't need a separate
    // row Map multivector.  Just return null in that case.  The caller is
    // responsible for knowing not to use the returned null pointer.
    //
    // If the Export is nontrivial, then we do need a separate row Map
    // multivector for the Export operation.  Check in that case if we have
    // to (re)create the row Map multivector.
    if (!exporter.is_null() || force) {
      if (exportMV_.is_null() || exportMV_->getNumVectors() != numVecs) {
        Y_rowMap = rcp(new MV(rowMap, numVecs));
        exportMV_ = Y_rowMap;  // Cache the newly created MV for later reuse.
      } else {  // Yay, we can reuse the cached multivector!
        Y_rowMap = exportMV_;
      }
    }
    return Y_rowMap;
  }
};

//! Nonmember function to create a CrsMatrixMultiplyOp.
template <class OpScalar,
          class MatScalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
Teuchos::RCP<
    CrsMatrixMultiplyOp<OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node> >
createCrsMatrixMultiplyOp(const Teuchos::RCP<
                          const CrsMatrix<MatScalar, LocalOrdinal, GlobalOrdinal, Node> >& A) {
  typedef CrsMatrixMultiplyOp<OpScalar, MatScalar, LocalOrdinal,
                              GlobalOrdinal, Node>
      op_type;
  return Teuchos::rcp(new op_type(A));
}
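
// A minimal usage sketch (illustration only, not part of this header).  It
// assumes a fill-complete matrix held as a
// Teuchos::RCP<const Tpetra::CrsMatrix<double, LO, GO, NT>> named A, plus
// Tpetra::MultiVector<double, LO, GO, NT> objects X and Y built on A's
// domain and range Maps; LO, GO, and NT stand in for whatever local
// ordinal, global ordinal, and node types the application uses.
//
//   using op_type = Tpetra::CrsMatrixMultiplyOp<double, double, LO, GO, NT>;
//   Teuchos::RCP<op_type> op = Tpetra::createCrsMatrixMultiplyOp<double>(A);
//
//   // Y := 2*A*X + 3*Y
//   op->apply(X, Y, Teuchos::NO_TRANS, 2.0, 3.0);
//
// For Teuchos::TRANS or Teuchos::CONJ_TRANS, the roles of the Maps swap:
// X must then be distributed by A's range Map and Y by A's domain Map.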

}  // end of namespace Tpetra

#endif  // TPETRA_CRSMATRIXMULTIPLYOP_HPP