Tpetra_Details_residual.hpp
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_RESIDUAL_HPP
#define TPETRA_DETAILS_RESIDUAL_HPP

#include "TpetraCore_config.h"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_LocalCrsMatrixOperator.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Teuchos_RCP.hpp"
#include "Teuchos_ScalarTraits.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "KokkosSparse_spmv_impl.hpp"

namespace Tpetra {
namespace Details {

/// \brief Functor for computing the residual R = B - A*X.
template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
struct LocalResidualFunctor {

  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  ConstMV B_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;
  ConstMV X_domainmap_lcl;

  LocalResidualFunctor (const AMatrix& A_lcl_,
                        const ConstMV& X_colmap_lcl_,
                        const ConstMV& B_lcl_,
                        const MV& R_lcl_,
                        const int rows_per_team_,
                        const Offsets& offsets_,
                        const ConstMV& X_domainmap_lcl_) :
    A_lcl(A_lcl_),
    X_colmap_lcl(X_colmap_lcl_),
    B_lcl(B_lcl_),
    R_lcl(R_lcl_),
    rows_per_team(rows_per_team_),
    offsets(offsets_),
    X_domainmap_lcl(X_domainmap_lcl_)
  { }

  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team), [&] (const LO& loop) {
      const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows ()) {
        return;
      }

      if (!is_MV) { // Vectors have only a single column

        value_type A_x = ATV::zero ();

        if (!restrictedMode) {
          const auto A_row = A_lcl.rowConst(lclRow);
          const LO row_length = static_cast<LO> (A_row.length);

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
            const auto A_val = A_row.value(iEntry);
            lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0);
          }, A_x);
        }
        else {
          // Restricted mode: entries before offsets(lclRow) reference
          // on-rank columns and read from the domain Map view; the
          // remaining (off-rank) entries read from the column Map view,
          // unless they are deferred via skipOffRank.
          const LO offRankOffset = offsets(lclRow);
          const size_t start = A_lcl.graph.row_map(lclRow);
          const size_t end = A_lcl.graph.row_map(lclRow+1);

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
            const auto A_val = A_lcl.values(iEntry);
            const auto lclCol = A_lcl.graph.entries(iEntry);
            if (iEntry < offRankOffset)
              lsum += A_val * X_domainmap_lcl(lclCol,0);
            else if (!skipOffRank)
              lsum += A_val * X_colmap_lcl(lclCol,0);
          }, A_x);
        }

        Kokkos::single(Kokkos::PerThread(dev), [&] () {
          R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
        });
      }
      else { // MultiVectors have more than one column

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for (LO v = 0; v < numVectors; v++) {

          value_type A_x = ATV::zero ();

          if (!restrictedMode) {
            const auto A_row = A_lcl.rowConst(lclRow);
            const LO row_length = static_cast<LO> (A_row.length);

            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
              const auto A_val = A_row.value(iEntry);
              lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v);
            }, A_x);
          }
          else {
            const LO offRankOffset = offsets(lclRow);
            const size_t start = A_lcl.graph.row_map(lclRow);
            const size_t end = A_lcl.graph.row_map(lclRow+1);

            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
              const auto A_val = A_lcl.values(iEntry);
              const auto lclCol = A_lcl.graph.entries(iEntry);
              if (iEntry < offRankOffset)
                lsum += A_val * X_domainmap_lcl(lclCol,v);
              else if (!skipOffRank)
                lsum += A_val * X_colmap_lcl(lclCol,v);
            }, A_x);
          }

          Kokkos::single(Kokkos::PerThread(dev), [&] () {
            R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
          });
        } // end for numVectors
      }
    }); // end parallel_for TeamThreadRange
  }
};
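/* A minimal, self-contained sketch (not part of Tpetra) of the hierarchical
   pattern used above: one team per block of rows, TeamThreadRange over the
   rows of the block, and a ThreadVectorRange reduction over the entries of
   each row. All names here (crs_residual, row_map, entries, values) are
   hypothetical.

     #include <Kokkos_Core.hpp>

     using exec_t   = Kokkos::DefaultExecutionSpace;
     using policy_t = Kokkos::TeamPolicy<exec_t>;
     using member_t = policy_t::member_type;

     // r = b - A*x for a CRS matrix with nrows rows.
     void crs_residual (int nrows, int rows_per_team,
                        Kokkos::View<const int*> row_map,
                        Kokkos::View<const int*> entries,
                        Kokkos::View<const double*> values,
                        Kokkos::View<const double*> x,
                        Kokkos::View<const double*> b,
                        Kokkos::View<double*> r)
     {
       const int worksets = (nrows + rows_per_team - 1) / rows_per_team;
       Kokkos::parallel_for ("crs-residual", policy_t (worksets, Kokkos::AUTO),
         KOKKOS_LAMBDA (const member_t& dev) {
           Kokkos::parallel_for (Kokkos::TeamThreadRange (dev, 0, rows_per_team),
             [&] (const int loop) {
               const int row = dev.league_rank () * rows_per_team + loop;
               if (row >= nrows) return;
               double sum = 0.0;  // dot product of row `row` of A with x
               Kokkos::parallel_reduce
                 (Kokkos::ThreadVectorRange (dev, row_map(row), row_map(row+1)),
                  [&] (const int k, double& lsum) {
                    lsum += values(k) * x(entries(k));
                  }, sum);
               Kokkos::single (Kokkos::PerThread (dev),
                 [&] () { r(row) = b(row) - sum; });
             });
         });
     }
*/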

/// \brief Functor for computing R -= A_offRank*X_colmap.
template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
struct OffRankUpdateFunctor {

  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;

  OffRankUpdateFunctor (const AMatrix& A_lcl_,
                        const ConstMV& X_colmap_lcl_,
                        const MV& R_lcl_,
                        const int rows_per_team_,
                        const Offsets& offsets_) :
    A_lcl(A_lcl_),
    X_colmap_lcl(X_colmap_lcl_),
    R_lcl(R_lcl_),
    rows_per_team(rows_per_team_),
    offsets(offsets_)
  { }

  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team), [&] (const LO& loop) {
      const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows ()) {
        return;
      }

      const LO offRankOffset = offsets(lclRow);
      const size_t end = A_lcl.graph.row_map(lclRow+1);

      if (!is_MV) { // Vectors have only a single column

        value_type A_x = ATV::zero ();

        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
          const auto A_val = A_lcl.values(iEntry);
          const auto lclCol = A_lcl.graph.entries(iEntry);
          lsum += A_val * X_colmap_lcl(lclCol,0);
        }, A_x);

        Kokkos::single(Kokkos::PerThread(dev), [&] () {
          R_lcl(lclRow,0) -= A_x;
        });
      }
      else { // MultiVectors have more than one column

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for (LO v = 0; v < numVectors; v++) {

          value_type A_x = ATV::zero ();

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
            const auto A_val = A_lcl.values(iEntry);
            const auto lclCol = A_lcl.graph.entries(iEntry);
            lsum += A_val * X_colmap_lcl(lclCol,v);
          }, A_x);

          Kokkos::single(Kokkos::PerThread(dev), [&] () {
            R_lcl(lclRow,v) -= A_x;
          });
        } // end for numVectors
      }
    });
  }
};
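/* Worked example for `offsets` (illustrative numbers): suppose local row 3
   of A occupies entry positions [row_map(3), row_map(4)) = [10, 15), where
   entries 10..12 reference on-rank (domain Map) columns and entries 13..14
   reference off-rank (ghost) columns. Then offsets(3) == 13, so
   LocalResidualFunctor with restrictedMode=true and skipOffRank=true
   accumulates only entries 10..12 into R(3,:), and OffRankUpdateFunctor
   later subtracts the contribution of entries 13..14 once the Import of
   the ghost values of X has finished. */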

template<class SC, class LO, class GO, class NO>
void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
                   const MultiVector<SC,LO,GO,NO> & X_colmap,
                   const MultiVector<SC,LO,GO,NO> & B,
                   MultiVector<SC,LO,GO,NO> & R,
                   const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                   const MultiVector<SC,LO,GO,NO> * X_domainmap=nullptr) {
  using Teuchos::NO_TRANS;
  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual");

  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl;

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
       "R.getNumVectors() = " << R.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getLocalLength () !=
       A.getColMap ()->getLocalNumElements (), std::runtime_error,
       "X has the wrong number of local rows. "
       "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
       "A.getColMap()->getLocalNumElements() = " <<
       A.getColMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (R.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "R has the wrong number of local rows. "
       "R.getLocalLength() = " << R.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (B.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "B has the wrong number of local rows. "
       "B.getLocalLength() = " << B.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
       "fill complete. You must call fillComplete() (possibly with "
       "domain and range Map arguments) without an intervening "
       "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION
      (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
       std::runtime_error, "X, R and B must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION
      ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
       (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
       std::runtime_error, "X, B and R may not alias one another.");
  }

  const bool fusedResidual = ::Tpetra::Details::Behavior::fusedResidual ();
  if (!fusedResidual) {
    SC one = Teuchos::ScalarTraits<SC>::one();
    SC negone = -one;
    SC zero = Teuchos::ScalarTraits<SC>::zero();
    // This is currently a "reference implementation," waiting until
    // Kokkos Kernels provides a residual kernel.
    A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero);
    R.update(one,B,negone);
    return;
  }
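  /* The two-kernel fallback above is algebraically the same as the fused
     kernel: MultiVector::update(alpha, A, beta) computes
     this = beta*this + alpha*A, so
       localApply:  R = one*(A*X) + zero*R  ==>  R = A*X
       update:      R = negone*R + one*B    ==>  R = B - A*X,
     matching the fused functor's R(i,v) = B(i,v) - sum_j A(i,j)*X(j,v). */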

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows ();
  int64_t myNnz = A_lcl.nnz();

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;

  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

  if (is_vector) {
    if (X_domainmap == nullptr) {
      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,false,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector",policy,func);
    }
    else {
      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector",policy,func);
    }
  }
  else {
    if (X_domainmap == nullptr) {
      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,false,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector",policy,func);
    }
    else {
      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector",policy,func);
    }
  }
}
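/* Summary of the template-flag dispatch above, <is_MV, restrictedMode, skipOffRank>:
     <false,false,false>  single vector; all entries read X_colmap
     <false,true ,false>  single vector, restricted mode; on-rank entries read
                          X_domainmap, off-rank entries read X_colmap
     <true ,false,false>  multivector; all entries read X_colmap
     <true ,true ,false>  multivector, restricted mode
   localResidualWithCommCompOverlap() below additionally instantiates
   skipOffRank=true, so the off-rank columns can be applied after the
   Import completes. */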


template<class SC, class LO, class GO, class NO>
void localResidualWithCommCompOverlap(const CrsMatrix<SC,LO,GO,NO> & A,
                                      MultiVector<SC,LO,GO,NO> & X_colmap,
                                      const MultiVector<SC,LO,GO,NO> & B,
                                      MultiVector<SC,LO,GO,NO> & R,
                                      const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                                      const MultiVector<SC,LO,GO,NO> & X_domainmap) {
  using Teuchos::NO_TRANS;
  using Teuchos::RCP;
  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;

  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");

  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
       "R.getNumVectors() = " << R.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getLocalLength () !=
       A.getColMap ()->getLocalNumElements (), std::runtime_error,
       "X has the wrong number of local rows. "
       "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
       "A.getColMap()->getLocalNumElements() = " <<
       A.getColMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (R.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "R has the wrong number of local rows. "
       "R.getLocalLength() = " << R.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (B.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "B has the wrong number of local rows. "
       "B.getLocalLength() = " << B.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
       "fill complete. You must call fillComplete() (possibly with "
       "domain and range Map arguments) without an intervening "
       "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION
      (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
       std::runtime_error, "X, R and B must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION
      ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
       (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
       std::runtime_error, "X, B and R may not alias one another.");
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows ();
  int64_t myNnz = A_lcl.nnz();

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;

  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

  if (is_vector) {
    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,true>;
    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-vector",policy,func);

    RCP<const import_type> importer = A.getGraph ()->getImporter ();
    X_colmap.endImport (X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-1");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false>;
    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-vector-offrank",policy,func2);
  }
  else {
    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,true>;
    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-multivector",policy,func);

    RCP<const import_type> importer = A.getGraph ()->getImporter ();
    X_colmap.endImport (X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-2");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true>;
    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-vector-offrank",policy,func2);
  }
}
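/* Timeline of the communication/computation overlap above (sketch; the
   beginImport call happens in residual() below, before this function runs):

     residual():  X_colmap->beginImport (X_in, *importer, INSERT, true);
     here:        LocalResidualFunctor<..., restrictedMode=true, skipOffRank=true>
                    // on-rank rows/entries only; MPI messages still in flight
                  X_colmap.endImport (X_domainmap, *importer, INSERT, true);
                  Kokkos::fence(...);        // ghost values of X have arrived
                  OffRankUpdateFunctor<...>  // R -= A_offRank * X_colmap

   Only the off-rank update waits on communication; the bulk of the local
   SpMV overlaps with the Import. */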


/// \brief Computes R = B - A * X.
template<class SC, class LO, class GO, class NO>
void residual(const Operator<SC,LO,GO,NO> & Aop,
              const MultiVector<SC,LO,GO,NO> & X_in,
              const MultiVector<SC,LO,GO,NO> & B_in,
              MultiVector<SC,LO,GO,NO> & R_in) {
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::rcpFromRef;

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible ();
  const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation ();
  if (overlapCommunicationAndComputation)
    TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);

  // Whether we use restrictedMode in the Import from the domain Map
  // to the column Map. Restricted mode skips the copy and permutation
  // of the local part of X. We use restrictedMode only when the domain
  // and column Maps are locally fitted, i.e., when the local indices
  // of the domain and column Maps match.
  bool restrictedMode = false;
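  /* "Locally fitted" worked example (illustrative indices): if on this
     process the domain Map owns global indices {0, 1, 2} with local
     indices 0..2, the column Map is locally fitted to it iff it lists
     those same global indices first and in the same order, e.g.
     {0, 1, 2, 7, 9}; the ghost indices {7, 9} may follow in any order.
     Then the local part of X needs no copy or permutation, and the
     Import may run in restricted mode. */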

  const CrsMatrix<SC,LO,GO,NO> * Apt = dynamic_cast<const CrsMatrix<SC,LO,GO,NO>*>(&Aop);
  if (!Apt) {
    // If A is not a CrsMatrix, we can't do fusion, so just do apply+update.
    SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
    Aop.apply(X_in,R_in,Teuchos::NO_TRANS, one, zero);
    R_in.update(one,B_in,negone);
    return;
  }
  const CrsMatrix<SC,LO,GO,NO> & A = *Apt;

  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;
  using export_type = typename CrsMatrix<SC,LO,GO,NO>::export_type;
  using MV = MultiVector<SC,LO,GO,NO>;
  using graph_type = Tpetra::CrsGraph<LO,GO,NO>;
  using offset_type = typename graph_type::offset_device_view_type;

  // We treat the case of a replicated MV output specially.
  const bool R_is_replicated =
    (! R_in.isDistributed () && A.getComm ()->getSize () != 1);

  // It's possible that R is a view of X or B.
  // We don't try to detect the more subtle cases (e.g., one is a
  // subview of the other, but their initial pointers differ). We
  // only need to do this if this matrix's Import is trivial;
  // otherwise, we don't actually apply the operator from X into R.

  RCP<const import_type> importer = A.getGraph ()->getImporter ();
  RCP<const export_type> exporter = A.getGraph ()->getExporter ();

  // Temporary MV for the Import operation. After the block of code
  // below, this will be an (Imported if necessary) column Map MV
  // ready to give to localApply(...).
  RCP<MV> X_colMap;
  if (importer.is_null ()) {
    if (! X_in.isConstantStride ()) {
      // Not all sparse mat-vec kernels can handle an input MV with
      // nonconstant stride correctly, so we have to copy it in that
      // case into a constant stride MV. To make a constant stride
      // copy of X_in, we force creation of the column (== domain)
      // Map MV (if it hasn't already been created, else fetch the
      // cached copy). This avoids creating a new MV each time.
      X_colMap = A.getColumnMapMultiVector (X_in, true);
      Tpetra::deep_copy (*X_colMap, X_in);
    }
    else {
      // The domain and column Maps are the same, so do the local
      // multiply using the domain Map input MV X_in.
      X_colMap = rcp_const_cast<MV> (rcpFromRef (X_in));
    }
  }
  else { // need to Import source (multi)vector
    ProfilingRegion regionImport ("Tpetra::CrsMatrix::residual: Import");
    // We're doing an Import anyway, which will copy the relevant
    // elements of the domain Map MV X_in into a separate column Map
    // MV. Thus, we don't have to worry whether X_in is constant
    // stride.
    X_colMap = A.getColumnMapMultiVector (X_in);

    // Do we want to use restrictedMode?
    restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();

    if (debug && restrictedMode) {
      TEUCHOS_TEST_FOR_EXCEPTION
        (!importer->getTargetMap()->isLocallyFitted(*importer->getSourceMap()), std::runtime_error,
         "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
    }

    // Import from the domain Map MV to the column Map MV.
    X_colMap->beginImport (X_in, *importer, INSERT, restrictedMode);
  }

  offset_type offsets;
  if (restrictedMode)
    A.getCrsGraph()->getLocalOffRankOffsets(offsets);

  // Get a MultiVector for the R_rowMap output residual, handling the
  // non-constant stride and exporter cases. Since R gets clobbered,
  // we don't need to worry about the data in it.
  RCP<MV> R_rowMap;
  if (exporter.is_null()) {
    if (! R_in.isConstantStride ()) {
      R_rowMap = A.getRowMapMultiVector(R_in);
    }
    else {
      R_rowMap = rcpFromRef (R_in);
    }
  }
  else {
    R_rowMap = A.getRowMapMultiVector (R_in);
  }

  // Get a MultiVector for the B_rowMap input, handling the
  // non-constant stride and exporter cases.
  RCP<const MV> B_rowMap;
  if (exporter.is_null()) {
    if (! B_in.isConstantStride ()) {
      // Do an allocation here. If we need to optimize this later, we
      // can have the matrix cache this.
      RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
      Tpetra::deep_copy (*B_rowMapNonConst, B_in);
      B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
    }
    else {
      B_rowMap = rcpFromRef (B_in);
    }
  }
  else {
    // Do an allocation here. If we need to optimize this later, we
    // can have the matrix cache this.
    ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: B Import");
    RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
    B_rowMapNonConst->doImport(B_in, *exporter, ADD);
    B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
  }

  // If we have a nontrivial Export object, we must perform an
  // Export. In that case, the local multiply result will go into
  // the row Map multivector. We don't have to make a
  // constant-stride version of R_in in this case, because we had to
  // make a constant stride R_rowMap MV and do an Export anyway.
  if (! exporter.is_null ()) {
    if (! importer.is_null ())
      X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
    if (restrictedMode && !importer.is_null ())
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
    else
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);

    {
      ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: R Export");

      // Do the Export operation.
      R_in.doExport (*R_rowMap, *exporter, ADD);
    }
  }
  else { // Don't do an Export: row Map and range Map are the same.

    if (restrictedMode) {
      if (overlapCommunicationAndComputation) {
        localResidualWithCommCompOverlap (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
      } else {
        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
        localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
      }
    }
    else {
      if (! importer.is_null ())
        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
    }

    // If R_in does not have constant stride, then we can't let the
    // kernel write directly to R_in. Instead, we have to use the
    // cached row (== range) Map MV as temporary storage.
    if (! R_in.isConstantStride () ) {
      // We need to be sure to do a copy out in this case.
      Tpetra::deep_copy (R_in, *R_rowMap);
    }
  }

  // If the range Map is a locally replicated Map, sum up
  // contributions from each process.
  if (R_is_replicated) {
    ProfilingRegion regionReduce ("Tpetra::CrsMatrix::residual: Reduce Y");
    R_in.reduce ();
  }
}
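/* Example usage (an illustrative sketch, not part of this header; the helper
   residualNorms is hypothetical and assumes A is fill complete and X, B were
   built on A's domain and range Maps):

     #include "Tpetra_Details_residual.hpp"

     // Compute the 2-norm of the residual B - A*X for each column.
     template<class SC, class LO, class GO, class NO>
     void residualNorms (const Tpetra::CrsMatrix<SC,LO,GO,NO>& A,
                         const Tpetra::MultiVector<SC,LO,GO,NO>& X,
                         const Tpetra::MultiVector<SC,LO,GO,NO>& B,
                         const Teuchos::ArrayView<typename Teuchos::ScalarTraits<SC>::magnitudeType>& norms)
     {
       Tpetra::MultiVector<SC,LO,GO,NO> R (B.getMap (), B.getNumVectors ());
       Tpetra::Details::residual (A, X, B, R);  // R = B - A*X
       R.norm2 (norms);                         // norms[j] = ||R(:,j)||_2
     }
*/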

} // namespace Details
} // namespace Tpetra

#endif // TPETRA_DETAILS_RESIDUAL_HPP