Tpetra_Details_residual.hpp
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_RESIDUAL_HPP
#define TPETRA_DETAILS_RESIDUAL_HPP

#include "TpetraCore_config.h"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_LocalCrsMatrixOperator.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Teuchos_RCP.hpp"
#include "Teuchos_ScalarTraits.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "KokkosSparse_spmv_impl.hpp"

namespace Tpetra {
namespace Details {

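/// \brief Functor for computing the residual R = B - A*X, one block of rows
///   per team.
///
/// Each team handles \c rows_per_team consecutive rows; rows are spread over
/// the team's threads, and each row's dot product is a vector-range
/// reduction. The template flags pick the variant at compile time:
/// \c is_MV (X/B/R have multiple columns), \c restrictedMode (on-rank
/// entries of X are read from the domain-map vector, off-rank entries from
/// the column-map vector), and \c skipOffRank (only entries before the
/// off-rank offset are accumulated, so the halo exchange may still be in
/// flight).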
template <class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
struct LocalResidualFunctor {
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
#if KOKKOS_VERSION >= 40799
  using ATV = KokkosKernels::ArithTraits<value_type>;
#else
  using ATV = Kokkos::ArithTraits<value_type>;
#endif

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  ConstMV B_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;
  ConstMV X_domainmap_lcl;

  LocalResidualFunctor(const AMatrix& A_lcl_,
                       const ConstMV& X_colmap_lcl_,
                       const ConstMV& B_lcl_,
                       const MV& R_lcl_,
                       const int rows_per_team_,
                       const Offsets& offsets_,
                       const ConstMV& X_domainmap_lcl_)
    : A_lcl(A_lcl_)
    , X_colmap_lcl(X_colmap_lcl_)
    , B_lcl(B_lcl_)
    , R_lcl(R_lcl_)
    , rows_per_team(rows_per_team_)
    , offsets(offsets_)
    , X_domainmap_lcl(X_domainmap_lcl_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member& dev) const {
    Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&](const LO& loop) {
      const LO lclRow = static_cast<LO>(dev.league_rank()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows()) {
        return;
      }

      if (!is_MV) {  // MultiVectors only have a single column

        value_type A_x = ATV::zero();

        if (!restrictedMode) {
          const auto A_row = A_lcl.rowConst(lclRow);
          const LO row_length = static_cast<LO>(A_row.length);

          Kokkos::parallel_reduce(
              Kokkos::ThreadVectorRange(dev, row_length), [&](const LO iEntry, value_type& lsum) {
                const auto A_val = A_row.value(iEntry);
                lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry), 0);
              },
              A_x);

        } else {
          const LO offRankOffset = offsets(lclRow);
          const size_t start = A_lcl.graph.row_map(lclRow);
          const size_t end = A_lcl.graph.row_map(lclRow + 1);

          Kokkos::parallel_reduce(
              Kokkos::ThreadVectorRange(dev, start, end), [&](const LO iEntry, value_type& lsum) {
                const auto A_val = A_lcl.values(iEntry);
                const auto lclCol = A_lcl.graph.entries(iEntry);
                if (iEntry < offRankOffset)
                  lsum += A_val * X_domainmap_lcl(lclCol, 0);
                else if (!skipOffRank)
                  lsum += A_val * X_colmap_lcl(lclCol, 0);
              },
              A_x);
        }

        Kokkos::single(Kokkos::PerThread(dev), [&]() {
          R_lcl(lclRow, 0) = B_lcl(lclRow, 0) - A_x;
        });
      } else {  // MultiVectors have more than one column

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for (LO v = 0; v < numVectors; v++) {
          value_type A_x = ATV::zero();

          if (!restrictedMode) {
            const auto A_row = A_lcl.rowConst(lclRow);
            const LO row_length = static_cast<LO>(A_row.length);

            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(dev, row_length), [&](const LO iEntry, value_type& lsum) {
                  const auto A_val = A_row.value(iEntry);
                  lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry), v);
                },
                A_x);
          } else {
            const LO offRankOffset = offsets(lclRow);
            const size_t start = A_lcl.graph.row_map(lclRow);
            const size_t end = A_lcl.graph.row_map(lclRow + 1);

            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(dev, start, end), [&](const LO iEntry, value_type& lsum) {
                  const auto A_val = A_lcl.values(iEntry);
                  const auto lclCol = A_lcl.graph.entries(iEntry);
                  if (iEntry < offRankOffset)
                    lsum += A_val * X_domainmap_lcl(lclCol, v);
                  else if (!skipOffRank)
                    lsum += A_val * X_colmap_lcl(lclCol, v);
                },
                A_x);
          }

          Kokkos::single(Kokkos::PerThread(dev), [&]() {
            R_lcl(lclRow, v) = B_lcl(lclRow, v) - A_x;
          });

        }  // end for numVectors
      }
    });  // end parallel_for TeamThreadRange
  }
};

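/// \brief Functor for computing R -= A_offRank*X_colmap.
///
/// Finishes a residual started with \c skipOffRank = true: for each row it
/// accumulates only the entries at or beyond the off-rank offset and
/// subtracts that partial sum from R.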
template <class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
struct OffRankUpdateFunctor {
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
#if KOKKOS_VERSION >= 40799
  using ATV = KokkosKernels::ArithTraits<value_type>;
#else
  using ATV = Kokkos::ArithTraits<value_type>;
#endif

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;

  OffRankUpdateFunctor(const AMatrix& A_lcl_,
                       const ConstMV& X_colmap_lcl_,
                       const MV& R_lcl_,
                       const int rows_per_team_,
                       const Offsets& offsets_)
    : A_lcl(A_lcl_)
    , X_colmap_lcl(X_colmap_lcl_)
    , R_lcl(R_lcl_)
    , rows_per_team(rows_per_team_)
    , offsets(offsets_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member& dev) const {
    Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&](const LO& loop) {
      const LO lclRow = static_cast<LO>(dev.league_rank()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows()) {
        return;
      }

      const LO offRankOffset = offsets(lclRow);
      const size_t end = A_lcl.graph.row_map(lclRow + 1);

      if (!is_MV) {  // MultiVectors only have a single column

        value_type A_x = ATV::zero();

        Kokkos::parallel_reduce(
            Kokkos::ThreadVectorRange(dev, offRankOffset, end), [&](const LO iEntry, value_type& lsum) {
              const auto A_val = A_lcl.values(iEntry);
              const auto lclCol = A_lcl.graph.entries(iEntry);
              lsum += A_val * X_colmap_lcl(lclCol, 0);
            },
            A_x);

        Kokkos::single(Kokkos::PerThread(dev), [&]() {
          R_lcl(lclRow, 0) -= A_x;
        });
      } else {  // MultiVectors have more than one column

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for (LO v = 0; v < numVectors; v++) {
          value_type A_x = ATV::zero();

          Kokkos::parallel_reduce(
              Kokkos::ThreadVectorRange(dev, offRankOffset, end), [&](const LO iEntry, value_type& lsum) {
                const auto A_val = A_lcl.values(iEntry);
                const auto lclCol = A_lcl.graph.entries(iEntry);
                lsum += A_val * X_colmap_lcl(lclCol, v);
              },
              A_x);

          Kokkos::single(Kokkos::PerThread(dev), [&]() {
            R_lcl(lclRow, v) -= A_x;
          });

        }  // end for numVectors
      }
    });
  }
};

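/// \brief Compute the local portion of the residual, R = B - A*X.
///
/// If fused residuals are disabled via Tpetra::Details::Behavior, this falls
/// back to a local SpMV (localApply) plus an update; otherwise it launches a
/// single fused kernel. Passing a non-null \c X_domainmap selects restricted
/// mode, where on-rank entries of X are read from the domain-map vector.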
template <class SC, class LO, class GO, class NO>
void localResidual(const CrsMatrix<SC, LO, GO, NO>& A,
                   const MultiVector<SC, LO, GO, NO>& X_colmap,
                   const MultiVector<SC, LO, GO, NO>& B,
                   MultiVector<SC, LO, GO, NO>& R,
                   const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                   const MultiVector<SC, LO, GO, NO>* X_domainmap = nullptr) {
  using Teuchos::NO_TRANS;

  ProfilingRegion regionLocalApply("Tpetra::CrsMatrix::localResidual");

  using local_matrix_device_type = typename CrsMatrix<SC, LO, GO, NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl;

  const bool debug = ::Tpetra::Details::Behavior::debug();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getNumVectors() != R.getNumVectors(), std::runtime_error,
                               "X.getNumVectors() = " << X_colmap.getNumVectors()
                                                      << " != R.getNumVectors() = " << R.getNumVectors() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getLocalLength() != A.getColMap()->getLocalNumElements(),
                               std::runtime_error,
                               "X has the wrong number of local rows. "
                               "X.getLocalLength() = " << X_colmap.getLocalLength()
                                                       << " != A.getColMap()->getLocalNumElements() = "
                                                       << A.getColMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(R.getLocalLength() != A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "R has the wrong number of local rows. "
                               "R.getLocalLength() = " << R.getLocalLength()
                                                       << " != A.getRowMap()->getLocalNumElements() = "
                                                       << A.getRowMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(B.getLocalLength() != A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "B has the wrong number of local rows. "
                               "B.getLocalLength() = " << B.getLocalLength()
                                                       << " != A.getRowMap()->getLocalNumElements() = "
                                                       << A.getRowMap()->getLocalNumElements() << ".");

    TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
                               "The matrix A is not fill complete. You must call fillComplete() "
                               "(possibly with domain and range Map arguments) without an "
                               "intervening resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION(!X_colmap.isConstantStride() || !R.isConstantStride() || !B.isConstantStride(),
                               std::runtime_error, "X, B, and R must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION((X_colmap_lcl.data() == R_lcl.data() && X_colmap_lcl.data() != nullptr) ||
                                   (X_colmap_lcl.data() == B_lcl.data() && X_colmap_lcl.data() != nullptr),
                               std::runtime_error, "X, B, and R may not alias one another.");
  }

  const bool fusedResidual = ::Tpetra::Details::Behavior::fusedResidual();
  if (!fusedResidual) {
    SC one = Teuchos::ScalarTraits<SC>::one();
    SC negone = -one;
    SC zero = Teuchos::ScalarTraits<SC>::zero();
    // This is currently a "reference implementation", waiting until Kokkos
    // Kernels provides a residual kernel.
    A.localApply(X_colmap, R, Teuchos::NO_TRANS, one, zero);
    R.update(one, B, negone);
    return;
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows();
  int64_t myNnz = A_lcl.nnz();

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC, LO, GO, NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;
  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent(0) + rows_per_team - 1) / rows_per_team;

  policy_type policy(1, 1);
  if (team_size < 0) {
    policy = policy_type(worksets, Kokkos::AUTO, vector_length);
  } else {
    policy = policy_type(worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

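  // Dispatch on single- vs. multi-column X and on whether a domain-map X was
  // supplied (restricted mode); each case instantiates a functor whose
  // branches are compile-time constants.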
  if (is_vector) {
    if (X_domainmap == nullptr) {
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, false, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector", policy, func);

    } else {
      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, true, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector", policy, func);
    }
  } else {
    if (X_domainmap == nullptr) {
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, false, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector", policy, func);

    } else {
      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, true, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector", policy, func);
    }
  }
}

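/// \brief Compute R = B - A*X while overlapping the halo exchange of X with
///   computation.
///
/// The caller must already have started the Import of X in restricted mode
/// (beginImport). This launches LocalResidualFunctor with skipOffRank =
/// true, which touches only on-rank (domain-map) data, then completes the
/// Import, fences, and runs OffRankUpdateFunctor to fold the off-rank
/// columns into R.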
template <class SC, class LO, class GO, class NO>
void localResidualWithCommCompOverlap(const CrsMatrix<SC, LO, GO, NO>& A,
                                      MultiVector<SC, LO, GO, NO>& X_colmap,
                                      const MultiVector<SC, LO, GO, NO>& B,
                                      MultiVector<SC, LO, GO, NO>& R,
                                      const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                                      const MultiVector<SC, LO, GO, NO>& X_domainmap) {
  using Teuchos::NO_TRANS;
  using Teuchos::RCP;
  using import_type = typename CrsMatrix<SC, LO, GO, NO>::import_type;

  ProfilingRegion regionLocalApply("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");

  using local_matrix_device_type = typename CrsMatrix<SC, LO, GO, NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);

  const bool debug = ::Tpetra::Details::Behavior::debug();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getNumVectors() != R.getNumVectors(), std::runtime_error,
                               "X.getNumVectors() = " << X_colmap.getNumVectors()
                                                      << " != R.getNumVectors() = " << R.getNumVectors() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getLocalLength() != A.getColMap()->getLocalNumElements(),
                               std::runtime_error,
                               "X has the wrong number of local rows. "
                               "X.getLocalLength() = " << X_colmap.getLocalLength()
                                                       << " != A.getColMap()->getLocalNumElements() = "
                                                       << A.getColMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(R.getLocalLength() != A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "R has the wrong number of local rows. "
                               "R.getLocalLength() = " << R.getLocalLength()
                                                       << " != A.getRowMap()->getLocalNumElements() = "
                                                       << A.getRowMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(B.getLocalLength() != A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "B has the wrong number of local rows. "
                               "B.getLocalLength() = " << B.getLocalLength()
                                                       << " != A.getRowMap()->getLocalNumElements() = "
                                                       << A.getRowMap()->getLocalNumElements() << ".");

    TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
                               "The matrix A is not fill complete. You must call fillComplete() "
                               "(possibly with domain and range Map arguments) without an "
                               "intervening resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION(!X_colmap.isConstantStride() || !R.isConstantStride() || !B.isConstantStride(),
                               std::runtime_error, "X, B, and R must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION((X_colmap_lcl.data() == R_lcl.data() && X_colmap_lcl.data() != nullptr) ||
                                   (X_colmap_lcl.data() == B_lcl.data() && X_colmap_lcl.data() != nullptr),
                               std::runtime_error, "X, B, and R may not alias one another.");
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows();
  int64_t myNnz = A_lcl.nnz();

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC, LO, GO, NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent(0) + rows_per_team - 1) / rows_per_team;

  policy_type policy(1, 1);
  if (team_size < 0) {
    policy = policy_type(worksets, Kokkos::AUTO, vector_length);
  } else {
    policy = policy_type(worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

  if (is_vector) {
    using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, true, true>;
    functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-vector", policy, func);

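    // The on-rank contributions are in flight; finish the halo exchange
    // started in residual(), fence so the imported X entries are visible,
    // then subtract the off-rank contributions from R.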
    RCP<const import_type> importer = A.getGraph()->getImporter();
    X_colmap.endImport(X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-1");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false>;
    functor_type2 func2(A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-vector-offrank", policy, func2);

  } else {
    using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, true, true>;
    functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-multivector", policy, func);

    RCP<const import_type> importer = A.getGraph()->getImporter();
    X_colmap.endImport(X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-2");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true>;
    functor_type2 func2(A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-multivector-offrank", policy, func2);
  }
}

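/// \brief Compute the global residual R = B - A*X.
///
/// Handles the Import of X from the domain Map to the column Map, the
/// Export of the result when a nontrivial Export exists, non-constant
/// stride arguments, and locally replicated R. If \c Aop is not a
/// CrsMatrix, falls back to apply() followed by update().
///
/// A minimal usage sketch (the variable setup is illustrative only,
/// assuming A is a fill-complete CrsMatrix and the vectors were built on
/// its domain and range Maps):
/// \code
/// Teuchos::RCP<Tpetra::CrsMatrix<>> A = ...;  // fill-complete matrix
/// Tpetra::Vector<> x(A->getDomainMap());
/// Tpetra::Vector<> b(A->getRangeMap());
/// Tpetra::Vector<> r(A->getRangeMap());
/// // ... fill x and b ...
/// Tpetra::Details::residual(*A, x, b, r);     // r = b - A*x
/// \endcode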
template <class SC, class LO, class GO, class NO>
void residual(const Operator<SC, LO, GO, NO>& Aop,
              const MultiVector<SC, LO, GO, NO>& X_in,
              const MultiVector<SC, LO, GO, NO>& B_in,
              MultiVector<SC, LO, GO, NO>& R_in) {
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::rcpFromRef;

  const bool debug = ::Tpetra::Details::Behavior::debug();
  const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible();
  const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation();
  if (overlapCommunicationAndComputation)
    TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);

  // Whether we are using restrictedMode in the import from domain to
  // column map. Restricted mode skips the copy and permutation of the
  // local part of X. We use restrictedMode only when the domain and
  // column map are locally fitted, i.e. when the local indices of the
  // domain and column map match.
  bool restrictedMode = false;

  const CrsMatrix<SC, LO, GO, NO>* Apt = dynamic_cast<const CrsMatrix<SC, LO, GO, NO>*>(&Aop);
  if (!Apt) {
    // If A is not a CrsMatrix, we can't fuse, so just do apply + update.
    SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
    Aop.apply(X_in, R_in, Teuchos::NO_TRANS, one, zero);
    R_in.update(one, B_in, negone);
    return;
  }
  const CrsMatrix<SC, LO, GO, NO>& A = *Apt;

  using import_type = typename CrsMatrix<SC, LO, GO, NO>::import_type;
  using export_type = typename CrsMatrix<SC, LO, GO, NO>::export_type;
  using MV = MultiVector<SC, LO, GO, NO>;
  using graph_type = Tpetra::CrsGraph<LO, GO, NO>;
  using offset_type = typename graph_type::offset_device_view_type;

  // We treat the case of a replicated MV output specially.
  const bool R_is_replicated =
      (!R_in.isDistributed() && A.getComm()->getSize() != 1);

  // It's possible that R is a view of X or B. We don't try to detect
  // the more subtle cases (e.g., one is a subview of the other, but
  // their initial pointers differ). We only need to do this if this
  // matrix's Import is trivial; otherwise, we don't actually apply
  // the operator from X into R.

  RCP<const import_type> importer = A.getGraph()->getImporter();
  RCP<const export_type> exporter = A.getGraph()->getExporter();

  // Temporary MV for the Import operation. After the block of code
  // below, this will be an (Imported, if necessary) column Map MV
  // ready to give to localApply(...).
  RCP<MV> X_colMap;
  if (importer.is_null()) {
    if (!X_in.isConstantStride()) {
      // Not all sparse mat-vec kernels can handle an input MV with
      // nonconstant stride correctly, so we have to copy it in that
      // case into a constant stride MV. To make a constant stride
      // copy of X_in, we force creation of the column (== domain)
      // Map MV (if it hasn't already been created, else fetch the
      // cached copy). This avoids creating a new MV each time.
      X_colMap = A.getColumnMapMultiVector(X_in, true);
      Tpetra::deep_copy(*X_colMap, X_in);
    } else {
      // The domain and column Maps are the same, so do the local
      // multiply using the domain Map input MV X_in.
      X_colMap = rcp_const_cast<MV>(rcpFromRef(X_in));
    }
  } else {  // need to Import the source (multi)vector
    ProfilingRegion regionImport("Tpetra::CrsMatrix::residual: Import");
    // We're doing an Import anyway, which will copy the relevant
    // elements of the domain Map MV X_in into a separate column Map
    // MV. Thus, we don't have to worry whether X_in is constant
    // stride.
    X_colMap = A.getColumnMapMultiVector(X_in);

    // Do we want to use restrictedMode?
    restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();

    if (debug && restrictedMode) {
      TEUCHOS_TEST_FOR_EXCEPTION(!importer->getTargetMap()->isLocallyFitted(*importer->getSourceMap()), std::runtime_error,
                                 "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
    }

    // Import from the domain Map MV to the column Map MV.
    X_colMap->beginImport(X_in, *importer, INSERT, restrictedMode);
  }

  offset_type offsets;
  if (restrictedMode)
    A.getCrsGraph()->getLocalOffRankOffsets(offsets);

  // Get a vector for the R_rowMap output residual, handling the
  // non-constant stride and exporter cases. Since R gets clobbered,
  // we don't need to worry about the data in it.
  RCP<MV> R_rowMap;
  if (exporter.is_null()) {
    if (!R_in.isConstantStride()) {
      R_rowMap = A.getRowMapMultiVector(R_in);
    } else {
      R_rowMap = rcpFromRef(R_in);
    }
  } else {
    R_rowMap = A.getRowMapMultiVector(R_in);
  }

  // Get a vector for the B_rowMap input, handling the non-constant
  // stride and exporter cases.
  RCP<const MV> B_rowMap;
  if (exporter.is_null()) {
    if (!B_in.isConstantStride()) {
      // Do an allocation here. If we need to optimize this later, we can
      // have the matrix cache this.
      RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(), B_in.getNumVectors()));
      Tpetra::deep_copy(*B_rowMapNonConst, B_in);
      B_rowMap = rcp_const_cast<const MV>(B_rowMapNonConst);
    } else {
      B_rowMap = rcpFromRef(B_in);
    }
  } else {
    // Do an allocation here. If we need to optimize this later, we can
    // have the matrix cache this.
    ProfilingRegion regionExport("Tpetra::CrsMatrix::residual: B Import");
    RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(), B_in.getNumVectors()));
    B_rowMapNonConst->doImport(B_in, *exporter, ADD);
    B_rowMap = rcp_const_cast<const MV>(B_rowMapNonConst);
  }

  // If we have a nontrivial Export object, we must perform an
  // Export. In that case, the local multiply result will go into
  // the row Map multivector. We don't have to make a
  // constant-stride version of R_in in this case, because we had to
  // make a constant-stride R_rowMap MV and do an Export anyway.
  if (!exporter.is_null()) {
    if (!importer.is_null())
      X_colMap->endImport(X_in, *importer, INSERT, restrictedMode);
    if (restrictedMode && !importer.is_null())
      localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
    else
      localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets);

    {
      ProfilingRegion regionExport("Tpetra::CrsMatrix::residual: R Export");

      // Do the Export operation.
      R_in.doExport(*R_rowMap, *exporter, ADD);
    }
  } else {  // Don't do an Export: row Map and range Map are the same.

    if (restrictedMode)
      if (overlapCommunicationAndComputation) {
        localResidualWithCommCompOverlap(A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
      } else {
        X_colMap->endImport(X_in, *importer, INSERT, restrictedMode);
        localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
      }
    else {
      if (!importer.is_null())
        X_colMap->endImport(X_in, *importer, INSERT, restrictedMode);
      localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
    }

    // If R_in does not have constant stride, then we can't let the
    // kernel write directly to R_in. Instead, we have to use the
    // cached row (== range) Map MV as temporary storage.
    if (!R_in.isConstantStride()) {
      // We need to be sure to do a copy out in this case.
      Tpetra::deep_copy(R_in, *R_rowMap);
    }
  }

  // If the range Map is a locally replicated Map, sum up
  // contributions from each process.
  if (R_is_replicated) {
    ProfilingRegion regionReduce("Tpetra::CrsMatrix::residual: Reduce Y");
    R_in.reduce();
  }
}

}  // namespace Details
}  // namespace Tpetra

#endif  // TPETRA_DETAILS_RESIDUAL_HPP