Tpetra parallel linear algebra: Tpetra_Details_residual.hpp
// @HEADER
// ***********************************************************************
//
//          Tpetra: Templated Linear Algebra Services Package
//                 Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//
// ************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_RESIDUAL_HPP
#define TPETRA_DETAILS_RESIDUAL_HPP

#include "TpetraCore_config.h"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_LocalCrsMatrixOperator.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Teuchos_RCP.hpp"
#include "Teuchos_ScalarTraits.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "KokkosSparse_spmv_impl.hpp"

/// \file Tpetra_Details_residual.hpp
/// \brief Functions for computing the fused residual R = B - A*X.

namespace Tpetra {
  namespace Details {

/// \brief Functor for computing the residual.
template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
struct LocalResidualFunctor {

  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  ConstMV B_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;
  ConstMV X_domainmap_lcl;

  LocalResidualFunctor (const AMatrix& A_lcl_,
                        const ConstMV& X_colmap_lcl_,
                        const ConstMV& B_lcl_,
                        const MV& R_lcl_,
                        const int rows_per_team_,
                        const Offsets& offsets_,
                        const ConstMV& X_domainmap_lcl_) :
    A_lcl(A_lcl_),
    X_colmap_lcl(X_colmap_lcl_),
    B_lcl(B_lcl_),
    R_lcl(R_lcl_),
    rows_per_team(rows_per_team_),
    offsets(offsets_),
    X_domainmap_lcl(X_domainmap_lcl_)
  { }

  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
      const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows ()) {
        return;
      }

      if (!is_MV) { // MultiVectors only have a single column

        value_type A_x = ATV::zero ();

        if (!restrictedMode) {
          const auto A_row = A_lcl.rowConst(lclRow);
          const LO row_length = static_cast<LO> (A_row.length);

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
            const auto A_val = A_row.value(iEntry);
            lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0);
          }, A_x);
        }
        else {
          const LO offRankOffset = offsets(lclRow);
          const size_t start = A_lcl.graph.row_map(lclRow);
          const size_t end = A_lcl.graph.row_map(lclRow+1);

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
            const auto A_val = A_lcl.values(iEntry);
            const auto lclCol = A_lcl.graph.entries(iEntry);
            if (iEntry < offRankOffset)
              lsum += A_val * X_domainmap_lcl(lclCol,0);
            else if (!skipOffRank)
              lsum += A_val * X_colmap_lcl(lclCol,0);
          }, A_x);
        }

        Kokkos::single(Kokkos::PerThread(dev),[&] () {
          R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
        });
      }
      else { // MultiVectors have more than one column

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for(LO v=0; v<numVectors; v++) {

          value_type A_x = ATV::zero ();

          if (!restrictedMode) {
            const auto A_row = A_lcl.rowConst(lclRow);
            const LO row_length = static_cast<LO> (A_row.length);

            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
              const auto A_val = A_row.value(iEntry);
              lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v);
            }, A_x);
          }
          else {
            const LO offRankOffset = offsets(lclRow);
            const size_t start = A_lcl.graph.row_map(lclRow);
            const size_t end = A_lcl.graph.row_map(lclRow+1);

            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
              const auto A_val = A_lcl.values(iEntry);
              const auto lclCol = A_lcl.graph.entries(iEntry);
              if (iEntry < offRankOffset)
                lsum += A_val * X_domainmap_lcl(lclCol,v);
              else if (!skipOffRank)
                lsum += A_val * X_colmap_lcl(lclCol,v);
            }, A_x);
          }

          Kokkos::single(Kokkos::PerThread(dev),[&] () {
            R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
          });

        }//end for numVectors
      }
    });//end parallel_for TeamThreadRange
  }
};
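
// For intuition: per local row i and column v, the hierarchical kernel
// above computes the same result as this serial sketch (illustration
// only, not part of the build):
//
//   for (LO i = 0; i < A_lcl.numRows(); ++i)
//     for (LO v = 0; v < numVectors; ++v) {
//       value_type sum = ATV::zero();
//       for (auto k = A_lcl.graph.row_map(i); k < A_lcl.graph.row_map(i+1); ++k)
//         sum += A_lcl.values(k) * X_colmap_lcl(A_lcl.graph.entries(k), v);
//       R_lcl(i,v) = B_lcl(i,v) - sum;
//     }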


/// \brief Functor for computing R -= A_offRank*X_colmap.
template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
struct OffRankUpdateFunctor {

  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;

  OffRankUpdateFunctor (const AMatrix& A_lcl_,
                        const ConstMV& X_colmap_lcl_,
                        const MV& R_lcl_,
                        const int rows_per_team_,
                        const Offsets& offsets_) :
    A_lcl(A_lcl_),
    X_colmap_lcl(X_colmap_lcl_),
    R_lcl(R_lcl_),
    rows_per_team(rows_per_team_),
    offsets(offsets_)
  { }

  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
      const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows ()) {
        return;
      }

      const LO offRankOffset = offsets(lclRow);
      const size_t end = A_lcl.graph.row_map(lclRow+1);

      if (!is_MV) { // MultiVectors only have a single column

        value_type A_x = ATV::zero ();

        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
          const auto A_val = A_lcl.values(iEntry);
          const auto lclCol = A_lcl.graph.entries(iEntry);
          lsum += A_val * X_colmap_lcl(lclCol,0);
        }, A_x);

        Kokkos::single(Kokkos::PerThread(dev),[&] () {
          R_lcl(lclRow,0) -= A_x;
        });
      }
      else { // MultiVectors have more than one column

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for(LO v=0; v<numVectors; v++) {

          value_type A_x = ATV::zero ();

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
            const auto A_val = A_lcl.values(iEntry);
            const auto lclCol = A_lcl.graph.entries(iEntry);
            lsum += A_val * X_colmap_lcl(lclCol,v);
          }, A_x);

          Kokkos::single(Kokkos::PerThread(dev),[&] () {
            R_lcl(lclRow,v) -= A_x;
          });

        }//end for numVectors
      }
    });
  }
};
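
// Together, the two functors implement the communication/computation
// overlap scheme used by localResidualWithCommCompOverlap below.
// A sketch of the sequence (restricted-mode imports assumed):
//
//   R  = B - A_onRank * X_domainmap  // LocalResidualFunctor, skipOffRank = true
//   ... endImport() fills the off-rank (ghost) entries of X_colmap ...
//   R -= A_offRank * X_colmap        // OffRankUpdateFunctor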

template<class SC, class LO, class GO, class NO>
void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
                   const MultiVector<SC,LO,GO,NO> & X_colmap,
                   const MultiVector<SC,LO,GO,NO> & B,
                   MultiVector<SC,LO,GO,NO> & R,
                   const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                   const MultiVector<SC,LO,GO,NO> * X_domainmap=nullptr) {
  using Tpetra::Details::ProfilingRegion;
  using Teuchos::NO_TRANS;
  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual");

  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl;

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
       "R.getNumVectors() = " << R.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getLocalLength () !=
       A.getColMap ()->getLocalNumElements (), std::runtime_error,
       "X has the wrong number of local rows. "
       "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
       "A.getColMap()->getLocalNumElements() = " <<
       A.getColMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (R.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "R has the wrong number of local rows. "
       "R.getLocalLength() = " << R.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (B.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "B has the wrong number of local rows. "
       "B.getLocalLength() = " << B.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
       "fill complete. You must call fillComplete() (possibly with "
       "domain and range Map arguments) without an intervening "
       "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION
      (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
       std::runtime_error, "X, B, and R must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION
      ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
       (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
       std::runtime_error, "X, B, and R may not alias one another.");
  }

  SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
#ifdef TPETRA_DETAILS_USE_REFERENCE_RESIDUAL
  // This is currently a "reference implementation" waiting until Kokkos Kernels provides
  // a residual kernel.
  A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero);
  R.update(one,B,negone);
#else

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows ();
  int64_t myNnz = A_lcl.nnz();

  // Check for imbalanced rows. If the logic for SPMV to use merge path is triggered,
  // use it and follow the reference residual code.
  LO maxRowImbalance = 0;
  if(numLocalRows != 0)
    maxRowImbalance = A.getLocalMaxNumRowEntries() - (myNnz / numLocalRows);
  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
  {
    // note: lclOp will be wrapped in shared_ptr
    auto lclOp = A.getLocalMultiplyOperator();
    // Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator.
    lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
    R.update(one,B,negone);
    return;
  }

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
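  // Integer ceiling division: each workset (Kokkos team) covers
  // rows_per_team consecutive local rows, so the last team may be only
  // partially full; the functors guard against overrun with their
  // lclRow >= A_lcl.numRows() check.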

  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

  if(is_vector) {

    if (X_domainmap == nullptr) {

      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,false,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector",policy,func);

    }
    else {

      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector",policy,func);

    }
  }
  else {

    if (X_domainmap == nullptr) {

      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,false,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector",policy,func);

    }
    else {

      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector",policy,func);

    }
  }
#endif
}


template<class SC, class LO, class GO, class NO>
void localResidualWithCommCompOverlap(const CrsMatrix<SC,LO,GO,NO> & A,
                                      MultiVector<SC,LO,GO,NO> & X_colmap,
                                      const MultiVector<SC,LO,GO,NO> & B,
                                      MultiVector<SC,LO,GO,NO> & R,
                                      const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                                      const MultiVector<SC,LO,GO,NO> & X_domainmap) {
  using Tpetra::Details::ProfilingRegion;
  using Teuchos::NO_TRANS;
  using Teuchos::RCP;
  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;

  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");

  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
       "R.getNumVectors() = " << R.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getLocalLength () !=
       A.getColMap ()->getLocalNumElements (), std::runtime_error,
       "X has the wrong number of local rows. "
       "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
       "A.getColMap()->getLocalNumElements() = " <<
       A.getColMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (R.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "R has the wrong number of local rows. "
       "R.getLocalLength() = " << R.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (B.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "B has the wrong number of local rows. "
       "B.getLocalLength() = " << B.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
       "fill complete. You must call fillComplete() (possibly with "
       "domain and range Map arguments) without an intervening "
       "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION
      (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
       std::runtime_error, "X, B, and R must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION
      ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
       (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
       std::runtime_error, "X, B, and R may not alias one another.");
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows ();
  int64_t myNnz = A_lcl.nnz();

  // // Check for imbalanced rows. If the logic for SPMV to use merge path is triggered,
  // // use it and follow the reference residual code.
  // LO maxRowImbalance = 0;
  // if(numLocalRows != 0)
  //   maxRowImbalance = A.getLocalMaxNumRowEntries() - (myNnz / numLocalRows);
  // if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
  // {
  //   // note: lclOp will be wrapped in shared_ptr
  //   auto lclOp = A.getLocalMultiplyOperator();
  //   // Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator
  //   lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
  //   R.update(one,B,negone);
  //   return;
  // }

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;

  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

  if(is_vector) {

    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,true>;
    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-vector",policy,func);

    RCP<const import_type> importer = A.getGraph ()->getImporter ();
    X_colmap.endImport (X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-1");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false>;
    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-vector-offrank",policy,func2);

  }
  else {

    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,true>;
    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-multivector",policy,func);

    RCP<const import_type> importer = A.getGraph ()->getImporter ();
    X_colmap.endImport (X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-2");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true>;
    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-multivector-offrank",policy,func2);

  }
}


/// \brief Computes R = B - A * X.
template<class SC, class LO, class GO, class NO>
void residual(const Operator<SC,LO,GO,NO> & Aop,
              const MultiVector<SC,LO,GO,NO> & X_in,
              const MultiVector<SC,LO,GO,NO> & B_in,
              MultiVector<SC,LO,GO,NO> & R_in) {
  using Tpetra::Details::ProfilingRegion;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::rcpFromRef;

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible ();
  const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation ();
  if (overlapCommunicationAndComputation)
    TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);

  // Whether we are using restrictedMode in the import from domain to
  // column map. Restricted mode skips the copy and permutation of the
  // local part of X. We are using restrictedMode only when domain and
  // column map are locally fitted, i.e. when the local indices of
  // domain and column map match.
  bool restrictedMode = false;
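  // Example of "locally fitted": if this process's domain map owns
  // local indices 0..n-1 and its column map lists exactly those owned
  // indices first, in the same order, followed by the off-rank (ghost)
  // indices, then the shared local indices agree and the import's
  // local copy/permute step can safely be skipped.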

  const CrsMatrix<SC,LO,GO,NO> * Apt = dynamic_cast<const CrsMatrix<SC,LO,GO,NO>*>(&Aop);
  if(!Apt) {
    // If we're not a CrsMatrix, we can't do fusion, so just do apply+update
    SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
    Aop.apply(X_in,R_in,Teuchos::NO_TRANS, one, zero);
    R_in.update(one,B_in,negone);
    return;
  }
  const CrsMatrix<SC,LO,GO,NO> & A = *Apt;

  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;
  using export_type = typename CrsMatrix<SC,LO,GO,NO>::export_type;
  using MV = MultiVector<SC,LO,GO,NO>;
  using graph_type = Tpetra::CrsGraph<LO,GO,NO>;
  using offset_type = typename graph_type::offset_device_view_type;

  // We treat the case of a replicated MV output specially.
  const bool R_is_replicated =
    (! R_in.isDistributed () && A.getComm ()->getSize () != 1);

  // It's possible that R is a view of X or B.
  // We don't try to detect the more subtle cases (e.g., one is a
  // subview of the other, but their initial pointers differ). We
  // only need to do this if this matrix's Import is trivial;
  // otherwise, we don't actually apply the operator from X into R.

  RCP<const import_type> importer = A.getGraph ()->getImporter ();
  RCP<const export_type> exporter = A.getGraph ()->getExporter ();

  // Temporary MV for Import operation. After the block of code
  // below, this will be an (Imported if necessary) column Map MV
  // ready to give to localApply(...).
  RCP<MV> X_colMap;
  if (importer.is_null ()) {
    if (! X_in.isConstantStride ()) {
      // Not all sparse mat-vec kernels can handle an input MV with
      // nonconstant stride correctly, so we have to copy it in that
      // case into a constant stride MV. To make a constant stride
      // copy of X_in, we force creation of the column (== domain)
      // Map MV (if it hasn't already been created, else fetch the
      // cached copy). This avoids creating a new MV each time.
      X_colMap = A.getColumnMapMultiVector (X_in, true);
      Tpetra::deep_copy (*X_colMap, X_in);
    }
    else {
      // The domain and column Maps are the same, so do the local
      // multiply using the domain Map input MV X_in.
      X_colMap = rcp_const_cast<MV> (rcpFromRef (X_in) );
    }
  }
  else { // need to Import source (multi)vector
    ProfilingRegion regionImport ("Tpetra::CrsMatrix::residual: Import");
    // We're doing an Import anyway, which will copy the relevant
    // elements of the domain Map MV X_in into a separate column Map
    // MV. Thus, we don't have to worry whether X_in is constant
    // stride.
    X_colMap = A.getColumnMapMultiVector (X_in);

    // Do we want to use restrictedMode?
    restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();

    if (debug && restrictedMode) {
      TEUCHOS_TEST_FOR_EXCEPTION
        (!importer->getTargetMap()->isLocallyFitted(*importer->getSourceMap()), std::runtime_error,
         "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
    }

    // Import from the domain Map MV to the column Map MV.
    X_colMap->beginImport (X_in, *importer, INSERT, restrictedMode);
  }

  offset_type offsets;
  if (restrictedMode)
    A.getCrsGraph()->getLocalOffRankOffsets(offsets);
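  // offsets(i) is the index into the local matrix's entries/values
  // arrays at which row i's off-rank (ghost) columns begin; the
  // functors above compare entry indices against it to split a row's
  // on-rank and off-rank contributions.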

  // Get a vector for the R_rowMap output residual, handling the
  // non-constant stride and exporter cases. Since R gets clobbered,
  // we don't need to worry about the data in it.
  RCP<MV> R_rowMap;
  if(exporter.is_null()) {
    if (! R_in.isConstantStride ()) {
      R_rowMap = A.getRowMapMultiVector(R_in);
    }
    else {
      R_rowMap = rcpFromRef (R_in);
    }
  }
  else {
    R_rowMap = A.getRowMapMultiVector (R_in);
  }

  // Get a vector for the B_rowMap right-hand side, handling the
  // non-constant stride and exporter cases.
  RCP<const MV> B_rowMap;
  if(exporter.is_null()) {
    if (! B_in.isConstantStride ()) {
      // Do an allocation here. If we need to optimize this later, we can have the matrix
      // cache this.
      RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
      Tpetra::deep_copy (*B_rowMapNonConst, B_in);
      B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
    }
    else {
      B_rowMap = rcpFromRef (B_in);
    }
  }
  else {
    // Do an allocation here. If we need to optimize this later, we can have the matrix
    // cache this.
    ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: B Import");
    RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
    B_rowMapNonConst->doImport(B_in, *exporter, ADD);
    B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
  }

  // If we have a nontrivial Export object, we must perform an
  // Export. In that case, the local multiply result will go into
  // the row Map multivector. We don't have to make a
  // constant-stride version of R_in in this case, because we had to
  // make a constant stride R_rowMap MV and do an Export anyway.
  if (! exporter.is_null ()) {
    if ( ! importer.is_null ())
      X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
    if (restrictedMode && !importer.is_null ())
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
    else
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);

    {
      ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: R Export");

      // Do the Export operation.
      R_in.doExport (*R_rowMap, *exporter, ADD);
    }
  }
  else { // Don't do an Export: row Map and range Map are the same.

    if (restrictedMode)
      if (overlapCommunicationAndComputation) {
        localResidualWithCommCompOverlap (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
      } else {
        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
        localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
      }
    else {
      if ( ! importer.is_null ())
        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
    }

    //
    // If R_in does not have constant stride,
    // then we can't let the kernel write directly to R_in.
    // Instead, we have to use the cached row (== range)
    // Map MV as temporary storage.
    //
    if (! R_in.isConstantStride () ) {
      // We need to be sure to do a copy out in this case.
      Tpetra::deep_copy (R_in, *R_rowMap);
    }
  }

  // If the range Map is a locally replicated Map, sum up
  // contributions from each process.
  if (R_is_replicated) {
    ProfilingRegion regionReduce ("Tpetra::CrsMatrix::residual: Reduce Y");
    R_in.reduce ();
  }
}

  } // namespace Details
} // namespace Tpetra

#endif // TPETRA_DETAILS_RESIDUAL_HPP
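
A minimal usage sketch. The wrapper function computeResidual, its parameter names, and the default template arguments are illustrative, not part of this header; it assumes a fill-complete matrix and MultiVectors compatible with its domain, range, and row maps:

#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Tpetra_Details_residual.hpp"

// Compute r = b - A*x. Tpetra::Details::residual handles the Import of
// x's off-rank entries (and any Export of r) internally, fusing the
// sparse mat-vec with the update instead of calling apply() + update().
void computeResidual (const Tpetra::CrsMatrix<>& A,
                      const Tpetra::MultiVector<>& x,
                      const Tpetra::MultiVector<>& b,
                      Tpetra::MultiVector<>& r)
{
  Tpetra::Details::residual (A, x, b, r);
}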