// @HEADER
// ***********************************************************************
//
// Tpetra: Templated Linear Algebra Services Package
// Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//
// ************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_RESIDUAL_HPP
#define TPETRA_DETAILS_RESIDUAL_HPP

#include "TpetraCore_config.h"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_LocalCrsMatrixOperator.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Teuchos_RCP.hpp"
#include "Teuchos_ScalarTraits.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_Profiling.hpp"

/// \file Tpetra_Details_residual.hpp
/// \brief Functions for computing the residual R = B - A*X.

namespace Tpetra {
  namespace Details {

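/// \brief Heuristically choose Kokkos::TeamPolicy launch parameters
///   (team size, vector length, and rows per team) for the residual
///   kernels below, based on the local matrix's row count and average
///   entries per row.  team_size and vector_length are in/out
///   arguments; pass -1 to let the heuristic pick them.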
template<class ExecutionSpace>
int64_t
residual_launch_parameters (int64_t numRows,
                            int64_t nnz,
                            int64_t rows_per_thread,
                            int& team_size,
                            int& vector_length)
{
  using execution_space = typename ExecutionSpace::execution_space;

  int64_t rows_per_team;
  int64_t nnz_per_row = nnz/numRows;

  if (nnz_per_row < 1) {
    nnz_per_row = 1;
  }

  if (vector_length < 1) {
    vector_length = 1;
    while (vector_length<32 && vector_length*6 < nnz_per_row) {
      vector_length *= 2;
    }
  }

  // Determine rows per thread
  if (rows_per_thread < 1) {
#ifdef KOKKOS_ENABLE_CUDA
    if (std::is_same<Kokkos::Cuda, execution_space>::value) {
      rows_per_thread = 1;
    }
    else
#endif
    {
      if (nnz_per_row < 20 && nnz > 5000000) {
        rows_per_thread = 256;
      }
      else {
        rows_per_thread = 64;
      }
    }
  }

#ifdef KOKKOS_ENABLE_CUDA
  if (team_size < 1) {
    if (std::is_same<Kokkos::Cuda,execution_space>::value) {
      team_size = 256/vector_length;
    }
    else {
      team_size = 1;
    }
  }
#endif

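  // If team_size was not set above (e.g., in a non-CUDA build), it is
  // still -1 here; the product below then comes out negative, which
  // triggers the nnz-based fallback heuristic that follows.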
  rows_per_team = rows_per_thread * team_size;

  if (rows_per_team < 0) {
    int64_t nnz_per_team = 4096;
    int64_t conc = execution_space::concurrency ();
    while ((conc * nnz_per_team * 4 > nnz) &&
           (nnz_per_team > 256)) {
      nnz_per_team /= 2;
    }
    rows_per_team = (nnz_per_team + nnz_per_row - 1) / nnz_per_row;
  }

  return rows_per_team;
}

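// Example (illustrative only): how the heuristic above is meant to be
// called.  The sizes are hypothetical; passing -1 for rows_per_thread,
// team_size, and vector_length asks the heuristic to choose all of
// them.  This mirrors the call made in localResidual() below.
//
//   int team_size = -1;
//   int vector_length = -1;
//   int64_t rows_per_thread = -1;
//   int64_t rows_per_team =
//     residual_launch_parameters<execution_space> (1000000, 10000000,
//                                                  rows_per_thread,
//                                                  team_size,
//                                                  vector_length);
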
template<class SC, class LO, class GO, class NO>
void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
                   const MultiVector<SC,LO,GO,NO> & X,
                   const MultiVector<SC,LO,GO,NO> & B,
                   MultiVector<SC,LO,GO,NO> & R) {
  using Tpetra::Details::ProfilingRegion;
  using Teuchos::NO_TRANS;
  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual");

  auto A_lcl = A.getLocalMatrix ();
  auto X_lcl = X.getLocalViewDevice ();
  auto B_lcl = B.getLocalViewDevice ();
  auto R_lcl = R.getLocalViewDevice ();

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION
      (X.getNumVectors () != R.getNumVectors (), std::runtime_error,
       "X.getNumVectors() = " << X.getNumVectors () << " != "
       "R.getNumVectors() = " << R.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (B.getNumVectors () != R.getNumVectors (), std::runtime_error,
       "B.getNumVectors() = " << B.getNumVectors () << " != "
       "R.getNumVectors() = " << R.getNumVectors () << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (X.getLocalLength () !=
       A.getColMap ()->getNodeNumElements (), std::runtime_error,
       "X has the wrong number of local rows.  "
       "X.getLocalLength() = " << X.getLocalLength () << " != "
       "A.getColMap()->getNodeNumElements() = " <<
       A.getColMap ()->getNodeNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (R.getLocalLength () !=
       A.getRowMap ()->getNodeNumElements (), std::runtime_error,
       "R has the wrong number of local rows.  "
       "R.getLocalLength() = " << R.getLocalLength () << " != "
       "A.getRowMap()->getNodeNumElements() = " <<
       A.getRowMap ()->getNodeNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (B.getLocalLength () !=
       A.getRowMap ()->getNodeNumElements (), std::runtime_error,
       "B has the wrong number of local rows.  "
       "B.getLocalLength() = " << B.getLocalLength () << " != "
       "A.getRowMap()->getNodeNumElements() = " <<
       A.getRowMap ()->getNodeNumElements () << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
       "fill complete.  You must call fillComplete() (possibly with "
       "domain and range Map arguments) without an intervening "
       "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION
      (! X.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
       std::runtime_error, "X, B, and R must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION
      ((X_lcl.data () == R_lcl.data () && X_lcl.data () != nullptr) ||
       (X_lcl.data () == B_lcl.data () && X_lcl.data () != nullptr),
       std::runtime_error, "X may not alias either R or B.");
  }

  SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
#ifdef TPETRA_DETAILS_USE_REFERENCE_RESIDUAL
  // This is currently a "reference implementation," waiting until
  // Kokkos Kernels provides a residual kernel.
  A.localApply(X,R,Teuchos::NO_TRANS, one, zero);
  R.update(one,B,negone);
#else
  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows ();
  int64_t myNnz = A_lcl.nnz();

  // Check for imbalanced rows.  The imbalance is the difference between
  // the longest local row and the mean local row length.  If it crosses
  // the threshold that would make SPMV use a merge-path algorithm, take
  // that path here too, then finish with the reference residual update.
  LO maxRowImbalance = 0;
  if(numLocalRows != 0)
    maxRowImbalance = A.getNodeMaxNumRowEntries() - (myNnz / numLocalRows);
  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
  {
    // note: lclOp will be wrapped in a shared_ptr
    auto lclOp = A.getLocalMultiplyOperator();
    // Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator
    lclOp->applyImbalancedRows (X_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
    R.update(one,B,negone);
    return;
  }

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  int64_t rows_per_team =
    residual_launch_parameters<execution_space> (numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;

  using policy_type = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename policy_type::member_type;

  using residual_value_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_value_type;
  using KAT = Kokkos::ArithTraits<residual_value_type>;

  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }

  bool is_vector = (X_lcl.extent(1) == 1);

  if(is_vector) {
    // Vector case
    // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp
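    //
    // Parallel hierarchy: the league runs over worksets of
    // rows_per_team rows; each team spreads its workset over threads
    // (TeamThreadRange); each thread reduces one row's dot product
    // across its vector lanes (ThreadVectorRange).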
    Kokkos::parallel_for("residual-vector",policy,KOKKOS_LAMBDA(const team_member& dev) {
      Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
        const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;

        if (lclRow >= A_lcl.numRows ()) {
          return;
        }

        const auto A_row = A_lcl.rowConst(lclRow);
        const LO row_length = static_cast<LO> (A_row.length);
        residual_value_type A_x = KAT::zero ();

        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) {
          const auto A_val = A_row.value(iEntry);
          lsum += A_val * X_lcl(A_row.colidx(iEntry),0);
        }, A_x);

        Kokkos::single(Kokkos::PerThread(dev),[&] () {
          R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
        });
      });//end parallel_for TeamThreadRange
    });//end parallel_for "residual-vector"
  } else {
    // MultiVector case
    // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp
    Kokkos::parallel_for("residual-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) {
      // NOTE: It looks like this value could be hoisted out of the
      // kernel, but doing so triggers internal compiler errors (in
      // gcc's "gimplify" pass).
      const LO numVectors = static_cast<LO>(X_lcl.extent(1));
      Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
        const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;

        if (lclRow >= A_lcl.numRows ()) {
          return;
        }
        const auto A_row = A_lcl.rowConst(lclRow);
        const LO row_length = static_cast<LO> (A_row.length);
        for(LO v=0; v<numVectors; v++) {
          residual_value_type A_x = KAT::zero ();

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) {
            const auto A_val = A_row.value(iEntry);
            lsum += A_val * X_lcl(A_row.colidx(iEntry),v);
          }, A_x);

          Kokkos::single(Kokkos::PerThread(dev),[&] () {
            R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
          });

        }//end for numVectors
      });//end parallel_for TeamThreadRange
    });//end parallel_for "residual-multivector"
  }// end else
#endif
}
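
// Note: defining TPETRA_DETAILS_USE_REFERENCE_RESIDUAL at configure time
// replaces the fused kernels above with the unfused reference
// implementation (localApply followed by update).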


/// \brief Computes R = B - A * X.
template<class SC, class LO, class GO, class NO>
void residual(const Operator<SC,LO,GO,NO> & Aop,
              const MultiVector<SC,LO,GO,NO> & X_in,
              const MultiVector<SC,LO,GO,NO> & B_in,
              MultiVector<SC,LO,GO,NO> & R_in) {
  using Tpetra::Details::ProfilingRegion;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::rcpFromRef;

  const CrsMatrix<SC,LO,GO,NO> * Apt = dynamic_cast<const CrsMatrix<SC,LO,GO,NO>*>(&Aop);
  if(!Apt) {
    // If Aop is not a CrsMatrix, we can't fuse the kernels, so just do apply + update
    SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
    Aop.apply(X_in,R_in,Teuchos::NO_TRANS, one, zero);
    R_in.update(one,B_in,negone);
    return;
  }
  const CrsMatrix<SC,LO,GO,NO> & A = *Apt;

  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;
  using export_type = typename CrsMatrix<SC,LO,GO,NO>::export_type;
  using MV = MultiVector<SC,LO,GO,NO>;

  // We treat the case of a replicated MV output specially.
  const bool R_is_replicated =
    (! R_in.isDistributed () && A.getComm ()->getSize () != 1);

  // It's possible that R is a view of X or B.
  // We don't try to detect the more subtle cases (e.g., one is a
  // subview of the other, but their initial pointers differ).  We
  // only need to do this if this matrix's Import is trivial;
  // otherwise, we don't actually apply the operator from X into R.

  RCP<const import_type> importer = A.getGraph ()->getImporter ();
  RCP<const export_type> exporter = A.getGraph ()->getExporter ();

  // Temporary MV for the Import operation.  After the block of code
  // below, this will be an (Imported if necessary) column Map MV
  // ready to give to localApply(...).
  RCP<const MV> X_colMap;
  if (importer.is_null ()) {
    if (! X_in.isConstantStride ()) {
      // Not all sparse mat-vec kernels can handle an input MV with
      // nonconstant stride correctly, so we have to copy it in that
      // case into a constant stride MV.  To make a constant stride
      // copy of X_in, we force creation of the column (== domain)
      // Map MV (if it hasn't already been created, else fetch the
      // cached copy).  This avoids creating a new MV each time.
      RCP<MV> X_colMapNonConst = A.getColumnMapMultiVector (X_in, true);
      Tpetra::deep_copy (*X_colMapNonConst, X_in);
      X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
    }
    else {
      // The domain and column Maps are the same, so do the local
      // multiply using the domain Map input MV X_in.
      X_colMap = rcpFromRef (X_in);
    }
  }
  else { // need to Import the source (multi)vector
    ProfilingRegion regionImport ("Tpetra::CrsMatrix::residual: Import");
    // We're doing an Import anyway, which will copy the relevant
    // elements of the domain Map MV X_in into a separate column Map
    // MV.  Thus, we don't have to worry whether X_in is constant
    // stride.
    RCP<MV> X_colMapNonConst = A.getColumnMapMultiVector (X_in);

    // Import from the domain Map MV to the column Map MV.
    X_colMapNonConst->doImport (X_in, *importer, INSERT);
    X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
  }

  // Get a vector for the R_rowMap output residual, handling the
  // non-constant stride and exporter cases.  Since R gets clobbered,
  // we don't need to worry about the data in it.
  RCP<MV> R_rowMap;
  if(exporter.is_null()) {
    if (! R_in.isConstantStride ()) {
      R_rowMap = A.getRowMapMultiVector(R_in);
    }
    else {
      R_rowMap = rcpFromRef (R_in);
    }
  }
  else {
    R_rowMap = A.getRowMapMultiVector (R_in);
  }

  // Get a vector for the B_rowMap right-hand side, handling the
  // non-constant stride and exporter cases.
  RCP<const MV> B_rowMap;
  if(exporter.is_null()) {
    if (! B_in.isConstantStride ()) {
      // Do an allocation here.  If we need to optimize this later, we
      // can have the matrix cache this.
      RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
      Tpetra::deep_copy (*B_rowMapNonConst, B_in);
      B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
    }
    else {
      B_rowMap = rcpFromRef (B_in);
    }
  }
  else {
    // Do an allocation here.  If we need to optimize this later, we
    // can have the matrix cache this.
    ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: B Import");
    RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
    B_rowMapNonConst->doImport(B_in, *exporter, ADD);
    B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
  }

  // If we have a nontrivial Export object, we must perform an
  // Export.  In that case, the local multiply result will go into
  // the row Map multivector.  We don't have to make a
  // constant-stride version of R_in in this case, because we had to
  // make a constant-stride R_rowMap MV and do an Export anyway.
  if (! exporter.is_null ()) {

    localResidual (A, *X_colMap, *B_rowMap, *R_rowMap);

    {
      ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: R Export");

      // Do the Export operation.
      R_in.doExport (*R_rowMap, *exporter, ADD);
    }
  }
  else { // Don't do an Export: row Map and range Map are the same.
    //
    // If R_in does not have constant stride,
    // then we can't let the kernel write directly to R_in.
    // Instead, we have to use the cached row (== range)
    // Map MV as temporary storage.
    //
    if (! R_in.isConstantStride () ) {
      // We need to be sure to do a copy back out in this case.
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap);
      Tpetra::deep_copy (R_in, *R_rowMap);
    }
    else {
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap);
    }
  }

  // If the range Map is a locally replicated Map, sum up
  // contributions from each process.
  if (R_is_replicated) {
    ProfilingRegion regionReduce ("Tpetra::CrsMatrix::residual: Reduce R");
    R_in.reduce ();
  }
}
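
// Example (illustrative only): computing a residual with the free
// function above.  Assumes A is an already fillComplete'd matrix and
// that X, B, and R use A's domain and range Maps; the type aliases use
// Tpetra's default template arguments.
//
//   using crs_matrix_type = Tpetra::CrsMatrix<>;
//   using mv_type = Tpetra::MultiVector<>;
//   Teuchos::RCP<crs_matrix_type> A = ...;  // assumed already set up
//   mv_type X (A->getDomainMap (), 1);
//   mv_type B (A->getRangeMap (), 1);
//   mv_type R (A->getRangeMap (), 1);
//   Tpetra::Details::residual (*A, X, B, R);  // R = B - A*X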

  } // namespace Details
} // namespace Tpetra

#endif // TPETRA_DETAILS_RESIDUAL_HPP