Ifpack2 Templated Preconditioning Package  Version 1.0
Ifpack2_Details_ScaledDampedResidual_def.hpp
/*
//@HEADER
// ***********************************************************************
//
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
// Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// ***********************************************************************
//@HEADER
*/

#ifndef IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_DEF_HPP
#define IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_DEF_HPP

#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Tpetra_Operator.hpp"
#include "Tpetra_Vector.hpp"
#include "Tpetra_withLocalAccess_MultiVector.hpp"
#include "Tpetra_Export_decl.hpp"
#include "Tpetra_Import_decl.hpp"
#include "Kokkos_ArithTraits.hpp"
#include "Teuchos_Assert.hpp"
#include <type_traits>

namespace Ifpack2 {
namespace Details {
namespace Impl {

/// \brief Functor for computing W := alpha * D * (B - A*X) + beta * W.
template<class WVector,
         class DVector,
         class BVector,
         class AMatrix,
         class XVector,
         class Scalar,
         bool use_beta>
struct ScaledDampedResidualVectorFunctor {
  static_assert (static_cast<int> (WVector::Rank) == 1,
                 "WVector must be a rank 1 View.");
  static_assert (static_cast<int> (DVector::Rank) == 1,
                 "DVector must be a rank 1 View.");
  static_assert (static_cast<int> (BVector::Rank) == 1,
                 "BVector must be a rank 1 View.");
  static_assert (static_cast<int> (XVector::Rank) == 1,
                 "XVector must be a rank 1 View.");

  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;

  const Scalar alpha;
  WVector m_w;
  DVector m_d;
  BVector m_b;
  AMatrix m_A;
  XVector m_x;
  const Scalar beta;

  const LO rows_per_team;

  ScaledDampedResidualVectorFunctor (const Scalar& alpha_,
                                     const WVector& m_w_,
                                     const DVector& m_d_,
                                     const BVector& m_b_,
                                     const AMatrix& m_A_,
                                     const XVector& m_x_,
                                     const Scalar& beta_,
                                     const int rows_per_team_) :
    alpha (alpha_),
    m_w (m_w_),
    m_d (m_d_),
    m_b (m_b_),
    m_A (m_A_),
    m_x (m_x_),
    beta (beta_),
    rows_per_team (rows_per_team_)
  {
    const size_t numRows = m_A.numRows ();
    const size_t numCols = m_A.numCols ();

    TEUCHOS_ASSERT( m_w.extent (0) == m_d.extent (0) );
    TEUCHOS_ASSERT( m_w.extent (0) == m_b.extent (0) );
    TEUCHOS_ASSERT( numRows == size_t (m_w.extent (0)) );
    TEUCHOS_ASSERT( numCols <= size_t (m_x.extent (0)) );
  }

  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    using residual_value_type = typename BVector::non_const_value_type;
    using KAT = Kokkos::ArithTraits<residual_value_type>;

    Kokkos::parallel_for
      (Kokkos::TeamThreadRange (dev, 0, rows_per_team),
       [&] (const LO& loop) {
         const LO lclRow =
           static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
         if (lclRow >= m_A.numRows ()) {
           return;
         }
         const auto A_row = m_A.rowConst(lclRow);
         const LO row_length = static_cast<LO> (A_row.length);
         residual_value_type A_x = KAT::zero ();

         Kokkos::parallel_reduce
           (Kokkos::ThreadVectorRange (dev, row_length),
            [&] (const LO iEntry, residual_value_type& lsum) {
              const auto A_val = A_row.value(iEntry);
              lsum += A_val * m_x(A_row.colidx(iEntry));
            }, A_x);

         Kokkos::single
           (Kokkos::PerThread(dev),
            [&] () {
              const auto alpha_D_res =
                alpha * m_d(lclRow) * (m_b(lclRow) - A_x);
              if (use_beta) {
                m_w(lclRow) = beta * m_w(lclRow) + alpha_D_res;
              }
              else {
                m_w(lclRow) = alpha_D_res;
              }
            });
       });
  }
};
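
// For reference, the per-row update this functor performs, written as a
// plain serial sketch (illustrative only; rowptr/colind/val are hypothetical
// CRS arrays for A, and the real kernel distributes rows over teams,
// threads, and vector lanes):
//
//   for (LO i = 0; i < numRows; ++i) {
//     value_type A_x = 0;
//     for (auto k = rowptr[i]; k < rowptr[i+1]; ++k) {
//       A_x += val[k] * x[colind[k]];                    // (A*x)(i)
//     }
//     w[i] = use_beta ? beta * w[i] + alpha * d[i] * (b[i] - A_x)
//                     : alpha * d[i] * (b[i] - A_x);
//   }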

template<class ExecutionSpace>
int64_t
scaled_damped_residual_vector_launch_parameters (int64_t numRows,
                                                 int64_t nnz,
                                                 int64_t rows_per_thread,
                                                 int& team_size,
                                                 int& vector_length)
{
  using execution_space = typename ExecutionSpace::execution_space;

  int64_t rows_per_team;
  int64_t nnz_per_row = nnz/numRows;

  if (nnz_per_row < 1) {
    nnz_per_row = 1;
  }

  if (vector_length < 1) {
    vector_length = 1;
    while (vector_length<32 && vector_length*6 < nnz_per_row) {
      vector_length *= 2;
    }
  }

  // Determine rows per thread
  if (rows_per_thread < 1) {
#ifdef KOKKOS_ENABLE_CUDA
    if (std::is_same<Kokkos::Cuda, execution_space>::value) {
      rows_per_thread = 1;
    }
    else
#endif
    {
      if (nnz_per_row < 20 && nnz > 5000000) {
        rows_per_thread = 256;
      }
      else {
        rows_per_thread = 64;
      }
    }
  }

#ifdef KOKKOS_ENABLE_CUDA
  if (team_size < 1) {
    if (std::is_same<Kokkos::Cuda,execution_space>::value) {
      team_size = 256/vector_length;
    }
    else {
      team_size = 1;
    }
  }
#endif

  rows_per_team = rows_per_thread * team_size;

  if (rows_per_team < 0) {
    int64_t nnz_per_team = 4096;
    int64_t conc = execution_space::concurrency ();
    while ((conc * nnz_per_team * 4 > nnz) &&
           (nnz_per_team > 256)) {
      nnz_per_team /= 2;
    }
    rows_per_team = (nnz_per_team + nnz_per_row - 1) / nnz_per_row;
  }

  return rows_per_team;
}
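
// A worked example of the heuristic above (assumed inputs, CUDA backend):
// for a matrix with numRows = 1,000,000 and nnz = 27,000,000
// (nnz_per_row = 27), starting from team_size = vector_length =
// rows_per_thread = -1:
//
//   vector_length:   1 -> 2 -> 4 -> 8   (stops once 8*6 >= 27)
//   rows_per_thread: 1                  (CUDA branch)
//   team_size:       256 / 8 = 32
//   rows_per_team:   1 * 32 = 32
//
// When CUDA is not enabled at all, team_size keeps the caller's -1, so
// rows_per_team comes out negative and the nnz_per_team-based fallback at
// the end determines the result instead.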

// W := alpha * D * (B - A*X) + beta * W.
template<class WVector,
         class DVector,
         class BVector,
         class AMatrix,
         class XVector,
         class Scalar>
static void
scaled_damped_residual_vector
(const Scalar& alpha,
 const WVector& w,
 const DVector& d,
 const BVector& b,
 const AMatrix& A,
 const XVector& x,
 const Scalar& beta)
{
  using execution_space = typename AMatrix::execution_space;

  if (A.numRows () == 0) {
    return;
  }

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  const int64_t rows_per_team =
    scaled_damped_residual_vector_launch_parameters<execution_space>
      (A.numRows (), A.nnz (), rows_per_thread, team_size, vector_length);
  int64_t worksets = (b.extent (0) + rows_per_team - 1) / rows_per_team;

  using Kokkos::Dynamic;
  using Kokkos::Schedule;
  using Kokkos::TeamPolicy;
  using policy_type = TeamPolicy<execution_space, Schedule<Dynamic>>;
  const char kernel_label[] = "scaled_damped_residual_vector";
  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }

  // Canonicalize template arguments to avoid redundant instantiations.
  using w_vec_type = typename WVector::non_const_type;
  using d_vec_type = typename DVector::const_type;
  using b_vec_type = typename BVector::const_type;
  using matrix_type = AMatrix;
  using x_vec_type = typename XVector::const_type;
  using scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;

  if (beta == Kokkos::ArithTraits<Scalar>::zero ()) {
    constexpr bool use_beta = false;
    using functor_type =
      ScaledDampedResidualVectorFunctor<w_vec_type, d_vec_type,
                                        b_vec_type, matrix_type,
                                        x_vec_type, scalar_type,
                                        use_beta>;
    functor_type func (alpha, w, d, b, A, x, beta, rows_per_team);
    Kokkos::parallel_for (kernel_label, policy, func);
  }
  else {
    constexpr bool use_beta = true;
    using functor_type =
      ScaledDampedResidualVectorFunctor<w_vec_type, d_vec_type,
                                        b_vec_type, matrix_type,
                                        x_vec_type, scalar_type,
                                        use_beta>;
    functor_type func (alpha, w, d, b, A, x, beta, rows_per_team);
    Kokkos::parallel_for (kernel_label, policy, func);
  }
}
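
// A minimal sketch of calling the kernel above directly on local data
// (illustrative only; A_lcl is assumed to be a KokkosSparse::CrsMatrix and
// w, d, b, x rank-1 Kokkos::Views sized consistently with A_lcl):
//
//   const double alpha = 1.0;
//   const double beta  = 0.0;   // beta == 0 selects the branch that never reads w
//   scaled_damped_residual_vector (alpha, w, d, b, A_lcl, x, beta);
//   // Afterwards, w(i) = d(i) * (b(i) - (A*x)(i)) for each local row i.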

} // namespace Impl

template<class TpetraOperatorType>
ScaledDampedResidual<TpetraOperatorType>::
ScaledDampedResidual (const Teuchos::RCP<const operator_type>& A)
{
  setMatrix (A);
}

template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
setMatrix (const Teuchos::RCP<const operator_type>& A)
{
  if (A_op_.get () != A.get ()) {
    A_op_ = A;

    // We'll (re)allocate these on demand.
    X_colMap_ = std::unique_ptr<vector_type> (nullptr);
    V1_ = std::unique_ptr<multivector_type> (nullptr);

    using Teuchos::rcp_dynamic_cast;
    Teuchos::RCP<const crs_matrix_type> A_crs =
      rcp_dynamic_cast<const crs_matrix_type> (A);
    if (A_crs.is_null ()) {
      A_crs_ = Teuchos::null;
      imp_ = Teuchos::null;
      exp_ = Teuchos::null;
    }
    else {
      TEUCHOS_ASSERT( A_crs->isFillComplete () );
      A_crs_ = A_crs;
      auto G = A_crs->getCrsGraph ();
      imp_ = G->getImporter ();
      exp_ = G->getExporter ();
    }
  }
}

template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
compute (multivector_type& W,
         const SC& alpha,
         vector_type& D_inv,
         multivector_type& B,
         multivector_type& X,
         const SC& beta)
{
  using Teuchos::RCP;
  using Teuchos::rcp;

  if (canFuse (B)) {
    // "nonconst" here has no effect other than on the return type.
    RCP<vector_type> W_vec = W.getVectorNonConst (0);
    RCP<vector_type> B_vec = B.getVectorNonConst (0);
    RCP<vector_type> X_vec = X.getVectorNonConst (0);
    TEUCHOS_ASSERT( ! A_crs_.is_null () );
    fusedCase (*W_vec, alpha, D_inv, *B_vec, *A_crs_, *X_vec, beta);
  }
  else {
    TEUCHOS_ASSERT( ! A_op_.is_null () );
    unfusedCase (W, alpha, D_inv, B, *A_op_, X, beta);
  }
}
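
// A minimal usage sketch of this class (assumed setup; A is a fill-complete
// Tpetra::CrsMatrix held by RCP, W, B, X are Tpetra::MultiVectors and D_inv
// a Tpetra::Vector on compatible Maps):
//
//   using op_type = Tpetra::Operator<SC, LO, GO, NT>;
//   Ifpack2::Details::ScaledDampedResidual<op_type> sdr (A);
//   const SC one  = Teuchos::ScalarTraits<SC>::one ();
//   const SC zero = Teuchos::ScalarTraits<SC>::zero ();
//   sdr.compute (W, one, D_inv, B, X, zero);   // W := D_inv .* (B - A*X)
//
// With a single-column B, a fill-complete CrsMatrix, and no Export, compute
// takes the fused path below; otherwise it falls back to unfusedCase
// (apply followed by elementWiseMultiply).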

template<class TpetraOperatorType>
typename ScaledDampedResidual<TpetraOperatorType>::vector_type&
ScaledDampedResidual<TpetraOperatorType>::
importVector (vector_type& X_domMap)
{
  if (imp_.is_null ()) {
    return X_domMap;
  }
  else {
    if (X_colMap_.get () == nullptr) {
      using V = vector_type;
      X_colMap_ = std::unique_ptr<V> (new V (imp_->getTargetMap ()));
    }
    X_colMap_->doImport (X_domMap, *imp_, Tpetra::REPLACE);
    return *X_colMap_;
  }
}

template<class TpetraOperatorType>
bool
ScaledDampedResidual<TpetraOperatorType>::
canFuse (const multivector_type& B) const
{
  return B.getNumVectors () == size_t (1) &&
    ! A_crs_.is_null () &&
    exp_.is_null ();
}

template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
unfusedCase (multivector_type& W,
             const SC& alpha,
             vector_type& D_inv,
             multivector_type& B,
             const operator_type& A,
             multivector_type& X,
             const SC& beta)
{
  if (V1_.get () == nullptr) {
    using MV = multivector_type;
    const size_t numVecs = B.getNumVectors ();
    V1_ = std::unique_ptr<MV> (new MV (B.getMap (), numVecs));
  }
  const SC one = Teuchos::ScalarTraits<SC>::one ();

  // V1 = B - A*X
  Tpetra::deep_copy (*V1_, B);
  A.apply (X, *V1_, Teuchos::NO_TRANS, -one, one);

  // W := alpha * D_inv * V1 + beta * W
  W.elementWiseMultiply (alpha, D_inv, *V1_, beta);
}

template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
fusedCase (vector_type& W,
           const SC& alpha,
           vector_type& D_inv,
           vector_type& B,
           const crs_matrix_type& A,
           vector_type& X,
           const SC& beta)
{
  vector_type& X_colMap = importVector (X);

  // Only need these aliases because we lack C++14 generic lambdas.
  using Tpetra::with_local_access_function_argument_type;
  using ro_lcl_vec_type =
    with_local_access_function_argument_type<
      decltype (readOnly (B))>;
  using wo_lcl_vec_type =
    with_local_access_function_argument_type<
      decltype (writeOnly (B))>;
  using rw_lcl_vec_type =
    with_local_access_function_argument_type<
      decltype (readWrite (B))>;

  using Tpetra::withLocalAccess;
  using Tpetra::readOnly;
  using Tpetra::readWrite;
  using Tpetra::writeOnly;
  using Impl::scaled_damped_residual_vector;
  using STS = Teuchos::ScalarTraits<SC>;

  auto A_lcl = A.getLocalMatrix ();
  if (beta == STS::zero ()) {
    withLocalAccess
      ([&] (const wo_lcl_vec_type& W_lcl,
            const ro_lcl_vec_type& D_lcl,
            const ro_lcl_vec_type& B_lcl,
            const ro_lcl_vec_type& X_lcl) {
         scaled_damped_residual_vector (alpha, W_lcl, D_lcl,
                                        B_lcl, A_lcl, X_lcl, beta);
       },
       writeOnly (W),
       readOnly (D_inv),
       readOnly (B),
       readOnly (X_colMap));
  }
  else { // need to read _and_ write W if beta != 0
    withLocalAccess
      ([&] (const rw_lcl_vec_type& W_lcl,
            const ro_lcl_vec_type& D_lcl,
            const ro_lcl_vec_type& B_lcl,
            const ro_lcl_vec_type& X_lcl) {
         scaled_damped_residual_vector (alpha, W_lcl, D_lcl,
                                        B_lcl, A_lcl, X_lcl, beta);
       },
       readWrite (W),
       readOnly (D_inv),
       readOnly (B),
       readOnly (X_colMap));
  }
}

} // namespace Details
} // namespace Ifpack2

#define IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_INSTANT(SC,LO,GO,NT) \
  template class Ifpack2::Details::ScaledDampedResidual<Tpetra::Operator<SC, LO, GO, NT> >;

#endif // IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_DEF_HPP