Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_Details_ChebyshevKernel_def.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_DETAILS_CHEBYSHEVKERNEL_DEF_HPP
11 #define IFPACK2_DETAILS_CHEBYSHEVKERNEL_DEF_HPP
12 
13 #include "Tpetra_CrsMatrix.hpp"
14 #include "Tpetra_MultiVector.hpp"
15 #include "Tpetra_Operator.hpp"
16 #include "Tpetra_Vector.hpp"
17 #include "Tpetra_Export_decl.hpp"
18 #include "Tpetra_Import_decl.hpp"
19 #include "Kokkos_ArithTraits.hpp"
20 #include "Teuchos_Assert.hpp"
21 #include <type_traits>
22 #include "KokkosSparse_spmv_impl.hpp"
23 
24 namespace Ifpack2 {
25 namespace Details {
26 namespace Impl {
27 
32 template<class WVector,
33  class DVector,
34  class BVector,
35  class AMatrix,
36  class XVector_colMap,
37  class XVector_domMap,
38  class Scalar,
39  bool use_beta,
40  bool do_X_update>
42  static_assert (static_cast<int> (WVector::rank) == 1,
43  "WVector must be a rank 1 View.");
44  static_assert (static_cast<int> (DVector::rank) == 1,
45  "DVector must be a rank 1 View.");
46  static_assert (static_cast<int> (BVector::rank) == 1,
47  "BVector must be a rank 1 View.");
48  static_assert (static_cast<int> (XVector_colMap::rank) == 1,
49  "XVector_colMap must be a rank 1 View.");
50  static_assert (static_cast<int> (XVector_domMap::rank) == 1,
51  "XVector_domMap must be a rank 1 View.");
52 
53  using execution_space = typename AMatrix::execution_space;
54  using LO = typename AMatrix::non_const_ordinal_type;
55  using value_type = typename AMatrix::non_const_value_type;
56  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
57  using team_member = typename team_policy::member_type;
58  using ATV = Kokkos::ArithTraits<value_type>;
59 
60  const Scalar alpha;
61  WVector m_w;
62  DVector m_d;
63  BVector m_b;
64  AMatrix m_A;
65  XVector_colMap m_x_colMap;
66  XVector_domMap m_x_domMap;
67  const Scalar beta;
68 
69  const LO rows_per_team;
70 
71  ChebyshevKernelVectorFunctor (const Scalar& alpha_,
72  const WVector& m_w_,
73  const DVector& m_d_,
74  const BVector& m_b_,
75  const AMatrix& m_A_,
76  const XVector_colMap& m_x_colMap_,
77  const XVector_domMap& m_x_domMap_,
78  const Scalar& beta_,
79  const int rows_per_team_) :
80  alpha (alpha_),
81  m_w (m_w_),
82  m_d (m_d_),
83  m_b (m_b_),
84  m_A (m_A_),
85  m_x_colMap (m_x_colMap_),
86  m_x_domMap (m_x_domMap_),
87  beta (beta_),
88  rows_per_team (rows_per_team_)
89  {
90  const size_t numRows = m_A.numRows ();
91  const size_t numCols = m_A.numCols ();
92 
93  TEUCHOS_ASSERT( m_w.extent (0) == m_d.extent (0) );
94  TEUCHOS_ASSERT( m_w.extent (0) == m_b.extent (0) );
95  TEUCHOS_ASSERT( numRows == size_t (m_w.extent (0)) );
96  TEUCHOS_ASSERT( numCols <= size_t (m_x_colMap.extent (0)) );
97  TEUCHOS_ASSERT( numRows <= size_t (m_x_domMap.extent (0)) );
98  }
99 
100  KOKKOS_INLINE_FUNCTION
101  void operator() (const team_member& dev) const
102  {
103  using residual_value_type = typename BVector::non_const_value_type;
104  using KAT = Kokkos::ArithTraits<residual_value_type>;
105 
106  Kokkos::parallel_for
107  (Kokkos::TeamThreadRange (dev, 0, rows_per_team),
108  [&] (const LO& loop) {
109  const LO lclRow =
110  static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
111  if (lclRow >= m_A.numRows ()) {
112  return;
113  }
114  const KokkosSparse::SparseRowViewConst<AMatrix> A_row = m_A.rowConst(lclRow);
115  const LO row_length = static_cast<LO> (A_row.length);
116  residual_value_type A_x = KAT::zero ();
117 
118  Kokkos::parallel_reduce
119  (Kokkos::ThreadVectorRange (dev, row_length),
120  [&] (const LO iEntry, residual_value_type& lsum) {
121  const auto A_val = A_row.value(iEntry);
122  lsum += A_val * m_x_colMap(A_row.colidx(iEntry));
123  }, A_x);
124 
125  Kokkos::single
126  (Kokkos::PerThread(dev),
127  [&] () {
128  const auto alpha_D_res =
129  alpha * m_d(lclRow) * (m_b(lclRow) - A_x);
130  if (use_beta) {
131  m_w(lclRow) = beta * m_w(lclRow) + alpha_D_res;
132  }
133  else {
134  m_w(lclRow) = alpha_D_res;
135  }
136  if (do_X_update)
137  m_x_domMap(lclRow) += m_w(lclRow);
138  });
139  });
140  }
141 
142 };
143 
144 
145 // W := alpha * D * (B - A*X) + beta * W.
146 template<class WVector,
147  class DVector,
148  class BVector,
149  class AMatrix,
150  class XVector_colMap,
151  class XVector_domMap,
152  class Scalar>
153 static void
154 chebyshev_kernel_vector
155 (const Scalar& alpha,
156  const WVector& w,
157  const DVector& d,
158  const BVector& b,
159  const AMatrix& A,
160  const XVector_colMap& x_colMap,
161  const XVector_domMap& x_domMap,
162  const Scalar& beta,
163  const bool do_X_update)
164 {
165  using execution_space = typename AMatrix::execution_space;
166 
167  if (A.numRows () == 0) {
168  return;
169  }
170 
171  int team_size = -1;
172  int vector_length = -1;
173  int64_t rows_per_thread = -1;
174 
175  const int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(A.numRows(), A.nnz(), rows_per_thread, team_size, vector_length);
176  int64_t worksets = (b.extent (0) + rows_per_team - 1) / rows_per_team;
177 
178  using Kokkos::Dynamic;
179  using Kokkos::Static;
180  using Kokkos::Schedule;
181  using Kokkos::TeamPolicy;
182  using policy_type_dynamic = TeamPolicy<execution_space, Schedule<Dynamic> >;
183  using policy_type_static = TeamPolicy<execution_space, Schedule<Static> >;
184  const char kernel_label[] = "chebyshev_kernel_vector";
185  policy_type_dynamic policyDynamic (1, 1);
186  policy_type_static policyStatic (1, 1);
187  if (team_size < 0) {
188  policyDynamic = policy_type_dynamic (worksets, Kokkos::AUTO, vector_length);
189  policyStatic = policy_type_static (worksets, Kokkos::AUTO, vector_length);
190  }
191  else {
192  policyDynamic = policy_type_dynamic (worksets, team_size, vector_length);
193  policyStatic = policy_type_static (worksets, team_size, vector_length);
194  }
195 
196  // Canonicalize template arguments to avoid redundant instantiations.
197  using w_vec_type = typename WVector::non_const_type;
198  using d_vec_type = typename DVector::const_type;
199  using b_vec_type = typename BVector::const_type;
200  using matrix_type = AMatrix;
201  using x_colMap_vec_type = typename XVector_colMap::const_type;
202  using x_domMap_vec_type = typename XVector_domMap::non_const_type;
203  using scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
204 
205  if (beta == Kokkos::ArithTraits<Scalar>::zero ()) {
206  constexpr bool use_beta = false;
207  if (do_X_update) {
208  using functor_type =
209  ChebyshevKernelVectorFunctor<w_vec_type, d_vec_type,
210  b_vec_type, matrix_type,
211  x_colMap_vec_type, x_domMap_vec_type,
212  scalar_type,
213  use_beta,
214  true>;
215  functor_type func (alpha, w, d, b, A, x_colMap, x_domMap, beta, rows_per_team);
216  if(A.nnz()>10000000)
217  Kokkos::parallel_for (kernel_label, policyDynamic, func);
218  else
219  Kokkos::parallel_for (kernel_label, policyStatic, func);
220  } else {
221  using functor_type =
222  ChebyshevKernelVectorFunctor<w_vec_type, d_vec_type,
223  b_vec_type, matrix_type,
224  x_colMap_vec_type, x_domMap_vec_type,
225  scalar_type,
226  use_beta,
227  false>;
228  functor_type func (alpha, w, d, b, A, x_colMap, x_domMap, beta, rows_per_team);
229  if(A.nnz()>10000000)
230  Kokkos::parallel_for (kernel_label, policyDynamic, func);
231  else
232  Kokkos::parallel_for (kernel_label, policyStatic, func);
233  }
234  }
235  else {
236  constexpr bool use_beta = true;
237  if (do_X_update) {
238  using functor_type =
239  ChebyshevKernelVectorFunctor<w_vec_type, d_vec_type,
240  b_vec_type, matrix_type,
241  x_colMap_vec_type, x_domMap_vec_type,
242  scalar_type,
243  use_beta,
244  true>;
245  functor_type func (alpha, w, d, b, A, x_colMap, x_domMap, beta, rows_per_team);
246  if(A.nnz()>10000000)
247  Kokkos::parallel_for (kernel_label, policyDynamic, func);
248  else
249  Kokkos::parallel_for (kernel_label, policyStatic, func);
250  } else {
251  using functor_type =
252  ChebyshevKernelVectorFunctor<w_vec_type, d_vec_type,
253  b_vec_type, matrix_type,
254  x_colMap_vec_type, x_domMap_vec_type,
255  scalar_type,
256  use_beta,
257  false>;
258  functor_type func (alpha, w, d, b, A, x_colMap, x_domMap, beta, rows_per_team);
259  if(A.nnz()>10000000)
260  Kokkos::parallel_for (kernel_label, policyDynamic, func);
261  else
262  Kokkos::parallel_for (kernel_label, policyStatic, func);
263  }
264  }
265 }
266 
267 } // namespace Impl
268 
269 template<class TpetraOperatorType>
270 ChebyshevKernel<TpetraOperatorType>::
271 ChebyshevKernel (const Teuchos::RCP<const operator_type>& A,
272  const bool useNativeSpMV):
273  useNativeSpMV_(useNativeSpMV)
274 {
275  setMatrix (A);
276 }
277 
278 template<class TpetraOperatorType>
279 void
280 ChebyshevKernel<TpetraOperatorType>::
281 setMatrix (const Teuchos::RCP<const operator_type>& A)
282 {
283  if (A_op_.get () != A.get ()) {
284  A_op_ = A;
285 
286  // We'll (re)allocate these on demand.
287  V1_ = std::unique_ptr<multivector_type> (nullptr);
288 
289  using Teuchos::rcp_dynamic_cast;
291  rcp_dynamic_cast<const crs_matrix_type> (A);
292  if (A_crs.is_null ()) {
293  A_crs_ = Teuchos::null;
294  imp_ = Teuchos::null;
295  exp_ = Teuchos::null;
296  X_colMap_ = nullptr;
297  }
298  else {
299  TEUCHOS_ASSERT( A_crs->isFillComplete () );
300  A_crs_ = A_crs;
301  auto G = A_crs->getCrsGraph ();
302  imp_ = G->getImporter ();
303  exp_ = G->getExporter ();
304  if (!imp_.is_null ()) {
305  if (X_colMap_.get () == nullptr ||
306  !X_colMap_->getMap()->isSameAs (*(imp_->getTargetMap ()))) {
307  X_colMap_ = std::unique_ptr<vector_type> (new vector_type (imp_->getTargetMap ()));
308  }
309  } else
310  X_colMap_ = nullptr;
311  }
312  }
313 }
314 
315 template<class TpetraOperatorType>
316 void
317 ChebyshevKernel<TpetraOperatorType>::
318 compute (multivector_type& W,
319  const SC& alpha,
320  vector_type& D_inv,
321  multivector_type& B,
322  multivector_type& X,
323  const SC& beta)
324 {
325  using Teuchos::RCP;
326  using Teuchos::rcp;
327 
328  if (canFuse (B)) {
329  // "nonconst" here has no effect other than on the return type.
330  W_vec_ = W.getVectorNonConst (0);
331  B_vec_ = B.getVectorNonConst (0);
332  X_vec_ = X.getVectorNonConst (0);
333  TEUCHOS_ASSERT( ! A_crs_.is_null () );
334  fusedCase (*W_vec_, alpha, D_inv, *B_vec_, *A_crs_, *X_vec_, beta);
335  }
336  else {
337  TEUCHOS_ASSERT( ! A_op_.is_null () );
338  unfusedCase (W, alpha, D_inv, B, *A_op_, X, beta);
339  }
340 }
341 
342 template<class TpetraOperatorType>
343 typename ChebyshevKernel<TpetraOperatorType>::vector_type&
344 ChebyshevKernel<TpetraOperatorType>::
345 importVector (vector_type& X_domMap)
346 {
347  if (imp_.is_null ()) {
348  return X_domMap;
349  }
350  else {
351  X_colMap_->doImport (X_domMap, *imp_, Tpetra::REPLACE);
352  return *X_colMap_;
353  }
354 }
355 
356 template<class TpetraOperatorType>
357 bool
358 ChebyshevKernel<TpetraOperatorType>::
359 canFuse (const multivector_type& B) const
360 {
361  // If override is enabled
362  if(useNativeSpMV_)
363  return false;
364 
365  // Some criteria must be met for fused kernel
366  return B.getNumVectors () == size_t (1) &&
367  ! A_crs_.is_null () &&
368  exp_.is_null ();
369 }
370 
371 template<class TpetraOperatorType>
372 void
373 ChebyshevKernel<TpetraOperatorType>::
374 unfusedCase (multivector_type& W,
375  const SC& alpha,
376  vector_type& D_inv,
377  multivector_type& B,
378  const operator_type& A,
379  multivector_type& X,
380  const SC& beta)
381 {
382  using STS = Teuchos::ScalarTraits<SC>;
383  if (V1_.get () == nullptr) {
384  using MV = multivector_type;
385  const size_t numVecs = B.getNumVectors ();
386  V1_ = std::unique_ptr<MV> (new MV (B.getMap (), numVecs));
387  }
388  const SC one = Teuchos::ScalarTraits<SC>::one ();
389 
390  // V1 = B - A*X
391  Tpetra::deep_copy (*V1_, B);
392  A.apply (X, *V1_, Teuchos::NO_TRANS, -one, one);
393 
394  // W := alpha * D_inv * V1 + beta * W
395  W.elementWiseMultiply (alpha, D_inv, *V1_, beta);
396 
397  // X := X + W
398  X.update (STS::one(), W, STS::one());
399 }
400 
401 template<class TpetraOperatorType>
402 void
403 ChebyshevKernel<TpetraOperatorType>::
404 fusedCase (vector_type& W,
405  const SC& alpha,
406  vector_type& D_inv,
407  vector_type& B,
408  const crs_matrix_type& A,
409  vector_type& X,
410  const SC& beta)
411 {
412  vector_type& X_colMap = importVector (X);
413 
414  using Impl::chebyshev_kernel_vector;
415  using STS = Teuchos::ScalarTraits<SC>;
416 
417  auto A_lcl = A.getLocalMatrixDevice ();
418  //D_inv, B, X and W are all Vectors, so it's safe to take the first column only
419  auto Dinv_lcl = Kokkos::subview(D_inv.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
420  auto B_lcl = Kokkos::subview(B.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
421  auto X_domMap_lcl = Kokkos::subview(X.getLocalViewDevice(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0);
422  auto X_colMap_lcl = Kokkos::subview(X_colMap.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
423 
424  const bool do_X_update = !imp_.is_null ();
425  if (beta == STS::zero ()) {
426  auto W_lcl = Kokkos::subview(W.getLocalViewDevice(Tpetra::Access::OverwriteAll), Kokkos::ALL(), 0);
427  chebyshev_kernel_vector (alpha, W_lcl, Dinv_lcl,
428  B_lcl, A_lcl,
429  X_colMap_lcl, X_domMap_lcl,
430  beta,
431  do_X_update);
432  }
433  else { // need to read _and_ write W if beta != 0
434  auto W_lcl = Kokkos::subview(W.getLocalViewDevice(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0);
435  chebyshev_kernel_vector (alpha, W_lcl, Dinv_lcl,
436  B_lcl, A_lcl,
437  X_colMap_lcl, X_domMap_lcl,
438  beta,
439  do_X_update);
440  }
441  if (!do_X_update)
442  X.update(STS::one (), W, STS::one ());
443 }
444 
445 } // namespace Details
446 } // namespace Ifpack2
447 
448 #define IFPACK2_DETAILS_CHEBYSHEVKERNEL_INSTANT(SC,LO,GO,NT) \
449  template class Ifpack2::Details::ChebyshevKernel<Tpetra::Operator<SC, LO, GO, NT> >;
450 
451 #endif // IFPACK2_DETAILS_CHEBYSHEVKERNEL_DEF_HPP
T * get() const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Functor for computing W := alpha * D * (B - A*X) + beta * W and X := X+W.
Definition: Ifpack2_Details_ChebyshevKernel_def.hpp:41
#define TEUCHOS_ASSERT(assertion_test)
bool is_null() const