#ifndef IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_DEF_HPP
#define IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_DEF_HPP

#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Tpetra_Operator.hpp"
#include "Tpetra_Vector.hpp"
#include "Tpetra_withLocalAccess_MultiVector.hpp"
#include "Tpetra_Export_decl.hpp"
#include "Tpetra_Import_decl.hpp"
#include "Kokkos_ArithTraits.hpp"
#include "Teuchos_Assert.hpp"
#include <type_traits>

namespace Ifpack2 {
namespace Details {
namespace Impl {
/// \brief Functor for computing W := alpha * D * (B - A*X) + beta * W.
template<class WVector,
         class DVector,
         class BVector,
         class AMatrix,
         class XVector,
         class Scalar,
         bool use_beta>
struct ScaledDampedResidualVectorFunctor {
  static_assert (static_cast<int> (WVector::Rank) == 1,
                 "WVector must be a rank 1 View.");
  static_assert (static_cast<int> (DVector::Rank) == 1,
                 "DVector must be a rank 1 View.");
  static_assert (static_cast<int> (BVector::Rank) == 1,
                 "BVector must be a rank 1 View.");
  static_assert (static_cast<int> (XVector::Rank) == 1,
                 "XVector must be a rank 1 View.");
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;

  const Scalar alpha;
  WVector m_w;
  DVector m_d;
  BVector m_b;
  AMatrix m_A;
  XVector m_x;
  const Scalar beta;
  const LO rows_per_team;

  ScaledDampedResidualVectorFunctor (const Scalar& alpha_,
                                     const WVector& m_w_,
                                     const DVector& m_d_,
                                     const BVector& m_b_,
                                     const AMatrix& m_A_,
                                     const XVector& m_x_,
                                     const Scalar& beta_,
                                     const int rows_per_team_) :
    alpha (alpha_), m_w (m_w_), m_d (m_d_), m_b (m_b_),
    m_A (m_A_), m_x (m_x_), beta (beta_),
    rows_per_team (rows_per_team_)
  {
    const size_t numRows = m_A.numRows ();
    const size_t numCols = m_A.numCols ();

    // Host-side dimension sanity checks at construction.
    TEUCHOS_ASSERT( m_w.extent (0) == m_d.extent (0) );
    TEUCHOS_ASSERT( m_w.extent (0) == m_b.extent (0) );
    TEUCHOS_ASSERT( numRows == size_t (m_w.extent (0)) );
    TEUCHOS_ASSERT( numCols <= size_t (m_x.extent (0)) );
  }
  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    using residual_value_type = typename BVector::non_const_value_type;
    using KAT = Kokkos::ArithTraits<residual_value_type>;

    Kokkos::parallel_for
      (Kokkos::TeamThreadRange (dev, 0, rows_per_team),
       [&] (const LO& loop) {
        const LO lclRow =
          static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
        if (lclRow >= m_A.numRows ()) {
          return;
        }
        const auto A_row = m_A.rowConst(lclRow);
        const LO row_length = static_cast<LO> (A_row.length);
        residual_value_type A_x = KAT::zero ();
        // Compute this row's contribution to (A*x)(lclRow).
        Kokkos::parallel_reduce
          (Kokkos::ThreadVectorRange (dev, row_length),
           [&] (const LO iEntry, residual_value_type& lsum) {
            const auto A_val = A_row.value(iEntry);
            lsum += A_val * m_x(A_row.colidx(iEntry));
          }, A_x);

        // One write per row.
        Kokkos::single
          (Kokkos::PerThread(dev),
           [&] () {
            const auto alpha_D_res =
              alpha * m_d(lclRow) * (m_b(lclRow) - A_x);
            if (use_beta) {
              m_w(lclRow) = beta * m_w(lclRow) + alpha_D_res;
            }
            else {
              m_w(lclRow) = alpha_D_res;
            }
          });
      });
  }
};
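// Note on the parallel hierarchy above (added commentary): each league
// member (team) owns rows_per_team consecutive rows; TeamThreadRange
// spreads those rows over the team's threads, and ThreadVectorRange
// vectorizes the dot product over each row's entries.  With illustrative
// parameters vector_length = 32, team_size = 8, rows_per_thread = 1
// (so rows_per_team = 8), team k updates rows [8*k, 8*k + 8), one row
// per thread with 32 vector lanes per row.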
template<class ExecutionSpace>
int64_t
scaled_damped_residual_vector_launch_parameters (int64_t numRows,
                                                 int64_t nnz,
                                                 int64_t rows_per_thread,
                                                 int& team_size,
                                                 int& vector_length)
{
  using execution_space = typename ExecutionSpace::execution_space;

  int64_t rows_per_team;
  int64_t nnz_per_row = nnz/numRows;

  if (nnz_per_row < 1) {
    nnz_per_row = 1;
  }

  // Pick a vector length that roughly matches the average row length.
  if (vector_length < 1) {
    vector_length = 1;
    while (vector_length<32 && vector_length*6 < nnz_per_row) {
      vector_length *= 2;
    }
  }
  // Heuristic: on CUDA, one row per thread; on host, batch rows so
  // each thread has enough work to amortize loop overhead.
  if (rows_per_thread < 1) {
#ifdef KOKKOS_ENABLE_CUDA
    if (std::is_same<Kokkos::Cuda, execution_space>::value) {
      rows_per_thread = 1;
    }
    else
#endif
    {
      if (nnz_per_row < 20 && nnz > 5000000) {
        rows_per_thread = 256;
      }
      else {
        rows_per_thread = 64;
      }
    }
  }

#ifdef KOKKOS_ENABLE_CUDA
  if (team_size < 1) {
    if (std::is_same<Kokkos::Cuda,execution_space>::value) {
      team_size = 256/vector_length;
    }
    else {
      team_size = 1;
    }
  }
#endif

  rows_per_team = rows_per_thread * team_size;
  if (rows_per_team < 0) {
    // No team size given: shrink the per-team workload until each
    // concurrent team gets roughly four chunks of work.
    int64_t nnz_per_team = 4096;
    int64_t conc = execution_space::concurrency ();
    while ((conc * nnz_per_team * 4 > nnz) &&
           (nnz_per_team > 256)) {
      nnz_per_team /= 2;
    }
    rows_per_team = (nnz_per_team + nnz_per_row - 1) / nnz_per_row;
  }

  return rows_per_team;
}
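// Worked example of the heuristic above (illustrative numbers, added
// commentary): for numRows = 1e6 and nnz = 9e6, nnz_per_row = 9, so the
// vector length doubles while 6 * vector_length < 9 and stops at
// vector_length = 2; since nnz_per_row = 9 < 20 and nnz = 9e6 > 5e6,
// host builds get rows_per_thread = 256.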
template<class WVector,
         class DVector,
         class BVector,
         class AMatrix,
         class XVector,
         class Scalar>
static void
scaled_damped_residual_vector
(const Scalar& alpha,
 WVector& w,
 const DVector& d,
 const BVector& b,
 const AMatrix& A,
 const XVector& x,
 const Scalar& beta)
{
  using execution_space = typename AMatrix::execution_space;

  if (A.numRows () == 0) {
    return;
  }
  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  const int64_t rows_per_team =
    scaled_damped_residual_vector_launch_parameters<execution_space>
      (A.numRows (), A.nnz (), rows_per_thread, team_size, vector_length);
  int64_t worksets = (b.extent (0) + rows_per_team - 1) / rows_per_team;
  using Kokkos::Dynamic;
  using Kokkos::Schedule;
  using Kokkos::TeamPolicy;
  using policy_type = TeamPolicy<execution_space, Schedule<Dynamic>>;
  const char kernel_label[] = "scaled_damped_residual_vector";
  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }
  // Canonicalize template arguments to avoid redundant instantiations.
  using w_vec_type = typename WVector::non_const_type;
  using d_vec_type = typename DVector::const_type;
  using b_vec_type = typename BVector::const_type;
  using matrix_type = AMatrix;
  using x_vec_type = typename XVector::const_type;
  using scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
  if (beta == Kokkos::ArithTraits<Scalar>::zero ()) {
    constexpr bool use_beta = false;
    using functor_type =
      ScaledDampedResidualVectorFunctor<w_vec_type, d_vec_type,
                                        b_vec_type, matrix_type,
                                        x_vec_type, scalar_type,
                                        use_beta>;
    functor_type func (alpha, w, d, b, A, x, beta, rows_per_team);
    Kokkos::parallel_for (kernel_label, policy, func);
  }
  else {
    constexpr bool use_beta = true;
    using functor_type =
      ScaledDampedResidualVectorFunctor<w_vec_type, d_vec_type,
                                        b_vec_type, matrix_type,
                                        x_vec_type, scalar_type,
                                        use_beta>;
    functor_type func (alpha, w, d, b, A, x, beta, rows_per_team);
    Kokkos::parallel_for (kernel_label, policy, func);
  }
}

} // namespace Impl
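// Design note (added commentary): use_beta is a compile-time template
// parameter rather than a runtime branch, so the beta == 0 kernel never
// reads the possibly uninitialized output vector w.  A hedged usage
// sketch, assuming rank-1 Kokkos::Views w, d, b, x of double and a
// matching local sparse matrix A_lcl (names are illustrative):
//
//   // w := 1.5 * d .* (b - A*x); beta == 0, so w is write-only.
//   Impl::scaled_damped_residual_vector (1.5, w, d, b, A_lcl, x, 0.0);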
template<class TpetraOperatorType>
ScaledDampedResidual<TpetraOperatorType>::
ScaledDampedResidual (const Teuchos::RCP<const operator_type>& A)
{
  setMatrix (A);
}
template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
setMatrix (const Teuchos::RCP<const operator_type>& A)
{
  if (A_op_.get () != A.get ()) {
    A_op_ = A;

    // These get (re)allocated on demand.
    X_colMap_ = std::unique_ptr<vector_type> (nullptr);
    V1_ = std::unique_ptr<multivector_type> (nullptr);

    using Teuchos::rcp_dynamic_cast;
    Teuchos::RCP<const crs_matrix_type> A_crs =
      rcp_dynamic_cast<const crs_matrix_type> (A);
    if (A_crs.is_null ()) {
      A_crs_ = Teuchos::null;
      imp_ = Teuchos::null;
      exp_ = Teuchos::null;
    }
    else {
      A_crs_ = A_crs;
      auto G = A_crs->getCrsGraph ();
      imp_ = G->getImporter ();
      exp_ = G->getExporter ();
    }
  }
}
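// Note (added commentary): caching the graph's Importer and Exporter here
// lets canFuse() and importVector() decide cheaply, per call, whether the
// single-kernel fused path is legal and whether a halo exchange is needed.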
template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
compute (multivector_type& W,
         const SC& alpha,
         vector_type& D_inv,
         multivector_type& B,
         multivector_type& X,
         const SC& beta)
{
  using Teuchos::RCP;

  if (canFuse (B)) {
    // "Fused" case: one kernel computes W := alpha*D_inv*(B - A*X) + beta*W.
    RCP<vector_type> W_vec = W.getVectorNonConst (0);
    RCP<vector_type> B_vec = B.getVectorNonConst (0);
    RCP<vector_type> X_vec = X.getVectorNonConst (0);
    TEUCHOS_ASSERT( ! A_crs_.is_null () );
    fusedCase (*W_vec, alpha, D_inv, *B_vec, *A_crs_, *X_vec, beta);
  }
  else {
    TEUCHOS_ASSERT( ! A_op_.is_null () );
    unfusedCase (W, alpha, D_inv, B, *A_op_, X, beta);
  }
}
template<class TpetraOperatorType>
typename ScaledDampedResidual<TpetraOperatorType>::vector_type&
ScaledDampedResidual<TpetraOperatorType>::
importVector (vector_type& X_domMap)
{
  if (imp_.is_null ()) {
    return X_domMap; // no off-process entries to fetch
  }
  else {
    if (X_colMap_.get () == nullptr) {
      using V = vector_type;
      X_colMap_ = std::unique_ptr<V> (new V (imp_->getTargetMap ()));
    }
    X_colMap_->doImport (X_domMap, *imp_, Tpetra::REPLACE);
    return *X_colMap_;
  }
}
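// Note (added commentary): the Import above is the halo exchange a
// distributed mat-vec needs.  It copies the domain-Map entries of X into
// a column-Map vector, so the fused kernel can read every column index of
// the local matrix without further communication; the cached X_colMap_
// target vector is reused across calls.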
template<class TpetraOperatorType>
bool
ScaledDampedResidual<TpetraOperatorType>::
canFuse (const multivector_type& B) const
{
  // The fused kernel handles a single right-hand side, needs direct
  // access to the sparse matrix, and cannot do an Export afterwards.
  return B.getNumVectors () == size_t (1) &&
    ! A_crs_.is_null () &&
    exp_.is_null ();
}
template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
unfusedCase (multivector_type& W,
             const SC& alpha,
             vector_type& D_inv,
             multivector_type& B,
             const operator_type& A,
             multivector_type& X,
             const SC& beta)
{
  if (V1_.get () == nullptr) {
    using MV = multivector_type;
    const size_t numVecs = B.getNumVectors ();
    V1_ = std::unique_ptr<MV> (new MV (B.getMap (), numVecs));
  }
  const SC one = Teuchos::ScalarTraits<SC>::one ();

  // V1 := B - A*X.
  Tpetra::deep_copy (*V1_, B);
  A.apply (X, *V1_, Teuchos::NO_TRANS, -one, one);

  // W := alpha * D_inv .* V1 + beta * W.
  W.elementWiseMultiply (alpha, D_inv, *V1_, beta);
}
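// The unfused path above takes two passes over the data (added
// commentary): first V1 = B - A*X via Operator::apply, then the scaled
// element-wise update.  Written out per row i and column j:
//   W(i,j) = beta * W(i,j) + alpha * D_inv(i) * (B(i,j) - (A*X)(i,j)).
// The fused path below produces the same result in one kernel launch
// when B has a single column.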
template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
fusedCase (vector_type& W,
           const SC& alpha,
           vector_type& D_inv,
           vector_type& B,
           const crs_matrix_type& A,
           vector_type& X,
           const SC& beta)
{
  vector_type& X_colMap = importVector (X);

  // Named aliases for the lambda argument types; ADL finds
  // Tpetra::readOnly etc. in the decltype expressions.
  using Tpetra::with_local_access_function_argument_type;
  using ro_lcl_vec_type =
    with_local_access_function_argument_type<
      decltype (readOnly (B))>;
  using wo_lcl_vec_type =
    with_local_access_function_argument_type<
      decltype (writeOnly (B))>;
  using rw_lcl_vec_type =
    with_local_access_function_argument_type<
      decltype (readWrite (B))>;

  using Tpetra::withLocalAccess;
  using Tpetra::readOnly;
  using Tpetra::readWrite;
  using Tpetra::writeOnly;
  using Impl::scaled_damped_residual_vector;
  using STS = Teuchos::ScalarTraits<SC>;

  auto A_lcl = A.getLocalMatrix ();
  if (beta == STS::zero ()) {
    withLocalAccess
      ([&] (const wo_lcl_vec_type& W_lcl,
            const ro_lcl_vec_type& D_lcl,
            const ro_lcl_vec_type& B_lcl,
            const ro_lcl_vec_type& X_lcl) {
        scaled_damped_residual_vector (alpha, W_lcl, D_lcl,
                                       B_lcl, A_lcl, X_lcl, beta);
      },
      writeOnly (W),
      readOnly (D_inv),
      readOnly (B),
      readOnly (X_colMap));
  }
  else { // beta != 0, so W must be readable as well as writable
    withLocalAccess
      ([&] (const rw_lcl_vec_type& W_lcl,
            const ro_lcl_vec_type& D_lcl,
            const ro_lcl_vec_type& B_lcl,
            const ro_lcl_vec_type& X_lcl) {
        scaled_damped_residual_vector (alpha, W_lcl, D_lcl,
                                       B_lcl, A_lcl, X_lcl, beta);
      },
      readWrite (W),
      readOnly (D_inv),
      readOnly (B),
      readOnly (X_colMap));
  }
}
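// Hedged usage sketch (added commentary, not from the original source;
// the Tpetra types SC, LO, GO, NT and the objects A, W, D_inv, B, X are
// assumed to be set up by the caller):
//
//   using op_type = Tpetra::Operator<SC, LO, GO, NT>;
//   Ifpack2::Details::ScaledDampedResidual<op_type> sdr (A);
//   // W := alpha * D_inv .* (B - A*X) + beta * W
//   sdr.compute (W, alpha, D_inv, B, X, beta);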
} // namespace Details
} // namespace Ifpack2

// Explicit template instantiation macro, invoked with the usual Tpetra
// template parameters (Scalar, LocalOrdinal, GlobalOrdinal, Node).
#define IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_INSTANT(SC,LO,GO,NT) \
  template class Ifpack2::Details::ScaledDampedResidual<Tpetra::Operator<SC, LO, GO, NT> >;

#endif // IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_DEF_HPP