#ifndef IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_DEF_HPP
#define IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_DEF_HPP

#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Tpetra_Operator.hpp"
#include "Tpetra_Vector.hpp"
#include "Tpetra_withLocalAccess_MultiVector.hpp"
#include "Tpetra_Export_decl.hpp"
#include "Tpetra_Import_decl.hpp"
#include "Kokkos_ArithTraits.hpp"
#include "Teuchos_Assert.hpp"
#include <type_traits>

namespace Ifpack2 {
namespace Details {
namespace Impl {

/// \brief Functor for computing W := alpha * D * (B - A*X) + beta * W.
template<class WVector,
         class DVector,
         class BVector,
         class AMatrix,
         class XVector,
         class Scalar,
         bool use_beta>
struct ScaledDampedResidualVectorFunctor {
  static_assert (static_cast<int> (WVector::Rank) == 1,
                 "WVector must be a rank 1 View.");
  static_assert (static_cast<int> (DVector::Rank) == 1,
                 "DVector must be a rank 1 View.");
  static_assert (static_cast<int> (BVector::Rank) == 1,
                 "BVector must be a rank 1 View.");
  static_assert (static_cast<int> (XVector::Rank) == 1,
                 "XVector must be a rank 1 View.");

  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;
 
  const Scalar alpha;
  WVector m_w;
  DVector m_d;
  BVector m_b;
  AMatrix m_A;
  XVector m_x;
  const Scalar beta;
  const LO rows_per_team;

  ScaledDampedResidualVectorFunctor (const Scalar& alpha_,
                                     const WVector& m_w_,
                                     const DVector& m_d_,
                                     const BVector& m_b_,
                                     const AMatrix& m_A_,
                                     const XVector& m_x_,
                                     const Scalar& beta_,
                                     const int rows_per_team_) :
    alpha (alpha_),
    m_w (m_w_),
    m_d (m_d_),
    m_b (m_b_),
    m_A (m_A_),
    m_x (m_x_),
    beta (beta_),
    rows_per_team (rows_per_team_)
  {
    const size_t numRows = m_A.numRows ();
    const size_t numCols = m_A.numCols ();

    TEUCHOS_ASSERT( m_w.extent (0) == m_d.extent (0) );
    TEUCHOS_ASSERT( m_w.extent (0) == m_b.extent (0) );
    TEUCHOS_ASSERT( numRows == size_t (m_w.extent (0)) );
    TEUCHOS_ASSERT( numCols <= size_t (m_x.extent (0)) );
  }
 
  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    using residual_value_type = typename BVector::non_const_value_type;
    using KAT = Kokkos::ArithTraits<residual_value_type>;

    Kokkos::parallel_for
      (Kokkos::TeamThreadRange (dev, 0, rows_per_team),
       [&] (const LO& loop) {
         const LO lclRow =
           static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
         if (lclRow >= m_A.numRows ()) {
           return;
         }
         const auto A_row = m_A.rowConst(lclRow);
         const LO row_length = static_cast<LO> (A_row.length);
         residual_value_type A_x = KAT::zero ();

         Kokkos::parallel_reduce
           (Kokkos::ThreadVectorRange (dev, row_length),
            [&] (const LO iEntry, residual_value_type& lsum) {
              const auto A_val = A_row.value(iEntry);
              lsum += A_val * m_x(A_row.colidx(iEntry));
            }, A_x);

         Kokkos::single
           (Kokkos::PerThread(dev),
            [&] () {
              const auto alpha_D_res =
                alpha * m_d(lclRow) * (m_b(lclRow) - A_x);
              if (use_beta) {
                m_w(lclRow) = beta * m_w(lclRow) + alpha_D_res;
              }
              else {
                m_w(lclRow) = alpha_D_res;
              }
            });
       });
  }
};
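// Illustrative usage sketch (not part of the original source; all names
// below are hypothetical).  One league of teams covers the local rows in
// blocks of rows_per_team, so launching the functor looks roughly like:
//
//   using functor_type = ScaledDampedResidualVectorFunctor<
//     w_view, d_view, b_view, local_matrix, x_view, double, true>;
//   functor_type func (alpha, w, d, b, A_lcl, x, beta, rowsPerTeam);
//   Kokkos::parallel_for ("sdr", Kokkos::TeamPolicy<exec_space>
//     (numTeams, Kokkos::AUTO, vectorLength), func);
//
// Each team then computes, for every local row i it owns,
//   w(i) = beta * w(i) + alpha * d(i) * (b(i) - sum_j A(i,j) * x(j)),
// with the row sum reduced across vector lanes via ThreadVectorRange.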
 
template<class ExecutionSpace>
int64_t
scaled_damped_residual_vector_launch_parameters (int64_t numRows,
                                                 int64_t nnz,
                                                 int64_t rows_per_thread,
                                                 int& team_size,
                                                 int& vector_length)
{
  using execution_space = typename ExecutionSpace::execution_space;

  int64_t rows_per_team;
  int64_t nnz_per_row = nnz/numRows;

  if (nnz_per_row < 1) {
    nnz_per_row = 1;
  }

  if (vector_length < 1) {
    vector_length = 1;
    while (vector_length<32 && vector_length*6 < nnz_per_row) {
      vector_length *= 2;
    }
  }

  // Determine rows per thread.
  if (rows_per_thread < 1) {
#ifdef KOKKOS_ENABLE_CUDA
    if (std::is_same<Kokkos::Cuda, execution_space>::value) {
      rows_per_thread = 1;
    }
    else
#endif
    {
      if (nnz_per_row < 20 && nnz > 5000000) {
        rows_per_thread = 256;
      }
      else {
        rows_per_thread = 64;
      }
    }
  }

#ifdef KOKKOS_ENABLE_CUDA
  if (team_size < 1) {
    if (std::is_same<Kokkos::Cuda,execution_space>::value) {
      team_size = 256/vector_length;
    }
    else {
      team_size = 1;
    }
  }
#endif

  rows_per_team = rows_per_thread * team_size;

  if (rows_per_team < 0) {
    int64_t nnz_per_team = 4096;
    int64_t conc = execution_space::concurrency ();
    while ((conc * nnz_per_team * 4 > nnz) &&
           (nnz_per_team > 256)) {
      nnz_per_team /= 2;
    }
    rows_per_team = (nnz_per_team + nnz_per_row - 1) / nnz_per_row;
  }

  return rows_per_team;
}
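// Worked example of the fallback heuristic (illustrative, not from the
// original source): on a non-CUDA build, team_size stays -1 on entry, so
// rows_per_team = rows_per_thread * team_size comes out negative and the
// block above runs.  With nnz = 10000000, nnz_per_row = 10, and
// concurrency() = 64: 64 * 4096 * 4 = 1048576, which is not > nnz, so
// nnz_per_team stays 4096 and rows_per_team = (4096 + 10 - 1) / 10 = 410.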
 
// W := alpha * D * (B - A*X) + beta * W.
template<class WVector,
         class DVector,
         class BVector,
         class AMatrix,
         class XVector,
         class Scalar>
static void
scaled_damped_residual_vector
(const Scalar& alpha,
 const WVector& w,
 const DVector& d,
 const BVector& b,
 const AMatrix& A,
 const XVector& x,
 const Scalar& beta)
{
  using execution_space = typename AMatrix::execution_space;

  if (A.numRows () == 0) {
    return;
  }

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  const int64_t rows_per_team =
    scaled_damped_residual_vector_launch_parameters<execution_space>
      (A.numRows (), A.nnz (), rows_per_thread, team_size, vector_length);
  int64_t worksets = (b.extent (0) + rows_per_team - 1) / rows_per_team;

  using Kokkos::Dynamic;
  using Kokkos::Schedule;
  using Kokkos::TeamPolicy;
  using policy_type = TeamPolicy<execution_space, Schedule<Dynamic>>;
  const char kernel_label[] = "scaled_damped_residual_vector";
  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }

  // Canonicalize template arguments to avoid redundant instantiations.
  using w_vec_type = typename WVector::non_const_type;
  using d_vec_type = typename DVector::const_type;
  using b_vec_type = typename BVector::const_type;
  using matrix_type = AMatrix;
  using x_vec_type = typename XVector::const_type;
  using scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;

  if (beta == Kokkos::ArithTraits<Scalar>::zero ()) {
    constexpr bool use_beta = false;
    using functor_type =
      ScaledDampedResidualVectorFunctor<w_vec_type, d_vec_type,
                                        b_vec_type, matrix_type,
                                        x_vec_type, scalar_type,
                                        use_beta>;
    functor_type func (alpha, w, d, b, A, x, beta, rows_per_team);
    Kokkos::parallel_for (kernel_label, policy, func);
  }
  else {
    constexpr bool use_beta = true;
    using functor_type =
      ScaledDampedResidualVectorFunctor<w_vec_type, d_vec_type,
                                        b_vec_type, matrix_type,
                                        x_vec_type, scalar_type,
                                        use_beta>;
    functor_type func (alpha, w, d, b, A, x, beta, rows_per_team);
    Kokkos::parallel_for (kernel_label, policy, func);
  }
}
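// Note (illustrative): branching on beta == 0 at compile time via the
// use_beta template parameter means the beta == 0 kernel never reads w, so
// w may legally hold uninitialized data (even NaNs) on entry.  A caller-side
// sketch, with all names hypothetical:
//
//   Kokkos::View<double*> w ("w", n), d ("d", n), b ("b", n), x ("x", n);
//   // A_lcl: the local sparse matrix of a fill-complete Tpetra::CrsMatrix
//   Impl::scaled_damped_residual_vector (alpha, w, d, b, A_lcl, x, 0.0);

} // namespace Impl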
 
template<class TpetraOperatorType>
ScaledDampedResidual<TpetraOperatorType>::
ScaledDampedResidual (const Teuchos::RCP<const operator_type>& A)
{
  setMatrix (A);
}
 
template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
setMatrix (const Teuchos::RCP<const operator_type>& A)
{
  if (A_op_.get () != A.get ()) {
    A_op_ = A;

    // We'll (re)allocate these on demand.
    X_colMap_ = std::unique_ptr<vector_type> (nullptr);
    V1_ = std::unique_ptr<multivector_type> (nullptr);

    using Teuchos::rcp_dynamic_cast;
    Teuchos::RCP<const crs_matrix_type> A_crs =
      rcp_dynamic_cast<const crs_matrix_type> (A);
    if (A_crs.is_null ()) {
      A_crs_ = Teuchos::null;
      imp_ = Teuchos::null;
      exp_ = Teuchos::null;
    }
    else {
      A_crs_ = A_crs;
      auto G = A_crs->getCrsGraph ();
      imp_ = G->getImporter ();
      exp_ = G->getExporter ();
    }
  }
}
 
template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
compute (multivector_type& W,
         const SC& alpha,
         vector_type& D_inv,
         multivector_type& B,
         multivector_type& X,
         const SC& beta)
{
  using Teuchos::RCP;
  if (canFuse (B)) {
    // "Fused" means a single pass over the matrix computes W.
    RCP<vector_type> W_vec = W.getVectorNonConst (0);
    RCP<vector_type> B_vec = B.getVectorNonConst (0);
    RCP<vector_type> X_vec = X.getVectorNonConst (0);
    fusedCase (*W_vec, alpha, D_inv, *B_vec, *A_crs_, *X_vec, beta);
  }
  else {
    unfusedCase (W, alpha, D_inv, B, *A_op_, X, beta);
  }
}
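// Illustrative driver (names hypothetical): a damped-Jacobi or Chebyshev
// sweep would call compute() once per iteration,
//
//   sdr.compute (W, alpha, D_inv, B, X, beta);  // W = alpha D^{-1} (B - A X) + beta W
//   X.update (STS::one (), W, STS::one ());     // X += W
//
// With a single right-hand side, a CrsMatrix, and no Export, compute() takes
// the fused one-pass path; otherwise it falls back to unfusedCase().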
 
template<class TpetraOperatorType>
typename ScaledDampedResidual<TpetraOperatorType>::vector_type&
ScaledDampedResidual<TpetraOperatorType>::
importVector (vector_type& X_domMap)
{
  if (imp_.is_null ()) {
    return X_domMap;
  }
  else {
    if (X_colMap_.get () == nullptr) {
      using V = vector_type;
      X_colMap_ = std::unique_ptr<V> (new V (imp_->getTargetMap ()));
    }
    X_colMap_->doImport (X_domMap, *imp_, Tpetra::REPLACE);
    return *X_colMap_;
  }
}
 
template<class TpetraOperatorType>
bool
ScaledDampedResidual<TpetraOperatorType>::
canFuse (const multivector_type& B) const
{
  return B.getNumVectors () == size_t (1) &&
    ! A_crs_.is_null () &&
    exp_.is_null ();
}
 
template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
unfusedCase (multivector_type& W,
             const SC& alpha,
             vector_type& D_inv,
             multivector_type& B,
             const operator_type& A,
             multivector_type& X,
             const SC& beta)
{
  if (V1_.get () == nullptr) {
    using MV = multivector_type;
    const size_t numVecs = B.getNumVectors ();
    V1_ = std::unique_ptr<MV> (new MV (B.getMap (), numVecs));
  }
  const SC one = Teuchos::ScalarTraits<SC>::one ();

  // V1 = B - A*X.
  Tpetra::deep_copy (*V1_, B);
  A.apply (X, *V1_, Teuchos::NO_TRANS, -one, one);

  // W := alpha * D_inv * V1 + beta * W.
  W.elementWiseMultiply (alpha, D_inv, *V1_, beta);
}
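// Algebra check (illustrative): with V1 = B - A*X as computed above,
// elementWiseMultiply yields W = beta * W + alpha * (D_inv .* V1), i.e. the
// same per-row update the fused kernel performs:
//   w(i) = beta * w(i) + alpha * d(i) * (b(i) - (A x)(i)).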
 
template<class TpetraOperatorType>
void
ScaledDampedResidual<TpetraOperatorType>::
fusedCase (vector_type& W,
           const SC& alpha,
           vector_type& D_inv,
           vector_type& B,
           const crs_matrix_type& A,
           vector_type& X,
           const SC& beta)
{
  vector_type& X_colMap = importVector (X);

  // We only need these aliases because we lack C++14 generic lambdas.
  using Tpetra::with_local_access_function_argument_type;
  using ro_lcl_vec_type =
    with_local_access_function_argument_type<
      decltype (readOnly (B))>;
  using wo_lcl_vec_type =
    with_local_access_function_argument_type<
      decltype (writeOnly (B))>;
  using rw_lcl_vec_type =
    with_local_access_function_argument_type<
      decltype (readWrite (B))>;

  using Tpetra::withLocalAccess;
  using Tpetra::readOnly;
  using Tpetra::readWrite;
  using Tpetra::writeOnly;
  using Impl::scaled_damped_residual_vector;
  using STS = Teuchos::ScalarTraits<SC>;

  auto A_lcl = A.getLocalMatrix ();
  if (beta == STS::zero ()) {
    withLocalAccess
      ([&] (const wo_lcl_vec_type& W_lcl,
            const ro_lcl_vec_type& D_lcl,
            const ro_lcl_vec_type& B_lcl,
            const ro_lcl_vec_type& X_lcl) {
         scaled_damped_residual_vector (alpha, W_lcl, D_lcl,
                                        B_lcl, A_lcl, X_lcl, beta);
       },
       writeOnly (W),
       readOnly (D_inv),
       readOnly (B),
       readOnly (X_colMap));
  }
  else { // We must read W as well as write it if beta != 0.
    withLocalAccess
      ([&] (const rw_lcl_vec_type& W_lcl,
            const ro_lcl_vec_type& D_lcl,
            const ro_lcl_vec_type& B_lcl,
            const ro_lcl_vec_type& X_lcl) {
         scaled_damped_residual_vector (alpha, W_lcl, D_lcl,
                                        B_lcl, A_lcl, X_lcl, beta);
       },
       readWrite (W),
       readOnly (D_inv),
       readOnly (B),
       readOnly (X_colMap));
  }
}
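// Note (illustrative): the writeOnly(W) / readWrite(W) distinction above is
// what lets the beta == 0 path skip synchronizing W's current contents;
// writeOnly marks W as modified without copying its old data first.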
 
} // namespace Details
} // namespace Ifpack2

#define IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_INSTANT(SC,LO,GO,NT) \
  template class Ifpack2::Details::ScaledDampedResidual<Tpetra::Operator<SC, LO, GO, NT> >;

#endif // IFPACK2_DETAILS_SCALEDDAMPEDRESIDUAL_DEF_HPP