#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP

#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#include <Kokkos_ArithTraits.hpp>
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_SetIdentity_Decl.hpp>
#include <KokkosBatched_SetIdentity_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Serial_Impl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif

// experimental configuration knobs
#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
// use pinned host memory for MPI communication buffers on CUDA builds (see AsyncableImport below)
#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
// map Tpetra's CudaUVMSpace to CudaSpace for this container's internal views (see ImplType below)
#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE

#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
#endif
namespace Ifpack2 {

  namespace BlockTriDiContainerDetails {

#if defined(__KOKKOSBATCHED_PROMOTION__)
    namespace KB = KokkosBatched;
#else
    namespace KB = KokkosBatched::Experimental;
#endif

    using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
 
    ///
    /// view decorators for unmanaged and const memory
    ///
    template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
    using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
                                              MemoryTraitsType::is_random_access |
                                              flag>;

    template <typename ViewType>
    using Unmanaged = Kokkos::View<typename ViewType::data_type,
                                   typename ViewType::array_layout,
                                   typename ViewType::device_type,
                                   MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
 
    template <typename ViewType>
    using Atomic = Kokkos::View<typename ViewType::data_type,
                                typename ViewType::array_layout,
                                typename ViewType::device_type,
                                MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
    template <typename ViewType>
    using Const = Kokkos::View<typename ViewType::const_data_type,
                               typename ViewType::array_layout,
                               typename ViewType::device_type,
                               typename ViewType::memory_traits>;
    template <typename ViewType>
    using ConstUnmanaged = Const<Unmanaged<ViewType> >;

    template <typename ViewType>
    using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
 
    template <typename ViewType>
    using Scratch = Kokkos::View<typename ViewType::data_type,
                                 typename ViewType::array_layout,
                                 typename ViewType::execution_space::scratch_memory_space,
                                 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
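
    // Example (illustrative sketch, not from the original source): the decorators
    // wrap an owning view so kernels receive a lightweight, reference-count-free
    // handle. The view `x` below is hypothetical.
    //
    //   using x_view_type = Kokkos::View<double*,Kokkos::DefaultExecutionSpace::device_type>;
    //   x_view_type x("x", 100);               // managed, reference counted
    //   Unmanaged<x_view_type> x_u(x);         // same data, no reference counting
    //   ConstUnmanaged<x_view_type> x_cu(x);   // additionally read-only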
 
    ///
    /// block tridiag scalar type: optionally demote the factorization scalar
    ///
    template<typename T> struct BlockTridiagScalarType { typedef T type; };
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
    template<> struct BlockTridiagScalarType<double> { typedef float type; };
#endif
    template<typename T> struct is_cuda        { enum : bool { value = false }; };
#if defined(KOKKOS_ENABLE_CUDA)
    template<> struct is_cuda<Kokkos::Cuda>    { enum : bool { value = true  }; };
#endif
 
    ///
    /// utility functions
    ///
    template<typename CommPtrType>
    std::string get_msg_prefix (const CommPtrType &comm) {
      const auto rank = comm->getRank();
      const auto nranks = comm->getSize();
      std::stringstream ss;
      ss << "Rank " << rank << " of " << nranks << ": ";
      return ss.str();
    }
 
    ///
    /// custom multiple-value type for reductions
    ///
    template<typename T, int N>
    struct ArrayValueType {
      T v[N];
      KOKKOS_INLINE_FUNCTION
      ArrayValueType() {
        for (int i=0;i<N;++i)
          this->v[i] = 0;
      }
      KOKKOS_INLINE_FUNCTION
      ArrayValueType(const ArrayValueType &b) {
        for (int i=0;i<N;++i)
          this->v[i] = b.v[i];
      }
    };
    template<typename T, int N>
    static
    KOKKOS_INLINE_FUNCTION
    void
    operator+=(volatile ArrayValueType<T,N> &a,
               const volatile ArrayValueType<T,N> &b) {
      for (int i=0;i<N;++i)
        a.v[i] += b.v[i];
    }
    template<typename T, int N>
    static
    KOKKOS_INLINE_FUNCTION
    void
    operator+=(ArrayValueType<T,N> &a,
               const ArrayValueType<T,N> &b) {
      for (int i=0;i<N;++i)
        a.v[i] += b.v[i];
    }
 
    ///
    /// custom reducer for array summation
    ///
    template<typename T, int N, typename ExecSpace>
    struct SumReducer {
      typedef SumReducer reducer;
      typedef ArrayValueType<T,N> value_type;
      typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
      value_type *value;

      KOKKOS_INLINE_FUNCTION
      SumReducer(value_type &val) : value(&val) {}

      KOKKOS_INLINE_FUNCTION
      void join(value_type &dst, value_type &src) const {
        for (int i=0;i<N;++i)
          dst.v[i] += src.v[i];
      }
      KOKKOS_INLINE_FUNCTION
      void join(volatile value_type &dst, const volatile value_type &src) const {
        for (int i=0;i<N;++i)
          dst.v[i] += src.v[i];
      }
      KOKKOS_INLINE_FUNCTION
      void init(value_type &val) const {
        for (int i=0;i<N;++i)
          val.v[i] = Kokkos::reduction_identity<T>::sum();
      }
      KOKKOS_INLINE_FUNCTION
      value_type& reference() {
        return *value;
      }
      KOKKOS_INLINE_FUNCTION
      result_view_type view() const {
        return result_view_type(value);
      }
    };
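
    // Example (illustrative sketch, not from the original source): reducing three
    // counters in a single pass, as the symbolic phase below does for the D/R
    // nonzero counts. `n` and the counter selection are hypothetical.
    //
    //   using reducer_type = SumReducer<size_t,3,Kokkos::DefaultHostExecutionSpace>;
    //   typename reducer_type::value_type sums; // sums.v[0..2] start at zero
    //   Kokkos::parallel_reduce
    //     (Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0,n),
    //      KOKKOS_LAMBDA(const int i, typename reducer_type::value_type &update) {
    //        ++update.v[i%3]; // bump one of the three counters
    //      }, reducer_type(sums));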
 
#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
#else
#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
#endif

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
    CUDA_SAFE_CALL(cudaProfilerStart());
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
    { CUDA_SAFE_CALL( cudaProfilerStop() ); }
#else
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
#endif
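
// Example (illustrative sketch): the timer macro expands to a scoped
// TEUCHOS_FUNC_TIME_MONITOR, so a single statement times the rest of the
// enclosing scope; with timers disabled it compiles away entirely.
//
//   void someKernelDriver() {
//     IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SomeKernelDriver");
//     // ... timed work; the monitor stops when this scope exits
//   }
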
    ///
    /// implementation typedefs
    ///
    template <typename MatrixType>
    struct ImplType {
      ///
      /// matrix type derived types
      ///
      typedef size_t size_type;
      typedef typename MatrixType::scalar_type scalar_type;
      typedef typename MatrixType::local_ordinal_type local_ordinal_type;
      typedef typename MatrixType::global_ordinal_type global_ordinal_type;
      typedef typename MatrixType::node_type node_type;

      ///
      /// kokkos arithmetic traits of scalar_type
      ///
      typedef typename Kokkos::ArithTraits<scalar_type>::val_type impl_scalar_type;
      typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;

      typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
      typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;

      ///
      /// default host execution space
      ///
      typedef Kokkos::DefaultHostExecutionSpace host_execution_space;

      ///
      /// node and device types
      ///
      typedef typename node_type::device_type node_device_type;
      typedef typename node_device_type::execution_space node_execution_space;
      typedef typename node_device_type::memory_space node_memory_space;

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE)
      /// force CudaSpace instead of CudaUVMSpace for internal views
      typedef node_execution_space execution_space;
      typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
                                        Kokkos::CudaSpace,
                                        node_memory_space>::type memory_space;
      typedef Kokkos::Device<execution_space,memory_space> device_type;
#else
      typedef node_execution_space execution_space;
      typedef node_memory_space memory_space;
      typedef node_device_type device_type;
#endif

      ///
      /// tpetra types
      ///
      typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
      typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
      typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
      typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
      typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
      typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
      typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
      typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_type local_crs_graph_type;

      ///
      /// simd vectorization
      ///
      template<typename T, int l> using Vector = KB::Vector<T,l>;
      template<typename T> using SIMD = KB::SIMD<T>;
      template<typename T, typename M> using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
      template<typename T, typename M> using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;

      static constexpr int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
      static constexpr int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
      typedef Vector<SIMD<btdm_scalar_type>,vector_length> vector_type;
      typedef Vector<SIMD<btdm_scalar_type>,internal_vector_length> internal_vector_type;

      ///
      /// commonly used view types
      ///
      typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
      typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
      // tpetra block crs values
      typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
      typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
      // tpetra multivector values (layout left)
      typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
      typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
      // packed data always uses layout right
      typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
      typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
      typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
      typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
      typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
      typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
    };
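
    // Example (illustrative sketch): vector_type packs vector_length scalars, so a
    // vector_type_3d_view of extents (nblocks, bs, bs) physically stores
    // nblocks*bs*bs*vector_length scalars; one pack advances vector_length
    // tridiagonal systems in lock step. The names below are hypothetical.
    //
    //   using impl_type = ImplType<MatrixType>;      // some Tpetra::BlockCrsMatrix type
    //   constexpr int vl = impl_type::vector_length; // e.g. 8 for host AVX512 doubles
    //   typename impl_type::vector_type_3d_view values("values", nblocks, bs, bs);
    //   // values(i,j,k) is a SIMD pack: values(i,j,k)[v] for v in [0,vl)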
 
    ///
    /// setup sequential importer
    ///
    template<typename MatrixType>
    Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
    createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
      using impl_type = ImplType<MatrixType>;
      using tpetra_map_type = typename impl_type::tpetra_map_type;
      using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
      using tpetra_import_type = typename impl_type::tpetra_import_type;

      const auto g = A->getCrsGraph();  // tpetra crs graph object
      const auto blocksize = A->getBlockSize();
      const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
      const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap()   , blocksize)));

      return Teuchos::rcp(new tpetra_import_type(src, tgt));
    }
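
    // Example (illustrative sketch): the importer maps point-expanded domain-map
    // data onto the point-expanded column map, so a blocksize-b matrix with n
    // owned block rows yields maps over n*b point entries. `A` is a hypothetical
    // Tpetra::BlockCrsMatrix already filled by the caller.
    //
    //   auto importer = createBlockCrsTpetraImporter<MatrixType>(A);
    //   // importer->getSourceMap() is the point domain map,
    //   // importer->getTargetMap() the point column map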
 
    ///
    /// AsyncableImport: custom multivector import that exposes the send/recv
    /// halves separately so communication can overlap computation
    ///
    template<typename MatrixType>
    struct AsyncableImport {
    public:
      using impl_type = ImplType<MatrixType>;

      ///
      /// MPI wrapper
      ///
#if !defined(HAVE_IFPACK2_MPI)
      typedef int MPI_Request;
      typedef int MPI_Comm;
#endif
      using scalar_type = typename impl_type::scalar_type;

      static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
        MPI_Request ureq;
        int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
        if (ireq == NULL) MPI_Request_free(&ureq);
        return ret;
#else
        return 0;
#endif
      }

      static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
        MPI_Request ureq;
        int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
        if (ireq == NULL) MPI_Request_free(&ureq);
        return ret;
#else
        return 0;
#endif
      }
 
      /// cuda stream-ordered callbacks: post the MPI call only after the
      /// device-side packing on the associated stream has finished
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
      struct CallbackDataArgsForSendRecv {
        MPI_Comm comm;
        char *buf;
        int count;
        int pid;
        int tag;
        MPI_Request *ireq;
        int r_val;
      };

      static void CUDART_CB callback_isend(cudaStream_t this_stream, cudaError_t status, void *data) {
        CallbackDataArgsForSendRecv *in = reinterpret_cast<CallbackDataArgsForSendRecv*>(data);
        in->r_val = isend(in->comm, in->buf, in->count, in->pid, in->tag, in->ireq);
      }

      static void CUDART_CB callback_irecv(cudaStream_t this_stream, cudaError_t status, void *data) {
        CallbackDataArgsForSendRecv *in = reinterpret_cast<CallbackDataArgsForSendRecv*>(data);
        in->r_val = irecv(in->comm, in->buf, in->count, in->pid, in->tag, in->ireq);
      }

      static void CUDART_CB callback_wait(cudaStream_t this_stream, cudaError_t status, void *data) {
#ifdef HAVE_IFPACK2_MPI
        MPI_Request *req = reinterpret_cast<MPI_Request*>(data);
        MPI_Wait(req, MPI_STATUS_IGNORE);
#endif // HAVE_IFPACK2_MPI
      }
#endif // defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
      static int waitany(int count, MPI_Request* reqs, int* index) {
#ifdef HAVE_IFPACK2_MPI
        return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
        return 0;
#endif
      }

      static int waitall(int count, MPI_Request* reqs) {
#ifdef HAVE_IFPACK2_MPI
        // an array of requests requires MPI_STATUSES_IGNORE, not MPI_STATUS_IGNORE
        return MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE);
#else
        return 0;
#endif
      }
 
      using tpetra_map_type = typename impl_type::tpetra_map_type;
      using tpetra_import_type = typename impl_type::tpetra_import_type;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using global_ordinal_type = typename impl_type::global_ordinal_type;
      using size_type = typename impl_type::size_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;

      using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
      using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;

      using execution_space = typename impl_type::execution_space;
      using memory_space = typename impl_type::memory_space;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;

#if defined(KOKKOS_ENABLE_CUDA)
      using impl_scalar_type_1d_view =
        typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
#  if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
                                  Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
#  elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
                                  Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
#  else                           // no experimental macros are defined
                                  typename impl_type::impl_scalar_type_1d_view,
#  endif
                                  typename impl_type::impl_scalar_type_1d_view>::type;
#else
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
#endif
      using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
      using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
 
#ifdef HAVE_IFPACK2_MPI
      MPI_Comm comm;
#endif

      impl_scalar_type_2d_view_tpetra remote_multivector;
      local_ordinal_type blocksize;

      template<typename T>
      struct SendRecvPair {
        T send, recv;
      };

      SendRecvPair<int_1d_view_host> pids;                     // mpi ranks
      SendRecvPair<std::vector<MPI_Request> > reqs;            // MPI_Request is pointer, cannot use kokkos view
      SendRecvPair<size_type_1d_view> offset;                  // offsets to local id list and data buffer
      SendRecvPair<size_type_1d_view_host> offset_host;        // host mirror of the offsets
      SendRecvPair<local_ordinal_type_1d_view> lids;           // local id list
      SendRecvPair<impl_scalar_type_1d_view> buffer;           // data buffer

      local_ordinal_type_1d_view dm2cm;                        // permutation, domain map to column map

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
      using cuda_stream_1d_std_vector = std::vector<cudaStream_t>;
      SendRecvPair<cuda_stream_1d_std_vector> stream;

      using callback_data_send_recv_1d_std_vector = std::vector<CallbackDataArgsForSendRecv>;
      SendRecvPair<callback_data_send_recv_1d_std_vector> callback_data;
#endif
 
      // compute offsets (exclusive scan) from a distributor length array
      void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                           const size_type_1d_view &offs) {
        // wrap lens in an unmanaged host view and mirror it to device
        Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
        const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);

        // exclusive scan: offs(i) = sum of lens[0..i)
        const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
        const local_ordinal_type lens_size = lens_device.extent(0);
        Kokkos::parallel_scan
          ("AsyncableImport::RangePolicy::setOffsetValues",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
            if (final)
              offs(i) = update;
            update += (i < lens_size ? lens_device[i] : 0);
          });
      }
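
      // Worked example (illustrative): with message lengths lens = {3, 1, 2} the
      // scan writes offs = {0, 3, 4, 6}; message i then occupies buffer entries
      // [offs(i), offs(i+1)) scaled by blocksize*num_vectors.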
 
      void createMpiRequests(const tpetra_import_type &import) {
        Tpetra::Distributor &distributor = import.getDistributor();

        // copy receive/send process ids from the distributor
        const auto pids_from = distributor.getProcsFrom();
        pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
        memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));

        const auto pids_to = distributor.getProcsTo();
        pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
        memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));

        // zero-initialize mpi requests
        reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
        reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));

        // construct offsets from the message lengths
        const auto lengths_to = distributor.getLengthsTo();
        offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

        const auto lengths_from = distributor.getLengthsFrom();
        offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

        setOffsetValues(lengths_to,   offset.send);
        offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);

        setOffsetValues(lengths_from, offset.recv);
        offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
      }
 
      void createSendRecvIDs(const tpetra_import_type &import) {
        // receive local ids
        const auto remote_lids = import.getRemoteLIDs();
        const local_ordinal_type_1d_view_host
          remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
        lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
        Kokkos::deep_copy(lids.recv, remote_lids_view_host);

        // send local ids
        auto epids = import.getExportPIDs();
        auto elids = import.getExportLIDs();
        TEUCHOS_ASSERT(epids.size() == elids.size());
        lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
        auto lids_send_host = Kokkos::create_mirror_view(lids.send);

        // group the export ids by destination rank
        for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
          const auto pid_send_value = pids.send[i];
          for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
            if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
#if !defined(__CUDA_ARCH__)
          TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
#endif
        }
        Kokkos::deep_copy(lids.send, lids_send_host);
      }
 
      // tags for the packing direction
      struct ToBuffer {};
      struct ToMultiVector {};

      AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
                      const Teuchos::RCP<const tpetra_map_type> &tgt_map,
                      const local_ordinal_type blocksize_,
                      const local_ordinal_type_1d_view dm2cm_) {
        blocksize = blocksize_;
        dm2cm = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
        comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
#endif
        const tpetra_import_type import(src_map, tgt_map);

        createMpiRequests(import);
        createSendRecvIDs(import);

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
        {
          const local_ordinal_type num_streams = pids.send.extent(0);
          stream.send.resize(num_streams);
          callback_data.send.resize(num_streams);
          for (local_ordinal_type i=0;i<num_streams;++i)
            CUDA_SAFE_CALL(cudaStreamCreate(&stream.send[i]));
        }
        {
          const local_ordinal_type num_streams = pids.recv.extent(0);
          stream.recv.resize(num_streams);
          callback_data.recv.resize(num_streams);
          for (local_ordinal_type i=0;i<num_streams;++i)
            CUDA_SAFE_CALL(cudaStreamCreate(&stream.recv[i]));
        }
#endif
      }

      ~AsyncableImport() {
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
        {
          const local_ordinal_type num_streams = stream.send.size();
          for (local_ordinal_type i=0;i<num_streams;++i)
            CUDA_SAFE_CALL(cudaStreamDestroy(stream.send[i]));
        }
        {
          const local_ordinal_type num_streams = stream.recv.size();
          for (local_ordinal_type i=0;i<num_streams;++i)
            CUDA_SAFE_CALL(cudaStreamDestroy(stream.recv[i]));
        }
#endif
      }
 
      void createDataBuffer(const local_ordinal_type &num_vectors) {
        const size_type extent_0 = lids.recv.extent(0)*blocksize;
        const size_type extent_1 = num_vectors;
        if (remote_multivector.extent(0) == extent_0 &&
            remote_multivector.extent(1) == extent_1) {
          // buffers are already sized for this number of vectors
          return;
        }
        remote_multivector =
          impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);

        const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
        const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
        buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
        buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
      }
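
      // Worked example (illustrative): with blocksize 5, 2 vectors, and send
      // offsets {0, 3, 4, 6}, the send buffer holds 6*5*2 = 60 scalars; the
      // message to pids.send[0] occupies scalars [0, 3*5*2) = [0, 30).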
 
      // complete any outstanding requests (also used to abandon an exchange)
      void cancel() {
#ifdef HAVE_IFPACK2_MPI
        waitall(reqs.recv.size(), reqs.recv.data());
        waitall(reqs.send.size(), reqs.send.data());
#endif
      }
 
      ///
      /// stream-based comm: pack/unpack on a given cuda stream
      ///
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
      template<typename PackTag>
      static
      void copyViaCudaStream(const local_ordinal_type_1d_view &lids_,
                             const impl_scalar_type_1d_view &buffer_,
                             const local_ordinal_type ibeg_,
                             const local_ordinal_type iend_,
                             const impl_scalar_type_2d_view_tpetra &multivector_,
                             const local_ordinal_type blocksize_,
                             const cudaStream_t &stream_) {
        const local_ordinal_type num_vectors = multivector_.extent(1);
        const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
        const local_ordinal_type idiff = iend_ - ibeg_;
        const auto abase = buffer_.data() + mv_blocksize*ibeg_;

        using team_policy_type = Kokkos::TeamPolicy<execution_space>;
        local_ordinal_type vector_size(0);
        if      (blocksize_ <=  4) vector_size =  4;
        else if (blocksize_ <=  8) vector_size =  8;
        else if (blocksize_ <= 16) vector_size = 16;
        else                       vector_size = 32;

        const auto exec_instance = Kokkos::Cuda(stream_);
        const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
        const team_policy_type policy(exec_instance, idiff, 1, vector_size);
        Kokkos::parallel_for
          ("AsyncableImport::TeamPolicy::copyViaCudaStream",
           Kokkos::Experimental::require(policy, work_item_property),
           KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
            const local_ordinal_type i = member.league_rank();
            Kokkos::parallel_for
              (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
                auto aptr = abase + blocksize_*(i + idiff*j);
                auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
                if (std::is_same<PackTag,ToBuffer>::value)
                  Kokkos::parallel_for
                    (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                      aptr[k] = bptr[k];
                    });
                else
                  Kokkos::parallel_for
                    (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                      bptr[k] = aptr[k];
                    });
              });
          });
      }
 
      void asyncSendRecvViaCudaStream(const impl_scalar_type_2d_view_tpetra &mv) {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
#ifdef HAVE_IFPACK2_MPI
        // constants
        const local_ordinal_type num_vectors = mv.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;

        // 0. post receives first
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          irecv(comm,
                reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
                (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
                pids.recv[i], 42, &reqs.recv[i]);
        }

        // 1. pack each send buffer on its own cuda stream
#if defined (KOKKOS_ENABLE_OPENMP)
#pragma omp parallel for
#endif
        for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
          auto &stream_at_i = stream.send[i];
          copyViaCudaStream<ToBuffer>(lids.send, buffer.send,
                                      offset_host.send(i), offset_host.send(i+1),
                                      mv, blocksize, stream_at_i);
          CUDA_SAFE_CALL(cudaStreamSynchronize(stream_at_i));
        }

        // 2. post sends once the corresponding pack is complete
#if defined (KOKKOS_ENABLE_OPENMP)
#pragma omp parallel for
#endif
        for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
          isend(comm,
                reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
                (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
                pids.send[i], 42, &reqs.send[i]);
        }

        // 3. poke mpi progress on the posted receives
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          int flag;
          MPI_Status stat;
          MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
        }
#endif // HAVE_IFPACK2_MPI
      }
      void syncRecvViaCudaStream() {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
#ifdef HAVE_IFPACK2_MPI
        // 0. wait for each receive and unpack it on its own cuda stream
#if defined (KOKKOS_ENABLE_OPENMP)
#pragma omp parallel for
#endif
        for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
          local_ordinal_type idx = i;
          auto &stream_at_idx = stream.recv[idx];
          waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
          copyViaCudaStream<ToMultiVector>(lids.recv, buffer.recv,
                                           offset_host.recv(idx), offset_host.recv(idx+1),
                                           remote_multivector, blocksize,
                                           stream_at_idx);
          CUDA_SAFE_CALL(cudaStreamSynchronize(stream_at_idx));
        }
        // 1. cleanup all send requests
        waitall(reqs.send.size(), reqs.send.data());
#endif // HAVE_IFPACK2_MPI
      }
#endif // defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
      ///
      /// standard comm: pack/unpack on the default execution space instance
      ///
      template<typename PackTag>
      static
      void copy(const local_ordinal_type_1d_view &lids_,
                const impl_scalar_type_1d_view &buffer_,
                const local_ordinal_type &ibeg_,
                const local_ordinal_type &iend_,
                const impl_scalar_type_2d_view_tpetra &multivector_,
                const local_ordinal_type blocksize_) {
        const local_ordinal_type num_vectors = multivector_.extent(1);
        const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
        const local_ordinal_type idiff = iend_ - ibeg_;
        const auto abase = buffer_.data() + mv_blocksize*ibeg_;
        if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
          using team_policy_type = Kokkos::TeamPolicy<execution_space>;
          local_ordinal_type vector_size(0);
          if      (blocksize_ <=  4) vector_size =  4;
          else if (blocksize_ <=  8) vector_size =  8;
          else if (blocksize_ <= 16) vector_size = 16;
          else                       vector_size = 32;
          const team_policy_type policy(idiff, 1, vector_size);
          Kokkos::parallel_for
            ("AsyncableImport::TeamPolicy::copy",
             policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
              const local_ordinal_type i = member.league_rank();
              Kokkos::parallel_for
                (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
                  auto aptr = abase + blocksize_*(i + idiff*j);
                  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
                  if (std::is_same<PackTag,ToBuffer>::value)
                    Kokkos::parallel_for
                      (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                        aptr[k] = bptr[k];
                      });
                  else
                    Kokkos::parallel_for
                      (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                        bptr[k] = aptr[k];
                      });
                });
            });
#endif
        } else {
#if defined(__CUDA_ARCH__)
          // device compilation should never take the non-cuda path
#else
          const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
          Kokkos::parallel_for
            ("AsyncableImport::RangePolicy::copy",
             policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
              const local_ordinal_type i = ij%idiff;
              const local_ordinal_type j = ij/idiff;
              auto aptr = abase + blocksize_*(i + idiff*j);
              auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
              auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
              auto to   = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
              memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
            });
#endif
        }
      }
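
      // Worked example (illustrative): a message groups the buffer as
      // [vector j=0 | j=1 | ...], each vector segment holding idiff blocks of
      // blocksize_ scalars; block i of vector j for the message starting at
      // ibeg_ lives at abase + blocksize_*(i + idiff*j), mirroring the
      // aptr/bptr arithmetic above.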
 
      void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
#ifdef HAVE_IFPACK2_MPI
        // constants
        const local_ordinal_type num_vectors = mv.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;

        // 0. post receives first
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          irecv(comm,
                reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
                (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
                pids.recv[i], 42, &reqs.recv[i]);
        }

        // 1. pack and send
        for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
          copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
                         mv, blocksize);
          isend(comm,
                reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
                (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
                pids.send[i], 42, &reqs.send[i]);
        }

        // 2. poke mpi progress on the posted receives
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          int flag;
          MPI_Status stat;
          MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
        }
#endif // HAVE_IFPACK2_MPI
      }
 
      void syncRecv() {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
#ifdef HAVE_IFPACK2_MPI
        // 0. wait for receives in any order and unpack each as it lands
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          local_ordinal_type idx = i;
          waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
          copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
                              remote_multivector, blocksize);
        }
        // 1. cleanup all send requests
        waitall(reqs.send.size(), reqs.send.data());
#endif // HAVE_IFPACK2_MPI
      }
 
      void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
        asyncSendRecvViaCudaStream(mv);
        syncRecvViaCudaStream();
#else
        asyncSendRecv(mv);
        syncRecv();
#endif
      }

      impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
    };
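
    // Example (illustrative sketch): typical lifecycle of the importer during a
    // sweep. `async_importer` and `mv` are hypothetical; overlap is achieved by
    // doing owned-block work between asyncSendRecv and syncRecv.
    //
    //   async_importer->createDataBuffer(num_vectors);  // idempotent resize
    //   async_importer->asyncSendRecv(mv);              // post irecv/isend
    //   // ... compute on locally owned data here ...
    //   async_importer->syncRecv();                     // unpack into remote_multivector
    //   auto remote = async_importer->getRemoteMultiVectorLocalView();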
 
    ///
    /// setup async importer
    ///
    template<typename MatrixType>
    Teuchos::RCP<AsyncableImport<MatrixType> >
    createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
      using impl_type = ImplType<MatrixType>;
      using tpetra_map_type = typename impl_type::tpetra_map_type;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using global_ordinal_type = typename impl_type::global_ordinal_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

      const auto g = A->getCrsGraph();  // tpetra crs graph object
      const auto blocksize = A->getBlockSize();
      const auto domain_map = g.getDomainMap();
      const auto column_map = g.getColMap();

      std::vector<global_ordinal_type> gids;
      bool separate_remotes = true, found_first = false, need_owned_permutation = false;
      for (size_t i=0;i<column_map->getNodeNumElements();++i) {
        const global_ordinal_type gid = column_map->getGlobalElement(i);
        if (!domain_map->isNodeGlobalElement(gid)) {
          found_first = true;
          gids.push_back(gid);
        } else if (found_first) {
          // an owned gid after a remote one: remotes are not separated
          separate_remotes = false;
          break;
        }
        if (!need_owned_permutation &&
            domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
          // the owned part of the column map is not ordered like the domain
          // map, so a permutation array (dm2cm) is required
          need_owned_permutation = true;
        }
      }

      if (separate_remotes) {
        const auto invalid = Teuchos::OrdinalTraits<Tpetra::global_size_t>::invalid();
        const auto parsimonious_col_map
          = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
        if (parsimonious_col_map->getGlobalNumElements() > 0) {
          // build the importer only if there is anything to receive
          local_ordinal_type_1d_view dm2cm;
          if (need_owned_permutation) {
            dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getNodeNumElements());
            const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
            for (size_t i=0;i<domain_map->getNodeNumElements();++i)
              dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
            Kokkos::deep_copy(dm2cm, dm2cm_host);
          }
          return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
        }
      }
      return Teuchos::null;
    }
 
    ///
    /// part interface
    ///
    template<typename MatrixType>
    struct PartInterface {
      using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
      using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;

      PartInterface() = default;
      PartInterface(const PartInterface &b) = default;

      // local rows of A, ordered part by part (parts sorted by decreasing size)
      local_ordinal_type_1d_view lclrow;
      // pointer array into lclrow: part i owns rows [partptr(i), partptr(i+1))
      local_ordinal_type_1d_view partptr;
      // pack k covers parts [packptr(k), packptr(k+1))
      local_ordinal_type_1d_view packptr;
      // maps a part to its first row index
      local_ordinal_type_1d_view part2rowidx0;
      // maps a part to its first packed row index
      local_ordinal_type_1d_view part2packrowidx0;
      // total number of packed rows
      local_ordinal_type part2packrowidx0_back;
      // maps a row index back to its part
      local_ordinal_type_1d_view rowidx2part;
      // true if lclrow is [0, 1, ..., nrows-1]
      bool row_contiguous;
      // size of the largest partition
      local_ordinal_type max_partsz;
    };
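
    // Worked example (illustrative): with vector_length 2 and partition sizes
    // {3, 3, 2} (already in decreasing order):
    //   partptr          = {0, 3, 6, 8}   // row ranges per part
    //   part2rowidx0     = {0, 3, 6, 8}
    //   part2packrowidx0 = {0, 0, 3, 5}   // parts 0,1 share one 3-row pack
    //   packptr          = {0, 2, 3}      // pack 0 = parts [0,2), pack 1 = [2,3)
    //   part2packrowidx0_back = 5         // total packed rows
    // If a later part in a pack is shorter than the first, the missing rows are
    // covered by identity blocks via setTridiagsToIdentity below.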
 
    ///
    /// setup part interface using the container partitions array
    ///
    template<typename MatrixType>
    PartInterface<MatrixType>
    createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                        const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
      using impl_type = ImplType<MatrixType>;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

      constexpr int vector_length = impl_type::vector_length;

      const auto comm = A->getRowMap()->getComm();

      PartInterface<MatrixType> interf;

      const bool jacobi = partitions.size() == 0;
      const local_ordinal_type A_n_lclrows = A->getNodeNumRows();
      const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();

#if defined(BLOCKTRIDICONTAINER_DEBUG)
      local_ordinal_type nrows = 0;
      if (jacobi)
        nrows = nparts;
      else
        for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();

      TEUCHOS_TEST_FOR_EXCEPT_MSG
        (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
         << "the same as getNodeNumRows: " << nrows << " vs " << A_n_lclrows);
#endif

      // permutation of parts, sorted by size in decreasing order
      std::vector<local_ordinal_type> p;
      if (jacobi) {
        interf.max_partsz = 1;
      } else {
        typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
        std::vector<size_idx_pair_type> partsz(nparts);
        for (local_ordinal_type i=0;i<nparts;++i)
          partsz[i] = size_idx_pair_type(partitions[i].size(), i);
        std::sort(partsz.begin(), partsz.end(),
                  [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
                    return x.first > y.first;
                  });
        p.resize(nparts);
        for (local_ordinal_type i=0;i<nparts;++i)
          p[i] = partsz[i].second;

        interf.max_partsz = partsz[0].first;
      }

      // allocate the interface arrays
      interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
      interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
      interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
      interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
      interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

      // mirror to host and fill
      const auto partptr = Kokkos::create_mirror_view(interf.partptr);
      const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
      const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
      const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
      const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);

      // determine parts
      interf.row_contiguous = true;
      partptr(0) = 0;
      part2rowidx0(0) = 0;
      part2packrowidx0(0) = 0;
      local_ordinal_type pack_nrows = 0;
      for (local_ordinal_type ip=0;ip<nparts;++ip) {
        const auto* part = jacobi ? NULL : &partitions[p[ip]];
        const local_ordinal_type ipnrows = jacobi ? 1 : part->size();
        TEUCHOS_ASSERT(ip == 0 || (jacobi || ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
        TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
                                    get_msg_prefix(comm)
                                    << "partition " << p[ip]
                                    << " is empty, which is not allowed.");
        part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
        // since parts are sorted in decreasing size, the first part in a pack
        // determines the pack's row count
        if (ip % vector_length == 0) pack_nrows = ipnrows;
        part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
        const local_ordinal_type os = partptr(ip);
        for (local_ordinal_type i=0;i<ipnrows;++i) {
          const auto lcl_row = jacobi ? ip : (*part)[i];
          TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
                                      get_msg_prefix(comm)
                                      << "partitions[" << p[ip] << "]["
                                      << i << "] = " << lcl_row
                                      << " but input matrix implies limits of [0, " << A_n_lclrows-1
                                      << "].");
          lclrow(os+i) = lcl_row;
          rowidx2part(os+i) = ip;
          if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
            interf.row_contiguous = false;
        }
        partptr(ip+1) = os + ipnrows;
      }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
      TEUCHOS_ASSERT(partptr(nparts) == nrows);
#endif
      if (lclrow(0) != 0) interf.row_contiguous = false;

      Kokkos::deep_copy(interf.partptr, partptr);
      Kokkos::deep_copy(interf.lclrow, lclrow);

      // part2rowidx0 is the same array as partptr
      interf.part2rowidx0 = interf.partptr;
      Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);

      interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
      Kokkos::deep_copy(interf.rowidx2part, rowidx2part);

      { // compute packptr
        local_ordinal_type npacks = 0;
        for (local_ordinal_type ip=1;ip<=nparts;++ip)
          if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
            ++npacks;
        interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
        const auto packptr = Kokkos::create_mirror_view(interf.packptr);
        packptr(0) = 0;
        for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
          if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
            packptr(k++) = ip;
        Kokkos::deep_copy(interf.packptr, packptr);
      }
      return interf;
    }
 
    ///
    /// block tridiags
    ///
    template <typename MatrixType>
    struct BlockTridiags {
      using impl_type = ImplType<MatrixType>;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;

      // flat_td_ptr(i) is the index into the flat block array of the start of
      // the i'th tridiag; pack_td_ptr is the same for the packed array
      size_type_1d_view flat_td_ptr, pack_td_ptr;
      // list of positions within A's rows from which to grab block data
      local_ordinal_type_1d_view A_colindsub;
      // packed tridiag block values
      vector_type_3d_view values;

      bool is_diagonal_only;

      BlockTridiags() = default;
      BlockTridiags(const BlockTridiags &b) = default;

      // given a row-major block index within a tridiag, return its block row
      template <typename idx_type>
      static KOKKOS_FORCEINLINE_FUNCTION
      idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
      // given a block row of a tridiag, return the index of its first block
      template <typename idx_type>
      static KOKKOS_FORCEINLINE_FUNCTION
      idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
      // number of blocks in a tridiag with the given number of block rows
      template <typename idx_type>
      static KOKKOS_FORCEINLINE_FUNCTION
      idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
    };
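
    // Worked example (illustrative): a part with nrows = 4 has NumBlocks(4) = 10
    // blocks stored row-major as D0 U0 | L1 D1 U1 | L2 D2 U2 | L3 D3, so
    //   RowToIndex = {0, 2, 5, 8}         // first block of each block row
    //   IndexToRow(4) = (4+1)/3 = 1       // block 4 (U1) belongs to row 1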
 
    ///
    /// block tridiags initialization from the part interface
    ///
    template<typename MatrixType>
    BlockTridiags<MatrixType>
    createBlockTridiags(const PartInterface<MatrixType> &interf) {
      using impl_type = ImplType<MatrixType>;
      using execution_space = typename impl_type::execution_space;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type = typename impl_type::size_type;
      using size_type_1d_view = typename impl_type::size_type_1d_view;

      constexpr int vector_length = impl_type::vector_length;

      BlockTridiags<MatrixType> btdm;

      const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;

      { // construct the flat index pointers into the tridiag values array
        btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
        const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
        Kokkos::parallel_scan
          ("createBlockTridiags::RangePolicy::flat_td_ptr",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
            if (final)
              btdm.flat_td_ptr(i) = update;
            if (i < ntridiags) {
              const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
              update += btdm.NumBlocks(nrows);
            }
          });

        const auto nblocks = Kokkos::create_mirror_view_and_copy
          (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
        // one block per tridiag means every part has a single row
        btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
      }

      // and the packed index pointers
      if (vector_length == 1) {
        btdm.pack_td_ptr = btdm.flat_td_ptr;
      } else {
        const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
        btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
        const Kokkos::RangePolicy<execution_space> policy(0,npacks);
        Kokkos::parallel_scan
          ("createBlockTridiags::RangePolicy::pack_td_ptr",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
            const local_ordinal_type parti      = interf.packptr(i);
            const local_ordinal_type parti_next = interf.packptr(i+1);
            if (final) {
              const size_type nblks = update;
              for (local_ordinal_type pti=parti;pti<parti_next;++pti)
                btdm.pack_td_ptr(pti) = nblks;
              if (i == npacks-1) {
                const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
                btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
              }
            }
            {
              const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
              update += btdm.NumBlocks(nrows);
            }
          });
      }
      return btdm;
    }
 
    ///
    /// Set the tridiags to identity to the full pack block size so that a
    /// tridiag shorter than its pack's longest one is processed safely: the
    /// extra identity blocks ride along in the SIMD calls without checks or
    /// masks. Called once in the symbolic phase; the numeric phase fills only
    /// the used entries, leaving the identity blocks intact.
    ///
    template<typename MatrixType>
    void
    setTridiagsToIdentity
      (const BlockTridiags<MatrixType>& btdm,
       const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
    {
      using impl_type = ImplType<MatrixType>;
      using execution_space = typename impl_type::execution_space;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type_1d_view = typename impl_type::size_type_1d_view;

      const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
      const local_ordinal_type blocksize = btdm.values.extent(1);

      {
        const int vector_length = impl_type::vector_length;
        const int internal_vector_length = impl_type::internal_vector_length;

        using btdm_scalar_type = typename impl_type::btdm_scalar_type;
        using internal_vector_type = typename impl_type::internal_vector_type;
        using internal_vector_type_4d_view =
          typename impl_type::internal_vector_type_4d_view;

        using team_policy_type = Kokkos::TeamPolicy<execution_space>;
        // reinterpret the packed values with the internal (shorter) vector type
        const internal_vector_type_4d_view values
          (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
           btdm.values.extent(0),
           btdm.values.extent(1),
           btdm.values.extent(2),
           vector_length/internal_vector_length);
        const local_ordinal_type vector_loop_size = values.extent(3);
#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
        local_ordinal_type total_team_size(0);
        if      (blocksize <=  5) total_team_size =  32;
        else if (blocksize <=  9) total_team_size =  64;
        else if (blocksize <= 12) total_team_size =  96;
        else if (blocksize <= 16) total_team_size = 128;
        else if (blocksize <= 20) total_team_size = 160;
        else                      total_team_size = 160;
        const local_ordinal_type team_size = total_team_size/vector_loop_size;
        const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
#else // Host architecture: team size is always one
        const team_policy_type policy(packptr.extent(0)-1, 1, 1);
#endif
        Kokkos::parallel_for
          ("setTridiagsToIdentity::TeamPolicy",
           policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
            const local_ordinal_type k = member.league_rank();
            const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
            const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
            const local_ordinal_type diff = iend - ibeg;
            const local_ordinal_type icount = diff/3 + (diff%3 > 0);
            const btdm_scalar_type one(1);
            Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
                // every third block is a diagonal block; set its diagonal to one
                Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
                    const local_ordinal_type i = ibeg + ii*3;
                    for (local_ordinal_type j=0;j<blocksize;++j)
                      values(i,j,j,v) = one;
                  });
              });
          });
      }
    }
 
    ///
    /// A - D efficient interface (amd)
    ///
    template <typename MatrixType>
    struct AmD {
      using impl_type = ImplType<MatrixType>;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;

      // rowptr points to the start of each row of A_colindsub
      size_type_1d_view rowptr, rowptr_remote;
      // positions within A's rows giving the blocks to extract; A_colindsub
      // holds the owned entries and A_colindsub_remote the remote ones when
      // communication and computation are overlapped
      local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;

      // currently always true
      bool is_tpetra_block_crs;

      // if is_tpetra_block_crs, a pointer to A's value data
      impl_scalar_type_1d_view_tpetra tpetra_values;

      AmD() = default;
      AmD(const AmD &b) = default;
    };
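
    // Worked example (illustrative): if local row r of A has four block entries
    // and only the entry at position 1 within the row falls inside r's tridiag,
    // then A_colindsub(rowptr(r) : rowptr(r+1)) = {0, 2, 3}: the positions of
    // the blocks that belong to R = A - D.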
 
    ///
    /// symbolic phase, on host: create R = A - D, pack D
    ///
    template<typename MatrixType>
    void
    performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                         const PartInterface<MatrixType> &interf,
                         BlockTridiags<MatrixType> &btdm,
                         AmD<MatrixType> &amd,
                         const bool overlap_communication_and_computation) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");

      using impl_type = ImplType<MatrixType>;
      using node_memory_space = typename impl_type::node_memory_space;
      using host_execution_space = typename impl_type::host_execution_space;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using global_ordinal_type = typename impl_type::global_ordinal_type;
      using size_type = typename impl_type::size_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

      constexpr int vector_length = impl_type::vector_length;
 
      const auto comm = A->getRowMap()->getComm();
      const auto& g = A->getCrsGraph();
      const auto blocksize = A->getBlockSize();

      // mirror the part interface to host
      const auto partptr = Kokkos::create_mirror_view_and_copy     (Kokkos::HostSpace(), interf.partptr);
      const auto lclrow = Kokkos::create_mirror_view_and_copy      (Kokkos::HostSpace(), interf.lclrow);
      const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
      const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
      const auto packptr = Kokkos::create_mirror_view_and_copy     (Kokkos::HostSpace(), interf.packptr);

      const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);

      // construct the owned-column -> local-row map (invalid for non-owned columns)
      Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getNodeNumCols());
      Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
      {
        const auto rowmap = g.getRowMap();
        const auto colmap = g.getColMap();
        const auto dommap = g.getDomainMap();
        TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));

#if !defined(__CUDA_ARCH__)
        const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
        Kokkos::parallel_for
          ("performSymbolicPhase::RangePolicy::col2row",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
            const global_ordinal_type gid = rowmap->getGlobalElement(lr);
            if (dommap->isNodeGlobalElement(gid)) {
              const local_ordinal_type lc = colmap->getLocalElement(gid);
#  if defined(BLOCKTRIDICONTAINER_DEBUG)
              TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
                                          get_msg_prefix(comm) << "GID " << gid
                                          << " gives an invalid local column.");
#  endif
              col2row(lr) = lc;
            }
          });
#endif
      }
 
      // construct the D and R graphs
      {
        const auto& local_graph = g.getLocalGraph();
        const auto& local_graph_rowptr = local_graph.row_map;
        TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
        const auto& local_graph_colidx = local_graph.entries;

        // lclrow2idx inverts lclrow (no overlap between parts is assumed)
        Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
        {
          const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
          Kokkos::parallel_for
            ("performSymbolicPhase::RangePolicy::lclrow2idx",
             policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
              lclrow2idx[lclrow(i)] = i;
            });
        }
 
        // count (block) nnzs in D and R
        typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
        typename sum_reducer_type::value_type sum_reducer_value;
        {
          const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
          Kokkos::parallel_reduce
            ("performSymbolicPhase::RangePolicy::count_nnz",
             policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
              // LID -> index
              const local_ordinal_type ri0 = lclrow2idx[lr];
              const local_ordinal_type pi0 = rowidx2part(ri0);
              for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
                const local_ordinal_type lc = local_graph_colidx(j);
                const local_ordinal_type lc2r = col2row[lc];
                bool incr_R = false;
                do { // breakable
                  if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
                    incr_R = true;
                    break;
                  }
                  const local_ordinal_type ri = lclrow2idx[lc2r];
                  const local_ordinal_type pi = rowidx2part(ri);
                  if (pi != pi0) {
                    incr_R = true;
                    break;
                  }
                  // test for being in the tridiag, done in index space
                  if (ri0 + 1 >= ri && ri0 <= ri + 1)
                    ++update.v[0]; // D_nnz
                  else
                    incr_R = true;
                } while (0);
                if (incr_R) {
                  if (lc < nrows) ++update.v[1]; // R_nnz_owned
                  else ++update.v[2];            // R_nnz_remote
                }
              }
            }, sum_reducer_type(sum_reducer_value));
        }
        size_type D_nnz = sum_reducer_value.v[0];
        size_type R_nnz_owned = sum_reducer_value.v[1];
        size_type R_nnz_remote = sum_reducer_value.v[2];

        if (!overlap_communication_and_computation) {
          R_nnz_owned += R_nnz_remote;
          R_nnz_remote = 0;
        }
 
        // construct the D graph
        {
          const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);

          btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
          const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);

#if defined(BLOCKTRIDICONTAINER_DEBUG)
          Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif

          const local_ordinal_type nparts = partptr.extent(0) - 1;
          {
            const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
            Kokkos::parallel_for
              ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
               policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
                const local_ordinal_type part_ri0 = part2rowidx0(pi0);
                local_ordinal_type offset = 0;
                for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
                  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
                  offset = 1; // every row after the first has a sub-diagonal block
                  const local_ordinal_type lr0 = lclrow(ri0);
                  const size_type j0 = local_graph_rowptr(lr0);
                  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
                    const local_ordinal_type lc = local_graph_colidx(j);
                    const local_ordinal_type lc2r = col2row[lc];
                    if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) continue;
                    const local_ordinal_type ri = lclrow2idx[lc2r];
                    const local_ordinal_type pi = rowidx2part(ri);
                    if (pi != pi0) continue;
                    if (ri + 1 < ri0 || ri > ri0 + 1) continue;
                    const local_ordinal_type row_entry = j - j0;
                    D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
                  }
                }
              });
          }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
          for (size_t i=0;i<D_A_colindsub.extent(0);++i)
            TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif
          Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);

          // allocate values and set the pack-padding tridiag blocks to identity
          {
            const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
            const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
            btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
            if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
          }
        }
 
 1664           amd.rowptr = size_type_1d_view(
"amd.rowptr", nrows + 1);
 
 1665           amd.A_colindsub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"amd.A_colindsub"), R_nnz_owned);
 
 1667           const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
 
 1668           const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
 
 1670           amd.rowptr_remote = size_type_1d_view(
"amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
 
 1671           amd.A_colindsub_remote = local_ordinal_type_1d_view(
do_not_initialize_tag(
"amd.A_colindsub_remote"), R_nnz_remote);
 
 1673           const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
 
 1674           const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
 
 1677             const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
 
 1678             Kokkos::parallel_for
 
 1679               (
"performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
 
 1680                policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
 
 1681                 const local_ordinal_type ri0 = lclrow2idx[lr];
 
 1682                 const local_ordinal_type pi0 = rowidx2part(ri0);
 
 1683                 const size_type j0 = local_graph_rowptr(lr);
 
 1684                 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
 
 1685                   const local_ordinal_type lc = local_graph_colidx(j);
 
 1686                   const local_ordinal_type lc2r = col2row[lc];
 
 1688                     const local_ordinal_type ri = lclrow2idx[lc2r];
 
 1689                     const local_ordinal_type pi = rowidx2part(ri);
 
 1690                     if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
 
 1695                   if (!overlap_communication_and_computation || lc < nrows) {
 
 1698                     ++R_rowptr_remote(lr);
 
 1707             Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
 
 1708             Kokkos::parallel_scan
 
 1709               (
"performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
 
 1710                policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr,
 
 1711                                      update_type &update,
 
 1712                                      const bool &
final) {
 
 1714                 val.v[0] = R_rowptr(lr);
 
 1715                 if (overlap_communication_and_computation)
 
 1716                   val.v[1] = R_rowptr_remote(lr);
 
 1719                   R_rowptr(lr) = update.v[0];
 
 1720                   if (overlap_communication_and_computation)
 
 1721                     R_rowptr_remote(lr) = update.v[1];
 
 1724                     const local_ordinal_type ri0 = lclrow2idx[lr];
 
 1725                     const local_ordinal_type pi0 = rowidx2part(ri0);
 
 1727                     size_type cnt_rowptr = R_rowptr(lr);
 
 1728                     size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; 
 
 1730                     const size_type j0 = local_graph_rowptr(lr);
 
 1731                     for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
 
 1732                       const local_ordinal_type lc = local_graph_colidx(j);
 
 1733                       const local_ordinal_type lc2r = col2row[lc];
 
 1735                         const local_ordinal_type ri = lclrow2idx[lc2r];
 
 1736                         const local_ordinal_type pi = rowidx2part(ri);
 
 1737                         if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
 
 1740                       const local_ordinal_type row_entry = j - j0;
 
 1741                       if (!overlap_communication_and_computation || lc < nrows)
 
 1742                         R_A_colindsub(cnt_rowptr++) = row_entry;
 
 1744                         R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
 
 1752           Kokkos::deep_copy(amd.rowptr, R_rowptr);
 
 1753           Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
 
 1754           if (overlap_communication_and_computation) {
 
 1756             Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
 
 1757             Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
 
 1761           amd.tpetra_values = (
const_cast<block_crs_matrix_type*
>(A.get())->
 
 1762                                template getValues<node_memory_space>());
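    // Layout sketch (informal, inferred from btdm.RowToIndex above): within a
    // part of nrows block rows, the flat tridiag stores
    //   D(0) U(0) | L(1) D(1) U(1) | ... | L(nrows-1) D(nrows-1),
    // i.e. 3*nrows - 2 blocks, with row r's first block at offset 3*r - 1
    // (0 for r == 0); flat_td_ptr(pi) shifts this to part pi's base.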
 
    ///
    /// extract and factorize the block tridiagonals: default mode and algo
    ///
    template<typename ArgActiveExecutionMemorySpace>
    struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo {
      typedef KB::Mode::Serial mode_type;
#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
      typedef KB::Algo::Level3::CompactMKL algo_type;
#else
      typedef KB::Algo::Level3::Blocked algo_type;
#endif
      static int recommended_team_size(const int,
                                       const int,
                                       const int) {
        return 1;
      }
    };

#if defined(KOKKOS_ENABLE_CUDA)
    static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
                                                                 const int vector_length,
                                                                 const int internal_vector_length) {
      const int vector_size = vector_length/internal_vector_length;
      int total_team_size(0);
      if      (blksize <=  5) total_team_size =  32;
      else if (blksize <=  9) total_team_size =  32;
      else if (blksize <= 12) total_team_size =  96;
      else if (blksize <= 16) total_team_size = 128;
      else if (blksize <= 20) total_team_size = 160;
      else                    total_team_size = 160;
      return 2*total_team_size/vector_size;
    }
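    // Worked example (illustrative): blksize = 16, vector_length = 8, and
    // internal_vector_length = 4 give vector_size = 2 and total_team_size =
    // 128, so the recommended team size is 2*128/2 = 128.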
 
    template<>
    struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
      typedef KB::Mode::Team mode_type;
      typedef KB::Algo::Level3::Unblocked algo_type;
      static int recommended_team_size(const int blksize,
                                       const int vector_length,
                                       const int internal_vector_length) {
        return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
      }
    };
    template<>
    struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
      typedef KB::Mode::Team mode_type;
      typedef KB::Algo::Level3::Unblocked algo_type;
      static int recommended_team_size(const int blksize,
                                       const int vector_length,
                                       const int internal_vector_length) {
        return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
      }
    };
#endif
 
    template<typename MatrixType>
    struct ExtractAndFactorizeTridiags {
    public:
      using impl_type = ImplType<MatrixType>;
      using execution_space = typename impl_type::execution_space;
      using memory_space = typename impl_type::memory_space;
      /// scalar types
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type = typename impl_type::size_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;
      using magnitude_type = typename impl_type::magnitude_type;
      /// tpetra interface
      using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
      /// views
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
      /// vectorization
      using btdm_scalar_type = typename impl_type::btdm_scalar_type;
      using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
      using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
      using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
      using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;

      using internal_vector_type = typename impl_type::internal_vector_type;
      static constexpr int vector_length = impl_type::vector_length;
      static constexpr int internal_vector_length = impl_type::internal_vector_length;

      /// team policy member type
      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      using member_type = typename team_policy_type::member_type;

    private:
      // part interface
      const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
      const local_ordinal_type max_partsz;
      // block crs matrix; the graph rowptr value type varies with the node device
      using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::node_device_type>::size_type;
      using size_type_1d_view_tpetra = Kokkos::View<a_rowptr_value_type*,typename impl_type::node_device_type>;
      const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
      const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
      // block tridiags
      const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
      const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
      const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
      const Unmanaged<btdm_scalar_type_4d_view> scalar_values;
      // shared information
      const local_ordinal_type blocksize, blocksize_square;
      // diagonal safeguard
      const magnitude_type tiny;
      const local_ordinal_type vector_loop_size;
      const local_ordinal_type vector_length_value;

    public:
      ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
                                  const PartInterface<MatrixType> &interf_,
                                  const Teuchos::RCP<const block_crs_matrix_type> &A_,
                                  const magnitude_type& tiny_) :
        // interface
        partptr(interf_.partptr),
        lclrow(interf_.lclrow),
        packptr(interf_.packptr),
        max_partsz(interf_.max_partsz),
        // block crs matrix
        A_rowptr(A_->getCrsGraph().getLocalGraph().row_map),
        A_values(const_cast<block_crs_matrix_type*>(A_.get())->template getValues<memory_space>()),
        // block tridiags
        pack_td_ptr(btdm_.pack_td_ptr),
        flat_td_ptr(btdm_.flat_td_ptr),
        A_colindsub(btdm_.A_colindsub),
        internal_vector_values((internal_vector_type*)btdm_.values.data(),
                               btdm_.values.extent(0),
                               btdm_.values.extent(1),
                               btdm_.values.extent(2),
                               vector_length/internal_vector_length),
        scalar_values((btdm_scalar_type*)btdm_.values.data(),
                      btdm_.values.extent(0),
                      btdm_.values.extent(1),
                      btdm_.values.extent(2),
                      vector_length),
        blocksize(btdm_.values.extent(1)),
        blocksize_square(blocksize*blocksize),
        tiny(tiny_),
        vector_loop_size(vector_length/internal_vector_length),
        vector_length_value(vector_length) {}
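      // Note (sketch of the aliasing above): btdm_.values is a rank-3 view of
      // SIMD vectors of width vector_length; internal_vector_values and
      // scalar_values reinterpret the same allocation as rank-4 views whose
      // last extent is vector_length/internal_vector_length internal-vector
      // lanes or vector_length scalars, respectively.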
 
    private:

      KOKKOS_INLINE_FUNCTION
      void
      extract(local_ordinal_type partidx,
              local_ordinal_type npacks) const {
        const size_type kps = pack_td_ptr(partidx);
        local_ordinal_type kfs[vector_length] = {};
        local_ordinal_type ri0[vector_length] = {};
        local_ordinal_type nrows[vector_length] = {};

        for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
          kfs[vi] = flat_td_ptr(partidx);
          ri0[vi] = partptr(partidx);
          nrows[vi] = partptr(partidx+1) - ri0[vi];
        }
        for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
          for (local_ordinal_type e=0;e<3;++e) {
            const impl_scalar_type* block[vector_length] = {};
            for (local_ordinal_type vi=0;vi<npacks;++vi) {
              const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
              block[vi] = &A_values(Aj*blocksize_square);
            }
            const size_type pi = kps + j;
            ++j;
            for (local_ordinal_type ii=0;ii<blocksize;++ii) {
              for (local_ordinal_type jj=0;jj<blocksize;++jj) {
                const auto idx = ii*blocksize + jj;
                auto& v = internal_vector_values(pi, ii, jj, 0);
                for (local_ordinal_type vi=0;vi<npacks;++vi)
                  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
              }
            }

            if (nrows[0] == 1) break;
            if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
            for (local_ordinal_type vi=1;vi<npacks;++vi) {
              if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
                npacks = vi;
                break;
              }
            }
          }
        }
      }
 
      KOKKOS_INLINE_FUNCTION
      void
      extract(const member_type &member,
              const local_ordinal_type &partidxbeg,
              const local_ordinal_type &npacks,
              const local_ordinal_type &vbeg) const {
        local_ordinal_type kfs_vals[internal_vector_length] = {};
        local_ordinal_type ri0_vals[internal_vector_length] = {};
        local_ordinal_type nrows_vals[internal_vector_length] = {};

        const size_type kps = pack_td_ptr(partidxbeg);
        for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
          kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
          ri0_vals[vi] = partptr(partidxbeg+vi);
          nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
        }

        local_ordinal_type j_vals[internal_vector_length] = {};
        for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {
          for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
            const local_ordinal_type nrows = nrows_vals[vi];
            if (tr < nrows) {
              auto &j = j_vals[vi];
              const local_ordinal_type kfs = kfs_vals[vi];
              const local_ordinal_type ri0 = ri0_vals[vi];
              const local_ordinal_type lbeg = (tr == 0         ? 1 : 0);
              const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
              for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
                const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
                const impl_scalar_type* block = &A_values(Aj*blocksize_square);
                const size_type pi = kps + j;
                Kokkos::parallel_for
                  (Kokkos::TeamThreadRange(member,blocksize),
                   [&](const local_ordinal_type &ii) {
                    for (local_ordinal_type jj=0;jj<blocksize;++jj)
                      scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[ii*blocksize + jj]);
                  });
              }
            }
          }
        }
      }
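      // Per row tr, the local entry index l runs over [lbeg, lend): an
      // interior row extracts its (sub, diag, super) blocks l = 0,1,2, the
      // first row skips the absent subdiagonal (lbeg = 1), and the last row
      // skips the absent superdiagonal (lend = 2).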
 
      template<typename AAViewType,
               typename WWViewType>
      KOKKOS_INLINE_FUNCTION
      void
      factorize(const member_type &member,
                const local_ordinal_type &i0,
                const local_ordinal_type &nrows,
                const local_ordinal_type &v,
                const AAViewType &AA,
                const WWViewType &WW) const {
        typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
          <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
        typedef default_mode_and_algo_type::mode_type default_mode_type;
        typedef default_mode_and_algo_type::algo_type default_algo_type;

        // constant
        const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();

        // subview pattern
        auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
        KB::LU<member_type,
               default_mode_type,KB::Algo::LU::Unblocked>
          ::invoke(member, A , tiny);
        if (nrows > 1) {
          auto B = A;
          auto C = A;
          local_ordinal_type i = i0;
          for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
            B.assign_data( &AA(i+1,0,0,v) );
            KB::Trsm<member_type,
                     KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                     default_mode_type,default_algo_type>
              ::invoke(member, one, A, B);
            C.assign_data( &AA(i+2,0,0,v) );
            KB::Trsm<member_type,
                     KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                     default_mode_type,default_algo_type>
              ::invoke(member, one, A, C);
            A.assign_data( &AA(i+3,0,0,v) );
            KB::Gemm<member_type,
                     KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                     default_mode_type,default_algo_type>
              ::invoke(member, -one, C, B, one, A);
            KB::LU<member_type,
                   default_mode_type,KB::Algo::LU::Unblocked>
              ::invoke(member, A, tiny);
          }
        } else {
          // single-row part: overwrite A with its explicit inverse; copy A to
          // W, set A to identity, then apply the two triangular solves
          auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
          KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
            ::invoke(member, A, W);
          KB::SetIdentity<member_type,default_mode_type>
            ::invoke(member, A);
          member.team_barrier();
          KB::Trsm<member_type,
                   KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                   default_mode_type,default_algo_type>
            ::invoke(member, one, W, A);
          KB::Trsm<member_type,
                   KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                   default_mode_type,default_algo_type>
            ::invoke(member, one, W, A);
        }
      }
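      // Recurrence implemented above (standard block tridiagonal LU, sketched
      // informally): with the current diagonal factored as D = L*U,
      //   U_k     <- L^{-1} * U_k                   (Trsm Left/Lower/Unit)
      //   L_{k+1} <- L_{k+1} * U^{-1}               (Trsm Right/Upper/NonUnit)
      //   D_{k+1} <- LU(D_{k+1} - L_{k+1} * U_k)    (Gemm, then LU)
      // A single-row part instead stores the explicit inverse of its diagonal
      // block, formed by two triangular solves against the identity.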
 
    public:

      struct ExtractAndFactorizeTag {};

      KOKKOS_INLINE_FUNCTION
      void
      operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
        // btdm is packed and sorted from the largest part
        const local_ordinal_type packidx = member.league_rank();

        const local_ordinal_type partidx = packptr(packidx);
        const local_ordinal_type npacks = packptr(packidx+1) - partidx;
        const local_ordinal_type i0 = pack_td_ptr(partidx);
        const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);

        internal_vector_scratch_type_3d_view
          WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
        if (vector_loop_size == 1) {
          extract(partidx, npacks);
          factorize(member, i0, nrows, 0, internal_vector_values, WW);
        } else {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const local_ordinal_type &v) {
              const local_ordinal_type vbeg = v*internal_vector_length;
              if (vbeg < npacks)
                extract(member, partidx+vbeg, npacks, vbeg);
              factorize(member, i0, nrows, v, internal_vector_values, WW);
            });
        }
      }

      void run() {
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
        const local_ordinal_type team_size =
          ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
          recommended_team_size(blocksize, vector_length, internal_vector_length);
        const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
          shmem_size(blocksize, blocksize, vector_loop_size);

        Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
          policy(packptr.extent(0)-1, team_size, vector_loop_size);
#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
        Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
                             policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
#else
        policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
        Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
                             policy, *this);
#endif
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      }
    };
 
    ///
    /// top level numeric interface
    ///
    template<typename MatrixType>
    void
    performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                        const PartInterface<MatrixType> &interf,
                        BlockTridiags<MatrixType> &btdm,
                        const typename ImplType<MatrixType>::magnitude_type tiny) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
      ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
      function.run();
    }
 
    ///
    /// pack multivector
    ///
    template<typename MatrixType>
    struct MultiVectorConverter {
    public:
      using impl_type = ImplType<MatrixType>;
      using execution_space = typename impl_type::execution_space;
      using memory_space = typename impl_type::memory_space;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using btdm_scalar_type = typename impl_type::btdm_scalar_type;
      using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
      static constexpr int vector_length = impl_type::vector_length;

      using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

    private:
      // part interface
      const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
      const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
      const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
      const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
      const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
      const local_ordinal_type blocksize;
      const local_ordinal_type num_vectors;

      // packed and flat multivectors
      vector_type_3d_view packed_multivector;
      impl_scalar_type_2d_view_tpetra scalar_multivector;

      template<typename TagType>
      KOKKOS_INLINE_FUNCTION
      void copy_multivectors(const local_ordinal_type &j,
                             const local_ordinal_type &vi,
                             const local_ordinal_type &pri,
                             const local_ordinal_type &ri0) const {
        for (local_ordinal_type col=0;col<num_vectors;++col)
          for (local_ordinal_type i=0;i<blocksize;++i)
            packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
      }

    public:
      MultiVectorConverter(const PartInterface<MatrixType> &interf,
                           const vector_type_3d_view &pmv)
        : partptr(interf.partptr),
          packptr(interf.packptr),
          part2packrowidx0(interf.part2packrowidx0),
          part2rowidx0(interf.part2rowidx0),
          lclrow(interf.lclrow),
          blocksize(pmv.extent(1)),
          num_vectors(pmv.extent(2)),
          packed_multivector(pmv) {}
 
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const local_ordinal_type &packidx) const {
        local_ordinal_type partidx = packptr(packidx);
        local_ordinal_type npacks = packptr(packidx+1) - partidx;
        const local_ordinal_type pri0 = part2packrowidx0(partidx);

        local_ordinal_type ri0[vector_length] = {};
        local_ordinal_type nrows[vector_length] = {};
        for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
          ri0[v] = part2rowidx0(partidx);
          nrows[v] = part2rowidx0(partidx+1) - ri0[v];
        }
        for (local_ordinal_type j=0;j<nrows[0];++j) {
          local_ordinal_type cnt = 1;
          for (;cnt<npacks && j != nrows[cnt];++cnt);
          npacks = cnt;
          const local_ordinal_type pri = pri0 + j;
          for (local_ordinal_type col=0;col<num_vectors;++col)
            for (local_ordinal_type i=0;i<blocksize;++i)
              for (local_ordinal_type v=0;v<npacks;++v)
                packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
        }
      }
 
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const member_type &member) const {
        const local_ordinal_type packidx = member.league_rank();
        const local_ordinal_type partidx_begin = packptr(packidx);
        const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
        const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
            const local_ordinal_type partidx = partidx_begin + v;
            const local_ordinal_type ri0 = part2rowidx0(partidx);
            const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;

            if (nrows == 1) {
              const local_ordinal_type pri = pri0;
              for (local_ordinal_type col=0;col<num_vectors;++col) {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
                    packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
                  });
              }
            } else {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
                  const local_ordinal_type pri = pri0 + j;
                  for (local_ordinal_type col=0;col<num_vectors;++col)
                    for (local_ordinal_type i=0;i<blocksize;++i)
                      packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
                });
            }
          });
      }
 
      void run(const impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter");

        scalar_multivector = scalar_multivector_;
        if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
          const local_ordinal_type vl = vector_length;
          const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
          Kokkos::parallel_for
            ("MultiVectorConverter::TeamPolicy", policy, *this);
#endif
        } else {
#if defined(__CUDA_ARCH__)
          TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
          const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
          Kokkos::parallel_for
            ("MultiVectorConverter::RangePolicy", policy, *this);
#endif
        }
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      }
    };
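    // Packing sketch: lane v of the SIMD entry packed_multivector(pri, i, col)
    // holds scalar_multivector(blocksize*lclrow(ri0[v]+j)+i, col); parts of
    // equal length thus march through the tridiag solves together, one SIMD
    // lane per part.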
 
    ///
    /// solve tridiags: default mode and algo
    ///
    template<typename ArgActiveExecutionMemorySpace>
    struct SolveTridiagsDefaultModeAndAlgo {
      typedef KB::Mode::Serial mode_type;
      typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
      typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
#else
      typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
#endif
      static int recommended_team_size(const int,
                                       const int,
                                       const int) {
        return 1;
      }
    };

#if defined(KOKKOS_ENABLE_CUDA)
    static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
                                                           const int vector_length,
                                                           const int internal_vector_length) {
      const int vector_size = vector_length/internal_vector_length;
      int total_team_size(0);
      if      (blksize <=  5) total_team_size =  32;
      else if (blksize <=  9) total_team_size =  32;
      else if (blksize <= 12) total_team_size =  96;
      else if (blksize <= 16) total_team_size = 128;
      else if (blksize <= 20) total_team_size = 160;
      else                    total_team_size = 160;
      return total_team_size/vector_size;
    }
 
    template<>
    struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
      typedef KB::Mode::Team mode_type;
      typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
      typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
      static int recommended_team_size(const int blksize,
                                       const int vector_length,
                                       const int internal_vector_length) {
        return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
      }
    };
    template<>
    struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
      typedef KB::Mode::Team mode_type;
      typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
      typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
      static int recommended_team_size(const int blksize,
                                       const int vector_length,
                                       const int internal_vector_length) {
        return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
      }
    };
#endif
 
    template<typename MatrixType>
    struct SolveTridiags {
    public:
      using impl_type = ImplType<MatrixType>;
      using execution_space = typename impl_type::execution_space;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type = typename impl_type::size_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;
      using magnitude_type = typename impl_type::magnitude_type;
      using btdm_scalar_type = typename impl_type::btdm_scalar_type;
      using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
      /// views
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      /// vectorization
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;

      using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;

      using internal_vector_type = typename impl_type::internal_vector_type;
      static constexpr int vector_length = impl_type::vector_length;
      static constexpr int internal_vector_length = impl_type::internal_vector_length;

      /// flat multivector views
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
      using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

      /// team policy member type
      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      using member_type = typename team_policy_type::member_type;

    private:
      // part interface
      const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
      const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
      const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
      const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;

      // block tridiags
      const ConstUnmanaged<size_type_1d_view> pack_td_ptr;

      // block tridiag values and packed multivector
      const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
      const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;

      const local_ordinal_type vector_loop_size;

      // copy-to-flat-multivector interface
      Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
#if defined(__CUDA_ARCH__)
      AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
#else
      Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
#endif
      const impl_scalar_type df;
      const bool compute_diff;

    public:
      SolveTridiags(const PartInterface<MatrixType> &interf,
                    const BlockTridiags<MatrixType> &btdm,
                    const vector_type_3d_view &pmv,
                    const impl_scalar_type damping_factor,
                    const bool is_norm_manager_active)
        :
        // interface
        partptr(interf.partptr),
        packptr(interf.packptr),
        part2packrowidx0(interf.part2packrowidx0),
        lclrow(interf.lclrow),
        // block tridiags and packed multivector
        pack_td_ptr(btdm.pack_td_ptr),
        D_internal_vector_values((internal_vector_type*)btdm.values.data(),
                                 btdm.values.extent(0),
                                 btdm.values.extent(1),
                                 btdm.values.extent(2),
                                 vector_length/internal_vector_length),
        X_internal_vector_values((internal_vector_type*)pmv.data(),
                                 pmv.extent(0),
                                 pmv.extent(1),
                                 pmv.extent(2),
                                 vector_length/internal_vector_length),
        vector_loop_size(vector_length/internal_vector_length),
        Y_scalar_multivector(),
        Z_scalar_vector(),
        df(damping_factor),
        compute_diff(is_norm_manager_active)
      {}
 
      /// move the packed multivector into the flat multivector
      KOKKOS_INLINE_FUNCTION
      void
      copyToFlatMultiVector(const member_type &member,
                            const local_ordinal_type partidxbeg, // partidx for v = 0
                            const local_ordinal_type npacks,
                            const local_ordinal_type pri0,
                            const local_ordinal_type v, // index within the vector loop
                            const local_ordinal_type blocksize,
                            const local_ordinal_type num_vectors) const {
        const local_ordinal_type vbeg = v*internal_vector_length;
        if (vbeg < npacks) {
          local_ordinal_type ri0_vals[internal_vector_length] = {};
          local_ordinal_type nrows_vals[internal_vector_length] = {};
          for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
            const local_ordinal_type partidx = partidxbeg+vv;
            ri0_vals[vi] = partptr(partidx);
            nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
          }

          impl_scalar_type z_partial_sum(0);
          if (nrows_vals[0] == 1) {
            const local_ordinal_type j=0, pri=pri0;
            for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
              const local_ordinal_type ri0 = ri0_vals[vi];
              const local_ordinal_type nrows = nrows_vals[vi];
              if (j < nrows) {
                Kokkos::parallel_for
                  (Kokkos::TeamThreadRange(member, blocksize),
                   [&](const local_ordinal_type &i) {
                    const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
                    for (local_ordinal_type col=0;col<num_vectors;++col) {
                      impl_scalar_type &y = Y_scalar_multivector(row,col);
                      const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
                      y += df*yd;
                      const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
                      z_partial_sum += yd_abs*yd_abs;
                    }
                  });
              }
            }
          } else {
            Kokkos::parallel_for
              (Kokkos::TeamThreadRange(member, nrows_vals[0]),
               [&](const local_ordinal_type &j) {
                const local_ordinal_type pri = pri0 + j;
                for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
                  const local_ordinal_type ri0 = ri0_vals[vi];
                  const local_ordinal_type nrows = nrows_vals[vi];
                  if (j < nrows) {
                    for (local_ordinal_type col=0;col<num_vectors;++col) {
                      for (local_ordinal_type i=0;i<blocksize;++i) {
                        const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
                        impl_scalar_type &y = Y_scalar_multivector(row,col);
                        const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
                        y += df*yd;
                        const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
                        z_partial_sum += yd_abs*yd_abs;
                      }
                    }
                  }
                }
              });
          }
          if (compute_diff)
            Z_scalar_vector(member.league_rank()) += z_partial_sum;
        }
      }
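      // In effect this applies the damped update y <- y + df*(xhat - y) per
      // entry, where xhat is the packed tridiag solution, and, when the norm
      // manager is active, accumulates |xhat - y|^2 per team into
      // Z_scalar_vector for the later convergence check.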
 
      /// solve a block tridiagonal system with a single right-hand side
      template<typename WWViewType>
      KOKKOS_INLINE_FUNCTION
      void
      solveSingleVector(const member_type &member,
                        const local_ordinal_type &blocksize,
                        const local_ordinal_type &i0,
                        const local_ordinal_type &r0,
                        const local_ordinal_type &nrows,
                        const local_ordinal_type &v,
                        const WWViewType &WW) const {
        typedef SolveTridiagsDefaultModeAndAlgo
          <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
        typedef default_mode_and_algo_type::mode_type default_mode_type;
        typedef default_mode_and_algo_type::single_vector_algo_type default_algo_type;

        // base pointers into the block tridiag values and the packed rhs
        auto A = D_internal_vector_values.data();
        auto X = X_internal_vector_values.data();

        // constants
        const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
        const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();

        // strides of the flat block storage
        const local_ordinal_type astep = D_internal_vector_values.stride_0();
        const local_ordinal_type as0 = D_internal_vector_values.stride_1();
        const local_ordinal_type as1 = D_internal_vector_values.stride_2();
        const local_ordinal_type xstep = X_internal_vector_values.stride_0();
        const local_ordinal_type xs0 = X_internal_vector_values.stride_1();

        // move to the beginning of this pack and SIMD lane
        A += i0*astep + v;
        X += r0*xstep + v;

        if (nrows > 1) {
          // solve Lx = x
          KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
            (default_mode_type,default_algo_type,
             member,
             KB::Diag::Unit,
             blocksize,blocksize,
             one,
             A, as0, as1,
             X, xs0);

          for (local_ordinal_type tr=1;tr<nrows;++tr) {
            KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
              (default_mode_type,default_algo_type,
               member,
               blocksize, blocksize,
               -one,
               A+2*astep, as0, as1,
               X, xs0,
               one,
               X+1*xstep, xs0);
            KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
              (default_mode_type,default_algo_type,
               member,
               KB::Diag::Unit,
               blocksize,blocksize,
               one,
               A+3*astep, as0, as1,
               X+1*xstep, xs0);
            A += 3*astep;
            X += 1*xstep;
          }

          // solve Ux = x
          KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
            (default_mode_type,default_algo_type,
             member,
             KB::Diag::NonUnit,
             blocksize, blocksize,
             one,
             A, as0, as1,
             X, xs0);

          for (local_ordinal_type tr=nrows;tr>1;--tr) {
            A -= 3*astep;
            KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
              (default_mode_type,default_algo_type,
               member,
               blocksize, blocksize,
               -one,
               A+1*astep, as0, as1,
               X, xs0,
               one,
               X-1*xstep, xs0);
            KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
              (default_mode_type,default_algo_type,
               member,
               KB::Diag::NonUnit,
               blocksize, blocksize,
               one,
               A, as0, as1,
               X-1*xstep, xs0);
            X -= 1*xstep;
          }
        } else {
          // single-row part: its diagonal block already holds the inverse
          const local_ordinal_type ws0 = WW.stride_0();
          auto W = WW.data() + v;
          KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
            (default_mode_type,
             member, blocksize, X, xs0, W, ws0);
          KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
            (default_mode_type,default_algo_type,
             member,
             blocksize, blocksize,
             one,
             A, as0, as1,
             W, ws0,
             zero,
             X, xs0);
        }
      }
      template<typename WWViewType>
      KOKKOS_INLINE_FUNCTION
      void
      solveMultiVector(const member_type &member,
                       const local_ordinal_type &,
                       const local_ordinal_type &i0,
                       const local_ordinal_type &r0,
                       const local_ordinal_type &nrows,
                       const local_ordinal_type &v,
                       const WWViewType &WW) const {
        typedef SolveTridiagsDefaultModeAndAlgo
          <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
        typedef default_mode_and_algo_type::mode_type default_mode_type;
        typedef default_mode_and_algo_type::multi_vector_algo_type default_algo_type;

        // constants
        const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
        const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();

        // subview pattern
        auto A  = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
        auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
        auto X2 = X1;

        local_ordinal_type i = i0, r = r0;

        if (nrows > 1) {
          // solve Lx = x
          KB::Trsm<member_type,
                   KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                   default_mode_type,default_algo_type>
            ::invoke(member, one, A, X1);
          for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
            A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
            X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
            KB::Gemm<member_type,
                     KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                     default_mode_type,default_algo_type>
              ::invoke(member, -one, A, X1, one, X2);
            A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
            KB::Trsm<member_type,
                     KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                     default_mode_type,default_algo_type>
              ::invoke(member, one, A, X2);
            X1.assign_data( X2.data() );
          }

          // solve Ux = x
          KB::Trsm<member_type,
                   KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                   default_mode_type,default_algo_type>
            ::invoke(member, one, A, X1);
          for (local_ordinal_type tr=nrows;tr>1;--tr) {
            i -= 3;
            A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
            X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
            KB::Gemm<member_type,
                     KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                     default_mode_type,default_algo_type>
              ::invoke(member, -one, A, X1, one, X2);
            A.assign_data( &D_internal_vector_values(i,0,0,v) );
            KB::Trsm<member_type,
                     KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                     default_mode_type,default_algo_type>
              ::invoke(member, one, A, X2);
            X1.assign_data( X2.data() );
          }
        } else {
          // single-row part: its diagonal block already holds the inverse
          auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
          KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
            ::invoke(member, X1, W);
          KB::Gemm<member_type,
                   KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                   default_mode_type,default_algo_type>
            ::invoke(member, one, A, W, zero, X1);
        }
      }
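      // Same sweeps as solveSingleVector, but each row carries num_vectors
      // right-hand sides, so Level-3 Trsm/Gemm on subviews replace the
      // strided Level-2 Trsv/Gemv kernels.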
 
      template<int B> struct SingleVectorTag {};
      template<int B> struct MultiVectorTag {};

      template<int B>
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const SingleVectorTag<B> &, const member_type &member) const {
        const local_ordinal_type packidx = member.league_rank();
        const local_ordinal_type partidx = packptr(packidx);
        const local_ordinal_type npacks = packptr(packidx+1) - partidx;
        const local_ordinal_type pri0 = part2packrowidx0(partidx);
        const local_ordinal_type i0 = pack_td_ptr(partidx);
        const local_ordinal_type r0 = part2packrowidx0(partidx);
        const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
        const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
        const local_ordinal_type num_vectors = 1;
        internal_vector_scratch_type_3d_view
          WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
        Kokkos::single(Kokkos::PerTeam(member), [&]() {
            Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
          });
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
            solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
            copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
          });
      }

      template<int B>
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const MultiVectorTag<B> &, const member_type &member) const {
        const local_ordinal_type packidx = member.league_rank();
        const local_ordinal_type partidx = packptr(packidx);
        const local_ordinal_type npacks = packptr(packidx+1) - partidx;
        const local_ordinal_type pri0 = part2packrowidx0(partidx);
        const local_ordinal_type i0 = pack_td_ptr(partidx);
        const local_ordinal_type r0 = part2packrowidx0(partidx);
        const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
        const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
        const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);

        internal_vector_scratch_type_3d_view
          WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
        Kokkos::single(Kokkos::PerTeam(member), [&]() {
            Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
          });
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
            solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
            copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
          });
      }
 
      void run(const impl_scalar_type_2d_view_tpetra &Y,
               const impl_scalar_type_1d_view &Z) {
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SolveTridiags");

        /// set the flat multivector and the norm accumulation vector
        this->Y_scalar_multivector = Y;
        this->Z_scalar_vector = Z;

        const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
        const local_ordinal_type blocksize = D_internal_vector_values.extent(1);

        const local_ordinal_type team_size =
          SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
          recommended_team_size(blocksize, vector_length, internal_vector_length);
        const int per_team_scratch = internal_vector_scratch_type_3d_view
          ::shmem_size(blocksize, num_vectors, vector_loop_size);

#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
        if (num_vectors == 1) {                                         \
          const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
          Kokkos::parallel_for                                          \
            ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
             policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
        } else {                                                        \
          const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >  \
            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
          Kokkos::parallel_for                                          \
            ("SolveTridiags::TeamPolicy::run<MultiVector>",             \
             policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
        } break
#else
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
        if (num_vectors == 1) {                                         \
          Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> >       \
            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
          policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
          Kokkos::parallel_for                                          \
            ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
             policy, *this);                                            \
        } else {                                                        \
          Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >        \
            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
          policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
          Kokkos::parallel_for                                          \
            ("SolveTridiags::TeamPolicy::run<MultiVector>",             \
             policy, *this);                                            \
        } break
#endif
        switch (blocksize) {
        case   3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
        case   5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
        case   7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
        case   9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
        case  10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
        case  11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
        case  16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
        case  17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
        case  18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
        default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
        }
#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS

        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      }
    };
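    // The switch maps common block sizes to a compile-time constant B carried
    // by SingleVectorTag<B>/MultiVectorTag<B>, letting the batched kernels
    // specialize on the block dimension; B = 0 falls back to the runtime
    // blocksize.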
 
    ///
    /// compute the residual vector
    ///
#if defined(KOKKOS_ENABLE_CUDA)
    static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
                                                                     const int team_size) {
      int total_team_size(0);
      if      (blksize <=  5) total_team_size =  32;
      else if (blksize <=  9) total_team_size =  32;
      else if (blksize <= 12) total_team_size =  96;
      else if (blksize <= 16) total_team_size = 128;
      else if (blksize <= 20) total_team_size = 160;
      else                    total_team_size = 160;
      return total_team_size/team_size;
    }
#endif
 
    template<typename MatrixType>
    struct ComputeResidualVector {
    public:
      using impl_type = ImplType<MatrixType>;
      using node_device_type = typename impl_type::node_device_type;
      using execution_space = typename impl_type::execution_space;
      using memory_space = typename impl_type::memory_space;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type = typename impl_type::size_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;
      using magnitude_type = typename impl_type::magnitude_type;
      using btdm_scalar_type = typename impl_type::btdm_scalar_type;
      using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
      /// views
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
      using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
      static constexpr int vector_length = impl_type::vector_length;

      /// team policy member type
      using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

      // the team-level kernels stage up to this many rows of a block
      enum : int { max_blocksize = 32 };

    private:
      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
      Unmanaged<impl_scalar_type_2d_view_tpetra> y;
      Unmanaged<vector_type_3d_view> y_packed;
      Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;

      // AmD information
      const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
      const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
      const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

      // block crs graph information; the graph rowptr value type varies with
      // the node device
      using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,node_device_type>::size_type;
      const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,node_device_type> > A_rowptr;
      const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;

      // blocksize
      const local_ordinal_type blocksize_requested;

      // part interface
      const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
      const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
      const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
      const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
      const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
      const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
      const bool is_dm2cm_active;

    public:
      template<typename LocalCrsGraphType>
      ComputeResidualVector(const AmD<MatrixType> &amd,
                            const LocalCrsGraphType &graph,
                            const local_ordinal_type &blocksize_requested_,
                            const PartInterface<MatrixType> &interf,
                            const local_ordinal_type_1d_view &dm2cm_)
        : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
          colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
          tpetra_values(amd.tpetra_values),
          A_rowptr(graph.row_map),
          A_colind(graph.entries),
          blocksize_requested(blocksize_requested_),
          part2packrowidx0(interf.part2packrowidx0),
          part2rowidx0(interf.part2rowidx0),
          rowidx2part(interf.rowidx2part),
          partptr(interf.partptr),
          lclrow(interf.lclrow),
          dm2cm(dm2cm_),
          is_dm2cm_active(dm2cm_.span() > 0)
      {}
 
      KOKKOS_INLINE_FUNCTION
      void
      SerialGemv(const local_ordinal_type &blocksize,
                 const impl_scalar_type * const KOKKOS_RESTRICT AA,
                 const impl_scalar_type * const KOKKOS_RESTRICT xx,
                 /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
        for (local_ordinal_type k0=0;k0<blocksize;++k0) {
          impl_scalar_type val = 0;
          const local_ordinal_type offset = k0*blocksize;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
          for (local_ordinal_type k1=0;k1<blocksize;++k1)
            val += AA[offset+k1]*xx[k1];
          yy[k0] -= val;
        }
      }

      template<typename bbViewType, typename yyViewType>
      KOKKOS_INLINE_FUNCTION
      void
      VectorCopy(const member_type &member,
                 const local_ordinal_type &blocksize,
                 const bbViewType &bb,
                 const yyViewType &yy) const {
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
            yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
          });
      }

      template<typename AAViewType, typename xxViewType, typename yyViewType>
      KOKKOS_INLINE_FUNCTION
      void
      TeamVectorGemv(const member_type &member,
                     const local_ordinal_type &blocksize,
                     const AAViewType &AA,
                     const xxViewType &xx,
                     const yyViewType &yy) const {
        Kokkos::parallel_for
          (Kokkos::TeamThreadRange(member, blocksize),
           [&](const local_ordinal_type &k0) {
            impl_scalar_type val = 0;
            Kokkos::parallel_for
              (Kokkos::ThreadVectorRange(member, blocksize),
               [&](const local_ordinal_type &k1) {
                val += AA(k0,k1)*xx(k1);
              });
            Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
          });
      }

      template<typename AAViewType, typename xxViewType, typename yyViewType>
      KOKKOS_INLINE_FUNCTION
      void
      VectorGemv(const member_type &member,
                 const local_ordinal_type &blocksize,
                 const AAViewType &AA,
                 const xxViewType &xx,
                 const yyViewType &yy) const {
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, blocksize),
           [&](const local_ordinal_type &k0) {
            impl_scalar_type val(0);
            for (local_ordinal_type k1=0;k1<blocksize;++k1) {
              val += AA(k0,k1)*xx(k1);
            }
            Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
          });
      }
 
      struct SeqTag {};

      inline
      void
      operator() (const SeqTag &, const local_ordinal_type &i) const {
        const local_ordinal_type blocksize = blocksize_requested;
        const local_ordinal_type blocksize_square = blocksize*blocksize;

        // constants
        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
        const local_ordinal_type num_vectors = y.extent(1);
        const local_ordinal_type row = i*blocksize;
        for (local_ordinal_type col=0;col<num_vectors;++col) {
          // y := b
          impl_scalar_type *yy = &y(row, col);
          const impl_scalar_type * const bb = &b(row, col);
          memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);

          // y -= Rx
          const size_type A_k0 = A_rowptr[i];
          for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
            const size_type j = A_k0 + colindsub[k];
            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
            const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
            SerialGemv(blocksize,AA,xx,yy);
          }
        }
      }
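      // The SeqTag operators compute y = b - R x directly into the unpacked
      // multivector, where R = A - D is visited through the rowptr/colindsub
      // subset of the block CRS graph.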
 
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const SeqTag &, const member_type &member) const {
        // constants
        const local_ordinal_type blocksize = blocksize_requested;
        const local_ordinal_type blocksize_square = blocksize*blocksize;

        const local_ordinal_type lr = member.league_rank();
        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
        const local_ordinal_type num_vectors = y.extent(1);

        // subview pattern
        auto bb = Kokkos::subview(b, block_range, 0);
        auto xx = bb;
        auto yy = Kokkos::subview(y, block_range, 0);
        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

        const local_ordinal_type row = lr*blocksize;
        for (local_ordinal_type col=0;col<num_vectors;++col) {
          // y := b
          yy.assign_data(&y(row, col));
          bb.assign_data(&b(row, col));
          if (member.team_rank() == 0)
            VectorCopy(member, blocksize, bb, yy);
          member.team_barrier();

          // y -= Rx
          const size_type A_k0 = A_rowptr[lr];
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
             [&](const local_ordinal_type &k) {
              const size_type j = A_k0 + colindsub[k];
              A_block.assign_data( &tpetra_values(j*blocksize_square) );
              xx.assign_data( &x(A_colind[j]*blocksize, col) );
              VectorGemv(member, blocksize, A_block, xx, yy);
            });
        }
      }
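      // Note: the team variants create each subview once with the right
      // extents and then repoint it with assign_data() per block, avoiding
      // view construction inside the inner loops.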
 
      template<int B> struct AsyncTag {};

      template<int B>
      inline
      void
      operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
        const local_ordinal_type blocksize_square = blocksize*blocksize;

        // constants
        const local_ordinal_type partidx = rowidx2part(rowidx);
        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
        const local_ordinal_type v = partidx % vector_length;

        const local_ordinal_type num_vectors = y_packed.extent(2);
        const local_ordinal_type num_local_rows = lclrow.extent(0);

        // temporary buffer for y flat
        impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};

        const local_ordinal_type lr = lclrow(rowidx);
        const local_ordinal_type row = lr*blocksize;
        for (local_ordinal_type col=0;col<num_vectors;++col) {
          // y := b
          memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);

          // y -= Rx
          const size_type A_k0 = A_rowptr[lr];
          for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
            const size_type j = A_k0 + colindsub[k];
            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
            const local_ordinal_type A_colind_at_j = A_colind[j];
            if (A_colind_at_j < num_local_rows) {
              const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
              const impl_scalar_type * const xx = &x(loc*blocksize, col);
              SerialGemv(blocksize,AA,xx,yy);
            } else {
              const auto loc = A_colind_at_j - num_local_rows;
              const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
              SerialGemv(blocksize,AA,xx_remote,yy);
            }
          }

          // move yy to y_packed
          for (local_ordinal_type k=0;k<blocksize;++k)
            y_packed(pri, k, col)[v] = yy[k];
        }
      }
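      // Column indices below num_local_rows are owned rows (routed through
      // dm2cm when the domain-to-column permutation is active); indices at
      // or above num_local_rows are offsets into the communicated x_remote
      // buffer. Results land in the packed SIMD layout y_packed(pri,k,col)[v].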
 
      template<int B>
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const AsyncTag<B> &, const member_type &member) const {
        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
        const local_ordinal_type blocksize_square = blocksize*blocksize;

        // constants
        const local_ordinal_type rowidx = member.league_rank();
        const local_ordinal_type partidx = rowidx2part(rowidx);
        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
        const local_ordinal_type v = partidx % vector_length;

        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
        const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
        const local_ordinal_type num_local_rows = lclrow.extent(0);

        // subview pattern
        auto bb = Kokkos::subview(b, block_range, 0);
        auto xx = Kokkos::subview(x, block_range, 0);
        auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
        auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

        const local_ordinal_type lr = lclrow(rowidx);
        const local_ordinal_type row = lr*blocksize;
        for (local_ordinal_type col=0;col<num_vectors;++col) {
          // y := b
          bb.assign_data(&b(row, col));
          yy.assign_data(&y_packed_scalar(pri, 0, col, v));
          if (member.team_rank() == 0)
            VectorCopy(member, blocksize, bb, yy);
          member.team_barrier();

          // y -= Rx
          const size_type A_k0 = A_rowptr[lr];
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
             [&](const local_ordinal_type &k) {
              const size_type j = A_k0 + colindsub[k];
              A_block.assign_data( &tpetra_values(j*blocksize_square) );

              const local_ordinal_type A_colind_at_j = A_colind[j];
              if (A_colind_at_j < num_local_rows) {
                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                xx.assign_data( &x(loc*blocksize, col) );
                VectorGemv(member, blocksize, A_block, xx, yy);
              } else {
                const auto loc = A_colind_at_j - num_local_rows;
                xx_remote.assign_data( &x_remote(loc*blocksize, col) );
                VectorGemv(member, blocksize, A_block, xx_remote, yy);
              }
            });
        }
      }
 
      template<int P, int B> struct OverlapTag {};

      template<int P, int B>
      inline
      void
      operator() (const OverlapTag<P,B> &, const local_ordinal_type &rowidx) const {
        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
        const local_ordinal_type blocksize_square = blocksize*blocksize;

        // constants
        const local_ordinal_type partidx = rowidx2part(rowidx);
        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
        const local_ordinal_type v = partidx % vector_length;

        const local_ordinal_type num_vectors = y_packed.extent(2);
        const local_ordinal_type num_local_rows = lclrow.extent(0);

        // temporary buffer for y flat
        impl_scalar_type yy[max_blocksize] = {};

        auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
        auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);

        const local_ordinal_type lr = lclrow(rowidx);
        const local_ordinal_type row = lr*blocksize;
        for (local_ordinal_type col=0;col<num_vectors;++col) {
          if (P == 0) {
            // y := b
            memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
          } else {
            // y (temporary) := 0
            memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
          }

          // y -= Rx
          const size_type A_k0 = A_rowptr[lr];
          for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
            const size_type j = A_k0 + colindsub_used[k];
            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
            const local_ordinal_type A_colind_at_j = A_colind[j];
            if (A_colind_at_j < num_local_rows) {
              const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
              const impl_scalar_type * const xx = &x(loc*blocksize, col);
              SerialGemv(blocksize,AA,xx,yy);
            } else {
              const auto loc = A_colind_at_j - num_local_rows;
              const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
              SerialGemv(blocksize,AA,xx_remote,yy);
            }
          }

          // move yy to y_packed
          if (P == 0) {
            for (local_ordinal_type k=0;k<blocksize;++k)
              y_packed(pri, k, col)[v] = yy[k];
          } else {
            for (local_ordinal_type k=0;k<blocksize;++k)
              y_packed(pri, k, col)[v] += yy[k];
          }
        }
      }
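      // OverlapTag splits the residual into two passes so communication can
      // overlap computation: the P == 0 pass initializes y := b and applies
      // the owned-column part of R, while the P == 1 pass runs after the
      // halo exchange completes and accumulates the remote contributions.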
 
      template<int P, int B>
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const OverlapTag<P,B> &, const member_type &member) const {
        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
        const local_ordinal_type blocksize_square = blocksize*blocksize;

        // constants
        const local_ordinal_type rowidx = member.league_rank();
        const local_ordinal_type partidx = rowidx2part(rowidx);
        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
        const local_ordinal_type v = partidx % vector_length;

        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
        const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
        const local_ordinal_type num_local_rows = lclrow.extent(0);

        // subview pattern
        auto bb = Kokkos::subview(b, block_range, 0);
        auto xx = bb;
        auto xx_remote = bb;
        auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
        auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
        auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);

        const local_ordinal_type lr = lclrow(rowidx);
        const local_ordinal_type row = lr*blocksize;
        for (local_ordinal_type col=0;col<num_vectors;++col) {
          yy.assign_data(&y_packed_scalar(pri, 0, col, v));
          if (P == 0) {
            // y := b
            bb.assign_data(&b(row, col));
            if (member.team_rank() == 0)
              VectorCopy(member, blocksize, bb, yy);
            member.team_barrier();
          }

          // y -= Rx
          const size_type A_k0 = A_rowptr[lr];
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
             [&](const local_ordinal_type &k) {
              const size_type j = A_k0 + colindsub_used[k];
              A_block.assign_data( &tpetra_values(j*blocksize_square) );

              const local_ordinal_type A_colind_at_j = A_colind[j];
              if (A_colind_at_j < num_local_rows) {
                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                xx.assign_data( &x(loc*blocksize, col) );
                VectorGemv(member, blocksize, A_block, xx, yy);
              } else {
                const auto loc = A_colind_at_j - num_local_rows;
                xx_remote.assign_data( &x_remote(loc*blocksize, col) );
                VectorGemv(member, blocksize, A_block, xx_remote, yy);
              }
            });
        }
      }
 
      // y = b - Rx; seq method
      template<typename MultiVectorLocalViewTypeY,
               typename MultiVectorLocalViewTypeB,
               typename MultiVectorLocalViewTypeX>
      void run(const MultiVectorLocalViewTypeY &y_,
               const MultiVectorLocalViewTypeB &b_,
               const MultiVectorLocalViewTypeX &x_) {
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");

        y = y_; b = b_; x = x_;
        if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
          const local_ordinal_type blocksize = blocksize_requested;
          const local_ordinal_type team_size = 8;
          const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
          const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
          Kokkos::parallel_for
            ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
#endif
        } else {
#if defined(__CUDA_ARCH__)
          TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "this code section should not be executed");
#else
          const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
          Kokkos::parallel_for
            ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
#endif
        }
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      }
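      // Usage sketch (see applyInverseJacobi below): the seq path imports
      // Y into Z and then calls
      //   compute_residual_vector.run(YY, XX, ZZ);
      // i.e. y = x - R z with z the freshly imported copy of y.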
 
      // y = b - Rx; async method
      template<typename MultiVectorLocalViewTypeB,
               typename MultiVectorLocalViewTypeX,
               typename MultiVectorLocalViewTypeX_Remote>
      void run(const vector_type_3d_view &y_packed_,
               const MultiVectorLocalViewTypeB &b_,
               const MultiVectorLocalViewTypeX &x_,
               const MultiVectorLocalViewTypeX_Remote &x_remote_) {
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");

        b = b_; x = x_; x_remote = x_remote_;
        if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
          y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
                                                     y_packed_.extent(0),
                                                     y_packed_.extent(1),
                                                     y_packed_.extent(2),
                                                     vector_length);
#endif
        } else {
          y_packed = y_packed_;
        }

        if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
          const local_ordinal_type blocksize = blocksize_requested;
          const local_ordinal_type team_size = 8;
          const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
            const Kokkos::TeamPolicy<execution_space,AsyncTag<B> >      \
              policy(rowidx2part.extent(0), team_size, vector_size);    \
            Kokkos::parallel_for                                        \
              ("ComputeResidual::TeamPolicy::run<AsyncTag>",            \
               policy, *this); } break
          switch (blocksize_requested) {
          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
          }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
        } else {
#if defined(__CUDA_ARCH__)
          TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "this code section should not be executed");
#else
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
            const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
            Kokkos::parallel_for                                        \
              ("ComputeResidual::RangePolicy::run<AsyncTag>",           \
               policy, *this); } break
          switch (blocksize_requested) {
          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
          }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
        }
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      }
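      // The switch instantiates the kernels for a set of common block sizes
      // so the inner loops see a compile-time trip count; the default case
      // (B = 0) falls back to reading the block size at run time.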
 
      // y = b - Rx; overlap method
      template<typename MultiVectorLocalViewTypeB,
               typename MultiVectorLocalViewTypeX,
               typename MultiVectorLocalViewTypeX_Remote>
      void run(const vector_type_3d_view &y_packed_,
               const MultiVectorLocalViewTypeB &b_,
               const MultiVectorLocalViewTypeX &x_,
               const MultiVectorLocalViewTypeX_Remote &x_remote_,
               const bool compute_owned) {
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");

        b = b_; x = x_; x_remote = x_remote_;
        if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
          y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
                                                     y_packed_.extent(0),
                                                     y_packed_.extent(1),
                                                     y_packed_.extent(2),
                                                     vector_length);
#endif
        } else {
          y_packed = y_packed_;
        }

        if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
          const local_ordinal_type blocksize = blocksize_requested;
          const local_ordinal_type team_size = 8;
          const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
          if (compute_owned) {                                          \
            const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> >  \
              policy(rowidx2part.extent(0), team_size, vector_size);    \
            Kokkos::parallel_for                                        \
              ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
          } else {                                                      \
            const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> >  \
              policy(rowidx2part.extent(0), team_size, vector_size);    \
            Kokkos::parallel_for                                        \
              ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
          } break
          switch (blocksize_requested) {
          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
          }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
        } else {
#if defined(__CUDA_ARCH__)
          TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "this code section should not be executed");
#else
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
          if (compute_owned) {                                          \
            const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
              policy(0, rowidx2part.extent(0));                         \
            Kokkos::parallel_for                                        \
              ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
          } else {                                                      \
            const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
              policy(0, rowidx2part.extent(0));                         \
            Kokkos::parallel_for                                        \
              ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
          } break
          switch (blocksize_requested) {
          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
          }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
        }
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      }
    };
 
    template<typename MatrixType>
    void reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
                      /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ReduceVector");

      using impl_type = ImplType<MatrixType>;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;
#if 0
      const auto norm2 = KokkosBlas::nrm1(zz);
#else
      impl_scalar_type norm2(0);
      Kokkos::parallel_reduce
        ("ReduceMultiVector::Device",
         Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
         KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
          update += zz(i);
        }, norm2);
#endif
      vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);

      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
    }
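    // reduceVector leaves the rank-local sum in vals[0]; applyInverseJacobi
    // passes norm_manager.getBuffer() here so that NormManager::ireduce can
    // later combine the per-rank squared norms across the communicator.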
 
    ///
    /// Manage the distributed part of the computation of residual norms.
    ///
    template<typename MatrixType>
    struct NormManager {
    public:
      using impl_type = ImplType<MatrixType>;
      using magnitude_type = typename impl_type::magnitude_type;

    private:
      bool collective_;
      int sweep_step_, sweep_step_upper_bound_;
#ifdef HAVE_IFPACK2_MPI
      MPI_Request mpi_request_;
      MPI_Comm comm_;
#endif
      magnitude_type work_[3];

    public:
      NormManager() = default;
      NormManager(const NormManager &b) = default;
      NormManager(const Teuchos::RCP<const Teuchos::Comm<int> > &comm) {
        sweep_step_ = 1;
        sweep_step_upper_bound_ = 1;
        collective_ = comm->getSize() > 1;
        if (collective_) {
#ifdef HAVE_IFPACK2_MPI
          const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
          TEUCHOS_ASSERT( ! mpi_comm.is_null());
          comm_ = *mpi_comm->getRawMpiComm();
#endif
        }
        const magnitude_type zero(0), minus_one(-1);
        work_[0] = zero;
        work_[1] = zero;
        work_[2] = minus_one;
      }

      // Check the norm every sweep_step sweeps.
      void setCheckFrequency(const int sweep_step) {
        TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
        sweep_step_upper_bound_ = sweep_step;
        sweep_step_ = 1;
      }

      // Get the buffer into which to store rank-local squared norms.
      magnitude_type* getBuffer() { return &work_[0]; }

      // Call MPI_Iallreduce to find the global squared norms.
      void ireduce(const int sweep, const bool force = false) {
        if ( ! force && sweep % sweep_step_) return;

        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");

        work_[1] = work_[0];
        auto send_data = &work_[1];
        auto recv_data = &work_[0];
        if (collective_) {
#ifdef HAVE_IFPACK2_MPI
# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
          MPI_Iallreduce(send_data, recv_data, 1,
                         Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
                         MPI_SUM, comm_, &mpi_request_);
# else
          MPI_Allreduce (send_data, recv_data, 1,
                         Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
                         MPI_SUM, comm_);
# endif
#endif
        }
      }

      // Check if the norm-based termination criterion is met. tol2 is the
      // tolerance squared. Sweep is the sweep index. If not every iteration is
      // being checked, this function immediately returns false. If a check must
      // be done at this iteration, it waits for the reduction triggered by
      // ireduce to complete, then checks the global norm against the tolerance.
      bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
        // early return
        if (sweep <= 0) return false;

        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");

        TEUCHOS_ASSERT(sweep >= 1);
        if ( ! force && (sweep - 1) % sweep_step_) return false;
        if (collective_) {
#ifdef HAVE_IFPACK2_MPI
# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
          MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
# endif
#endif
        }
        bool r_val = false;
        if (sweep == 1) {
          work_[2] = work_[0];
        } else {
          r_val = (work_[0] < tol2*work_[2]);
        }

        // adjust sweep step
        const auto adjusted_sweep_step = 2*sweep_step_;
        if (adjusted_sweep_step < sweep_step_upper_bound_) {
          sweep_step_ = adjusted_sweep_step;
        } else {
          sweep_step_ = sweep_step_upper_bound_;
        }
        return r_val;
      }

      // After termination has occurred, finalize the norms for use in
      // getNorms{0,Final}.
      void finalize () {
        work_[0] = std::sqrt(work_[0]); // after converged
        if (work_[2] >= 0)
          work_[2] = std::sqrt(work_[2]); // first norm
        // if work_[2] is minus one, then the norm is not requested.
      }

      // Report norms to the caller.
      const magnitude_type getNorms0 () const { return work_[2]; }
      const magnitude_type getNormsFinal () const { return work_[0]; }
    };
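    // Typical NormManager protocol, as exercised by applyInverseJacobi below:
    //
    //   reduceVector<MatrixType>(W, norm_manager.getBuffer()); // local norm
    //   norm_manager.ireduce(sweep);               // start the allreduce
    //   if (norm_manager.checkDone(sweep + 1, tol*tol)) { /* converged */ }
    //   norm_manager.finalize();                   // sqrt for reporting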
    template<typename MatrixType>
    int
    applyInverseJacobi(// importer
                       const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                       const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
                       const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                       const bool overlap_communication_and_computation,
                       // tpetra interface
                       const typename ImplType<MatrixType>::tpetra_multivector_type &X,  // tpetra interface
                       /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y,  // tpetra interface
                       /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z,  // temporary tpetra interface (seq_method)
                       /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
                       // local object interface
                       const PartInterface<MatrixType> &interf, // mesh interface
                       const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
                       const AmD<MatrixType> &amd, // R = A - D
                       /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
                       /* */ NormManager<MatrixType> &norm_manager,
                       // preconditioner parameters
                       const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
                       /* */ bool is_y_zero,
                       const int max_num_sweeps,
                       const typename ImplType<MatrixType>::magnitude_type tol,
                       const int check_tol_every) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");
      using impl_type = ImplType<MatrixType>;
      using node_memory_space = typename impl_type::node_memory_space;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type = typename impl_type::size_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;
      using magnitude_type = typename impl_type::magnitude_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using vector_type_1d_view = typename impl_type::vector_type_1d_view;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;

      // either the tpetra importer or the async importer must be active, not both
      TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
                                  "Neither Tpetra importer nor Async importer is null.");
      // the max number of sweeps must be positive
      TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
                                  "Maximum number of sweeps must be >= 1.");

      // const parameters
      const bool is_seq_method_requested = !tpetra_importer.is_null();
      const bool is_async_importer_active = !async_importer.is_null();
      const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
      const magnitude_type tolerance = tol*tol;
      const local_ordinal_type blocksize = btdm.values.extent(1);
      const local_ordinal_type num_vectors = Y.getNumVectors();
      const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;

      const impl_scalar_type zero(0.0);

      TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
                                  "The seq method for applyInverseJacobi, " <<
                                  "which in any case is for developer use only, " <<
                                  "does not support norm-based termination.");
      // if more workspace is needed, resize it
      const size_type work_span_required = num_blockrows*num_vectors*blocksize;
      if (work.span() < work_span_required)
        work = vector_type_1d_view("vector workspace 1d view", work_span_required);

      // construct W
      const local_ordinal_type W_size = interf.packptr.extent(0)-1;
      if (local_ordinal_type(W.extent(0)) < W_size)
        W = impl_scalar_type_1d_view("W", W_size);

      typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
      {
        if (is_seq_method_requested) {
          if (Z.getNumVectors() != Y.getNumVectors())
            Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
        } else {
          if (is_async_importer_active) {
            // create comm data buffer and keep it here
            async_importer->createDataBuffer(num_vectors);
            remote_multivector = async_importer->getRemoteMultiVectorLocalView();
          }
        }
      }

      // wrap the workspace with a 3d view
      vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
      const auto XX = X.template getLocalView<node_memory_space>();
      const auto YY = Y.template getLocalView<node_memory_space>();
      const auto ZZ = Z.template getLocalView<node_memory_space>();
      if (is_y_zero) Kokkos::deep_copy(YY, zero);

      MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
      SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
                                               damping_factor, is_norm_manager_active);

      const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
      ComputeResidualVector<MatrixType>
        compute_residual_vector(amd, A->getCrsGraph().getLocalGraph(), blocksize, interf,
                                is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);

      // norm manager workspace resize
      if (is_norm_manager_active)
        norm_manager.setCheckFrequency(check_tol_every);
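      // Each sweep computes the packed residual pmv := x - R y, solves the
      // packed block tridiagonal systems pmv := inv(D) pmv, and applies the
      // damped update y := (1 - a) y + a pmv with a = damping_factor.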
      // iterate
      int sweep = 0;
      for (;sweep<max_num_sweeps;++sweep) {
        {
          if (is_y_zero) {
            // pmv := x(lclrow)
            multivector_converter.run(XX);
          } else {
            if (is_seq_method_requested) {
              // SEQ METHOD IS TESTING ONLY

              // y := x - R y
              Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
              compute_residual_vector.run(YY, XX, ZZ);

              // pmv := y(lclrow).
              multivector_converter.run(YY);
            } else {
              // fused y := x - R y and pmv := y(lclrow);
              // the real use case does not overlap comp and comm
              if (overlap_communication_and_computation || !is_async_importer_active) {
                if (is_async_importer_active) async_importer->asyncSendRecv(YY);
                compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
                if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
                  if (is_async_importer_active) async_importer->cancel();
                  break;
                }
                if (is_async_importer_active) {
                  async_importer->syncRecv();
                  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
                }
              } else {
                if (is_async_importer_active)
                  async_importer->syncExchange(YY);
                if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
                compute_residual_vector.run(pmv, XX, YY, remote_multivector);
              }
            }
          }
        }

        // pmv := inv(D) pmv.
        {
          solve_tridiags.run(YY, W);
        }

        {
          if (is_norm_manager_active) {
            // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
            reduceVector<MatrixType>(W, norm_manager.getBuffer());
            if (sweep + 1 == max_num_sweeps) {
              norm_manager.ireduce(sweep, true);
              norm_manager.checkDone(sweep + 1, tolerance, true);
            } else {
              norm_manager.ireduce(sweep);
            }
          }
        }
        is_y_zero = false;
      }

      // sqrt the norms for the caller's use.
      if (is_norm_manager_active) norm_manager.finalize();

      return sweep;
    }
    template<typename MatrixType>
    struct ImplObject {
      using impl_type = ImplType<MatrixType>;
      using part_interface_type = PartInterface<MatrixType>;
      using block_tridiags_type = BlockTridiags<MatrixType>;
      using amd_type = AmD<MatrixType>;
      using norm_manager_type = NormManager<MatrixType>;
      using async_import_type = AsyncableImport<MatrixType>;

      // distributed objects
      Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
      Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
      Teuchos::RCP<async_import_type> async_importer;
      bool overlap_communication_and_computation;

      // copy of Y (mutable to penetrate const)
      mutable typename impl_type::tpetra_multivector_type Z;
      mutable typename impl_type::impl_scalar_type_1d_view W;

      // local objects
      part_interface_type part_interface;
      block_tridiags_type block_tridiags; // D
      amd_type a_minus_d; // R = A - D
      mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
      mutable norm_manager_type norm_manager;
    };

  } // namespace BlockTriDiContainerDetails

} // namespace Ifpack2

#endif