Ifpack2 Templated Preconditioning Package Version 1.0
Ifpack2_BlockTriDiContainer_impl.hpp
1 /*@HEADER
2 // ***********************************************************************
3 //
4 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 //@HEADER
41 */
42 
43 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
44 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
45 
47 
48 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
49 #include <Tpetra_Distributor.hpp>
50 #include <Tpetra_BlockMultiVector.hpp>
51 
52 #include <Kokkos_ArithTraits.hpp>
53 #include <KokkosBatched_Util.hpp>
54 #include <KokkosBatched_Vector.hpp>
55 #include <KokkosBatched_Copy_Decl.hpp>
56 #include <KokkosBatched_Copy_Impl.hpp>
57 #include <KokkosBatched_AddRadial_Decl.hpp>
58 #include <KokkosBatched_AddRadial_Impl.hpp>
59 #include <KokkosBatched_SetIdentity_Decl.hpp>
60 #include <KokkosBatched_SetIdentity_Impl.hpp>
61 #include <KokkosBatched_Gemm_Decl.hpp>
62 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
63 #include <KokkosBatched_Gemm_Team_Impl.hpp>
64 #include <KokkosBatched_Gemv_Decl.hpp>
65 #include <KokkosBatched_Gemv_Serial_Impl.hpp>
66 #include <KokkosBatched_Gemv_Team_Impl.hpp>
67 #include <KokkosBatched_Trsm_Decl.hpp>
68 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
69 #include <KokkosBatched_Trsm_Team_Impl.hpp>
70 #include <KokkosBatched_Trsv_Decl.hpp>
71 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
72 #include <KokkosBatched_Trsv_Team_Impl.hpp>
73 #include <KokkosBatched_LU_Decl.hpp>
74 #include <KokkosBatched_LU_Serial_Impl.hpp>
75 #include <KokkosBatched_LU_Team_Impl.hpp>
76 
77 #include <KokkosBlas1_nrm1.hpp>
78 #include <KokkosBlas1_nrm2.hpp>
79 
80 #include <memory>
81 
82 // need to expose this as a CMake variable (or enable this flag only when it is necessary)
83 //#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
84 //#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
85 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
86 #include "cuda_profiler_api.h"
87 #endif
88 
89 // I am not 100% sure about MPI 3 on CUDA
90 #if MPI_VERSION >= 3
91 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
92 #endif
93 
94 // this requires kokkos develop branch (do not use this yet)
95 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM
96 
97 // ::: Experiments :::
98 // define either pinned memory or cuda memory for mpi
99 // if both macros are disabled, the tpetra memory space is used, which is uvm space for cuda
100 // if defined, pinned memory is used instead of a device pointer
101 // by default, we enable pinned memory
102 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
103 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
104 
105 // if defined, all views are allocated on cuda space instead of cuda uvm space
106 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
107 
108 // if defined, btdm_scalar_type is used (if impl_scalar_type is double, btdm_scalar_type is float)
109 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
110 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
111 #endif
112 
113 namespace Ifpack2 {
114 
115  namespace BlockTriDiContainerDetails {
116 
118 #if defined(__KOKKOSBATCHED_PROMOTION__)
119 namespace KB = KokkosBatched;
120 #else
121 namespace KB = KokkosBatched::Experimental;
122 #endif
123 
127  using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
128 
129  template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
130  using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
131  MemoryTraitsType::is_random_access |
132  flag>;
133 
134  template <typename ViewType>
135  using Unmanaged = Kokkos::View<typename ViewType::data_type,
136  typename ViewType::array_layout,
137  typename ViewType::device_type,
138  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
139  template <typename ViewType>
140  using Atomic = Kokkos::View<typename ViewType::data_type,
141  typename ViewType::array_layout,
142  typename ViewType::device_type,
143  MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
144  template <typename ViewType>
145  using Const = Kokkos::View<typename ViewType::const_data_type,
146  typename ViewType::array_layout,
147  typename ViewType::device_type,
148  typename ViewType::memory_traits>;
149  template <typename ViewType>
150  using ConstUnmanaged = Const<Unmanaged<ViewType> >;
151 
152  template <typename ViewType>
153  using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
154 
161 
162  template <typename ViewType>
163  using Scratch = Kokkos::View<typename ViewType::data_type,
164  typename ViewType::array_layout,
165  typename ViewType::execution_space::scratch_memory_space,
166  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
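  // Illustrative sketch (not part of this header; "owned" and "n" are hypothetical):
  // the Unmanaged/ConstUnmanaged aliases wrap an existing allocation in a
  // non-owning, random-access view, which is how the functors in this file
  // capture views by value.
  //
  //   Kokkos::View<double*> owned("owned", n);                                  // owning view
  //   Unmanaged<Kokkos::View<double*> > alias(owned.data(), owned.extent(0));   // non-owning alias
  //   ConstUnmanaged<Kokkos::View<double*> > ro(owned.data(), owned.extent(0)); // read-only alias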
167 
171  template<typename T> struct BlockTridiagScalarType { typedef T type; };
172 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
173  template<> struct BlockTridiagScalarType<double> { typedef float type; };
174  //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
175 #endif
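  // Illustrative note (assumes the small-scalar macro above is defined): the trait
  // maps the factorization scalar from double down to float while the surrounding
  // interfaces keep using impl_scalar_type, e.g.
  //
  //   static_assert(std::is_same<BlockTridiagScalarType<double>::type,float>::value,
  //                 "small-scalar path stores the block tridiagonals in float");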
176 
180  template<typename T> struct is_cuda { enum : bool { value = false }; };
181 #if defined(KOKKOS_ENABLE_CUDA)
182  template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
183 #endif
184 
188  template<typename CommPtrType>
189  std::string get_msg_prefix (const CommPtrType &comm) {
190  const auto rank = comm->getRank();
191  const auto nranks = comm->getSize();
192  std::stringstream ss;
193  ss << "Rank " << rank << " of " << nranks << ": ";
194  return ss.str();
195  }
196 
200  template<typename T, int N>
201  struct ArrayValueType {
202  T v[N];
203  KOKKOS_INLINE_FUNCTION
204  ArrayValueType() {
205  for (int i=0;i<N;++i)
206  this->v[i] = 0;
207  }
208  KOKKOS_INLINE_FUNCTION
209  ArrayValueType(const ArrayValueType &b) {
210  for (int i=0;i<N;++i)
211  this->v[i] = b.v[i];
212  }
213  };
214  template<typename T, int N>
215  static
216  KOKKOS_INLINE_FUNCTION
217  void
218  operator+=(volatile ArrayValueType<T,N> &a,
219  volatile const ArrayValueType<T,N> &b) {
220  for (int i=0;i<N;++i)
221  a.v[i] += b.v[i];
222  }
223  template<typename T, int N>
224  static
225  KOKKOS_INLINE_FUNCTION
226  void
227  operator+=(ArrayValueType<T,N> &a,
228  const ArrayValueType<T,N> &b) {
229  for (int i=0;i<N;++i)
230  a.v[i] += b.v[i];
231  }
232 
236  template<typename T, int N, typename ExecSpace>
237  struct SumReducer {
238  typedef SumReducer reducer;
239  typedef ArrayValueType<T,N> value_type;
240  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
241  value_type *value;
242 
243  KOKKOS_INLINE_FUNCTION
244  SumReducer(value_type &val) : value(&val) {}
245 
246  KOKKOS_INLINE_FUNCTION
247  void join(value_type &dst, value_type &src) const {
248  for (int i=0;i<N;++i)
249  dst.v[i] += src.v[i];
250  }
251  KOKKOS_INLINE_FUNCTION
252  void join(volatile value_type &dst, const volatile value_type &src) const {
253  for (int i=0;i<N;++i)
254  dst.v[i] += src.v[i];
255  }
256  KOKKOS_INLINE_FUNCTION
257  void init(value_type &val) const {
258  for (int i=0;i<N;++i)
259  val.v[i] = Kokkos::reduction_identity<T>::sum();
260  }
261  KOKKOS_INLINE_FUNCTION
262  value_type& reference() {
263  return *value;
264  }
265  KOKKOS_INLINE_FUNCTION
266  result_view_type view() const {
267  return result_view_type(value);
268  }
269  };
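  // Illustrative sketch (hypothetical names x, nrows, exec_space; not part of this
  // header): ArrayValueType and SumReducer form a custom Kokkos reducer that
  // accumulates N partial sums in a single parallel_reduce, e.g.
  //
  //   using vals_t = ArrayValueType<double,2>;
  //   vals_t sums;
  //   Kokkos::parallel_reduce
  //     (Kokkos::RangePolicy<exec_space>(0, nrows),
  //      KOKKOS_LAMBDA(const int i, vals_t &update) {
  //        update.v[0] += x(i,0);
  //        update.v[1] += x(i,1);
  //      }, SumReducer<double,2,exec_space>(sums));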
270 
271 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
272 #define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
273 #else
274 #define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
275 #endif
276 
277 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
278 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
279  CUDA_SAFE_CALL(cudaProfilerStart());
280 
281 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
282  { CUDA_SAFE_CALL( cudaProfilerStop() ); }
283 #else
284 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
286 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
287 #endif
288 
292  template <typename MatrixType>
293  struct ImplType {
297  typedef size_t size_type;
298  typedef typename MatrixType::scalar_type scalar_type;
299  typedef typename MatrixType::local_ordinal_type local_ordinal_type;
300  typedef typename MatrixType::global_ordinal_type global_ordinal_type;
301  typedef typename MatrixType::node_type node_type;
302 
306  typedef typename Kokkos::Details::ArithTraits<scalar_type>::val_type impl_scalar_type;
307  typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
308 
309  typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
310  typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;
311 
315  typedef Kokkos::DefaultHostExecutionSpace host_execution_space;
316 
320  typedef typename node_type::device_type node_device_type;
321  typedef typename node_device_type::execution_space node_execution_space;
322  typedef typename node_device_type::memory_space node_memory_space;
323 
324 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE)
325  typedef node_execution_space execution_space;
327  typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
328  Kokkos::CudaSpace,
329  node_memory_space>::type memory_space;
330  typedef Kokkos::Device<execution_space,memory_space> device_type;
331 #else
332  typedef node_device_type device_type;
333  typedef node_execution_space execution_space;
334  typedef node_memory_space memory_space;
335 #endif
336 
337  typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
338  typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
339  typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
340  typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
341  typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
342  typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
343  typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
344  typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_type local_crs_graph_type;
345 
349  template<typename T, int l> using Vector = KB::Vector<T,l>;
350  template<typename T> using SIMD = KB::SIMD<T>;
351  template<typename T, typename M> using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
352  template<typename T, typename M> using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
353 
354  static constexpr int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
355  static constexpr int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
356  typedef Vector<SIMD<btdm_scalar_type>,vector_length> vector_type;
357  typedef Vector<SIMD<btdm_scalar_type>,internal_vector_length> internal_vector_type;
358 
362  typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
363  typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
364  // tpetra block crs values
365  typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
366  typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
367 
368  // tpetra multivector values (layout left): may need to change the typename more explicitly
369  typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
370  typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
371 
372  // packed data always use layout right
373  typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
374  typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
375  typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
376  typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
377  typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
378  typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
379  };
380 
384  template<typename MatrixType>
385  Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
386  createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
387  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
388  using impl_type = ImplType<MatrixType>;
389  using tpetra_map_type = typename impl_type::tpetra_map_type;
390  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
391  using tpetra_import_type = typename impl_type::tpetra_import_type;
392 
393  const auto g = A->getCrsGraph(); // tpetra crs graph object
394  const auto blocksize = A->getBlockSize();
395  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
396  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
397 
398  return Teuchos::rcp(new tpetra_import_type(src, tgt));
399  }
400 
401  // Partial replacement for forward-mode MultiVector::doImport.
402  // Permits overlapped communication and computation, but also supports a synchronized mode.
403  // I'm finding that overlapped comm/comp can give quite poor performance on some
404  // platforms, so we can't simply use it everywhere.
405 
406  template<typename MatrixType>
407  struct AsyncableImport {
408  public:
409  using impl_type = ImplType<MatrixType>;
410 
411  private:
415 #if !defined(HAVE_IFPACK2_MPI)
416  typedef int MPI_Request;
417  typedef int MPI_Comm;
418 #endif
419  using scalar_type = typename impl_type::scalar_type;
422 
423  static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
424 #ifdef HAVE_IFPACK2_MPI
425  MPI_Request ureq;
426  int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
427  if (ireq == NULL) MPI_Request_free(&ureq);
428  return ret;
429 #else
430  return 0;
431 #endif
432  }
433 
434  static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
435 #ifdef HAVE_IFPACK2_MPI
436  MPI_Request ureq;
437  int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
438  if (ireq == NULL) MPI_Request_free(&ureq);
439  return ret;
440 #else
441  return 0;
442 #endif
443  }
444 
445 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
446  struct CallbackDataArgsForSendRecv {
447  MPI_Comm comm;
448  char *buf;
449  int count;
450  int pid;
451  int tag;
452  MPI_Request *ireq;
453 
454  int r_val;
455  };
456 
457  static void CUDART_CB callback_isend(cudaStream_t this_stream, cudaError_t status, void *data) {
458  CallbackDataArgsForSendRecv *in = reinterpret_cast<CallbackDataArgsForSendRecv*>(data);
459  in->r_val = isend(in->comm, in->buf, in->count, in->pid, in->tag, in->ireq);
460  }
461 
462  static void CUDART_CB callback_irecv(cudaStream_t this_stream, cudaError_t status, void *data) {
463  CallbackDataArgsForSendRecv *in = reinterpret_cast<CallbackDataArgsForSendRecv*>(data);
464  in->r_val = irecv(in->comm, in->buf, in->count, in->pid, in->tag, in->ireq);
465  }
466 
467  static void CUDART_CB callback_wait(cudaStream_t this_stream, cudaError_t status, void *data) {
468 #ifdef HAVE_IFPACK2_MPI
469  MPI_Request *req = reinterpret_cast<MPI_Request*>(data);
470  MPI_Wait(req, MPI_STATUS_IGNORE);
471 #endif // HAVE_IFPACK2_MPI
472  }
473 #endif // defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
474 
475  static int waitany(int count, MPI_Request* reqs, int* index) {
476 #ifdef HAVE_IFPACK2_MPI
477  return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
478 #else
479  return 0;
480 #endif
481  }
482 
483  static int waitall(int count, MPI_Request* reqs) {
484 #ifdef HAVE_IFPACK2_MPI
485  return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
486 #else
487  return 0;
488 #endif
489  }
490 
491  public:
492  using tpetra_map_type = typename impl_type::tpetra_map_type;
493  using tpetra_import_type = typename impl_type::tpetra_import_type;
494 
495  using local_ordinal_type = typename impl_type::local_ordinal_type;
496  using global_ordinal_type = typename impl_type::global_ordinal_type;
497  using size_type = typename impl_type::size_type;
498  using impl_scalar_type = typename impl_type::impl_scalar_type;
499 
500  using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
501  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
502 
503  using execution_space = typename impl_type::execution_space;
504  using memory_space = typename impl_type::memory_space;
505  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
506  using size_type_1d_view = typename impl_type::size_type_1d_view;
507  using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
508 
509 #if defined(KOKKOS_ENABLE_CUDA)
510  using impl_scalar_type_1d_view =
511  typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
512 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
513  Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
514 # elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
515  Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
516 # else // no experimental macros are defined
517  typename impl_type::impl_scalar_type_1d_view,
518 # endif
519  typename impl_type::impl_scalar_type_1d_view>::type;
520 #else
521  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
522 #endif
523  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
524  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
525 
526 #ifdef HAVE_IFPACK2_MPI
527  MPI_Comm comm;
528 #endif
529 
530  impl_scalar_type_2d_view_tpetra remote_multivector;
531  local_ordinal_type blocksize;
532 
533  template<typename T>
534  struct SendRecvPair {
535  T send, recv;
536  };
537 
538  // (s)end and (r)eceive data:
539  SendRecvPair<int_1d_view_host> pids; // mpi ranks
540  SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is a pointer type, so a kokkos view cannot be used
541  SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
542  SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
543  SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
544  SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
545 
546  local_ordinal_type_1d_view dm2cm; // permutation
547 
548 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
549  using cuda_stream_1d_std_vector = std::vector<cudaStream_t>;
550  SendRecvPair<cuda_stream_1d_std_vector> stream;
551 
552  using callback_data_send_recv_1d_std_vector = std::vector<CallbackDataArgsForSendRecv>;
553  SendRecvPair<callback_data_send_recv_1d_std_vector> callback_data;
554 #endif
555 
556  // for cuda
557  public:
558  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
559  const size_type_1d_view &offs) {
560  // wrap lens to kokkos view and deep copy to device
561  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
562  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
563 
564  // exclusive scan
565  const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
566  const local_ordinal_type lens_size = lens_device.extent(0);
567  Kokkos::parallel_scan
568  ("AsyncableImport::RangePolicy::setOffsetValues",
569  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
570  if (final)
571  offs(i) = update;
572  update += (i < lens_size ? lens_device[i] : 0);
573  });
574  }
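  // Illustrative example: the scan above is an exclusive prefix sum of the
  // per-neighbor message lengths, so lens = {3, 1, 2} with offs.extent(0) == 4
  // yields offs = {0, 3, 4, 6}. A host-side equivalent (sketch only) would be
  //
  //   offs[0] = 0;
  //   for (size_t i = 1; i < offs.size(); ++i) offs[i] = offs[i-1] + lens[i-1];
  //
  // offs(i) is then the start of neighbor i's local-id sublist and, scaled by
  // blocksize*num_vectors, its offset into the contiguous send/recv buffer.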
575 
576  private:
577  void createMpiRequests(const tpetra_import_type &import) {
578  Tpetra::Distributor &distributor = import.getDistributor();
579 
580  // copy pids from distributor
581  const auto pids_from = distributor.getProcsFrom();
582  pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
583  memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
584 
585  const auto pids_to = distributor.getProcsTo();
586  pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
587  memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
588 
589  // mpi requests
590  reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
591  reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
592 
593  // construct offsets
594  const auto lengths_to = distributor.getLengthsTo();
595  offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
596 
597  const auto lengths_from = distributor.getLengthsFrom();
598  offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
599 
600  setOffsetValues(lengths_to, offset.send);
601  offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
602 
603  setOffsetValues(lengths_from, offset.recv);
604  offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
605  }
606 
607  void createSendRecvIDs(const tpetra_import_type &import) {
608  // For each remote PID, the list of LIDs to receive.
609  const auto remote_lids = import.getRemoteLIDs();
610  const local_ordinal_type_1d_view_host
611  remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
612  lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
613  Kokkos::deep_copy(lids.recv, remote_lids_view_host);
614 
615  // For each export PID, the list of LIDs to send.
616  auto epids = import.getExportPIDs();
617  auto elids = import.getExportLIDs();
618  TEUCHOS_ASSERT(epids.size() == elids.size());
619  lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
620  auto lids_send_host = Kokkos::create_mirror_view(lids.send);
621 
622  // naive search (not sure if pids or epids are sorted)
623  for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
624  const auto pid_send_value = pids.send[i];
625  for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
626  if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
627 #if !defined(__CUDA_ARCH__)
628  TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
629 #endif
630  }
631  Kokkos::deep_copy(lids.send, lids_send_host);
632  }
633 
634  public:
635  // for cuda, all tag types are public
636  struct ToBuffer {};
637  struct ToMultiVector {};
638 
639  AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
640  const Teuchos::RCP<const tpetra_map_type>& tgt_map,
641  const local_ordinal_type blocksize_,
642  const local_ordinal_type_1d_view dm2cm_) {
643  blocksize = blocksize_;
644  dm2cm = dm2cm_;
645 
646 #ifdef HAVE_IFPACK2_MPI
647  comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
648 #endif
649  const tpetra_import_type import(src_map, tgt_map);
650 
651  createMpiRequests(import);
652  createSendRecvIDs(import);
653 
654 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
655  {
656  const local_ordinal_type num_streams = pids.send.extent(0);
657  stream.send.resize(num_streams);
658  callback_data.send.resize(num_streams);
659  for (local_ordinal_type i=0;i<num_streams;++i)
660  CUDA_SAFE_CALL(cudaStreamCreate(&stream.send[i])); // nonblocking flag is not really necessary here
661  }
662  {
663  const local_ordinal_type num_streams = pids.recv.extent(0);
664  stream.recv.resize(num_streams);
665  callback_data.recv.resize(num_streams);
666  for (local_ordinal_type i=0;i<num_streams;++i)
667  CUDA_SAFE_CALL(cudaStreamCreate(&stream.recv[i])); // nonblocking flag is not really necessary here
668  }
669 #endif
670  }
671 
672  ~AsyncableImport() {
673 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
674  {
675  const local_ordinal_type num_streams = stream.send.size();
676  for (local_ordinal_type i=0;i<num_streams;++i)
677  CUDA_SAFE_CALL(cudaStreamDestroy(stream.send[i]));
678  }
679  {
680  const local_ordinal_type num_streams = stream.recv.size();
681  for (local_ordinal_type i=0;i<num_streams;++i)
682  CUDA_SAFE_CALL(cudaStreamDestroy(stream.recv[i]));
683  }
684 #endif
685  }
686 
687  void createDataBuffer(const local_ordinal_type &num_vectors) {
688  const size_type extent_0 = lids.recv.extent(0)*blocksize;
689  const size_type extent_1 = num_vectors;
690  if (remote_multivector.extent(0) == extent_0 &&
691  remote_multivector.extent(1) == extent_1) {
692  // skip
693  } else {
694  remote_multivector =
695  impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
696 
697  const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
698  const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
699 
700  buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
701  buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
702  }
703  }
704 
705  void cancel () {
706 #ifdef HAVE_IFPACK2_MPI
707  waitall(reqs.recv.size(), reqs.recv.data());
708  waitall(reqs.send.size(), reqs.send.data());
709 #endif
710  }
711 
712  // ======================================================================
713  // Async version using cuda stream
714  // - cuda only with kokkos develop branch
715  // ======================================================================
716 
717 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
718  template<typename PackTag>
719  static
720  void copyViaCudaStream(const local_ordinal_type_1d_view &lids_,
721  const impl_scalar_type_1d_view &buffer_,
722  const local_ordinal_type ibeg_,
723  const local_ordinal_type iend_,
724  const impl_scalar_type_2d_view_tpetra &multivector_,
725  const local_ordinal_type blocksize_,
726  const cudaStream_t &stream_) {
727  const local_ordinal_type num_vectors = multivector_.extent(1);
728  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
729  const local_ordinal_type idiff = iend_ - ibeg_;
730  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
731 
732  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
733  local_ordinal_type vector_size(0);
734  if (blocksize_ <= 4) vector_size = 4;
735  else if (blocksize_ <= 8) vector_size = 8;
736  else if (blocksize_ <= 16) vector_size = 16;
737  else vector_size = 32;
738 
739  const auto exec_instance = Kokkos::Cuda(stream_);
740  const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
741  const team_policy_type policy(exec_instance, idiff, 1, vector_size);
742  Kokkos::parallel_for
743  (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
744  Kokkos::Experimental::require(policy, work_item_property),
745  KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
746  const local_ordinal_type i = member.league_rank();
747  Kokkos::parallel_for
748  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
749  auto aptr = abase + blocksize_*(i + idiff*j);
750  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
751  if (std::is_same<PackTag,ToBuffer>::value)
752  Kokkos::parallel_for
753  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
754  aptr[k] = bptr[k];
755  });
756  else
757  Kokkos::parallel_for
758  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
759  bptr[k] = aptr[k];
760  });
761  });
762  });
763  }
764 
765  void asyncSendRecvViaCudaStream(const impl_scalar_type_2d_view_tpetra &mv) {
766  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
767 
768 #ifdef HAVE_IFPACK2_MPI
769  // constants and reallocate data buffers if necessary
770  const local_ordinal_type num_vectors = mv.extent(1);
771  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
772 
773  // 0. post receive async
774  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
775  irecv(comm,
776  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
777  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
778  pids.recv[i],
779  42,
780  &reqs.recv[i]);
781  }
782 
783  // 1. async memcpy
784 #if defined (KOKKOS_ENABLE_OPENMP)
785 #pragma omp parallel for
786 #endif
787  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
788  auto &stream_at_i = stream.send[i];
789  // 1.0. enqueue pack buffer
790  copyViaCudaStream<ToBuffer>(lids.send, buffer.send,
791  offset_host.send(i), offset_host.send(i+1),
792  mv, blocksize,
793  stream_at_i);
794 
795  // // 1.1. enqueue call back posting isend , call back does not really work; what did I do wrong.
796  // auto &data_at_i = callback_data.send[i];
797  // data_at_i.comm = comm;
798  // data_at_i.buf = (char*)(buffer.send.data() + offset_host.send[i]*mv_blocksize);
799  // data_at_i.count = (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type);
800  // data_at_i.pid = pids.send[i];
801  // data_at_i.tag = 42;
802  // data_at_i.ireq = &reqs.send[i];
803  // data_at_i.r_val = 0;
804  // CUDA_SAFE_CALL(cudaStreamAddCallback(stream_at_i, callback_isend, (void*)&data_at_i, 0));
805  }
806 
807  Kokkos::fence();
808 #if defined (KOKKOS_ENABLE_OPENMP)
809 #pragma omp parallel for
810 #endif
811  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
812  // 1.1. sync the stream and isend
813  //CUDA_SAFE_CALL(cudaStreamSynchronize(stream_at_i));
814  isend(comm,
815  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
816  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
817  pids.send[i],
818  42,
819  &reqs.send[i]);
820  }
821 
822  // 2. poke communication
823  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
824  int flag;
825  MPI_Status stat;
826  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
827  }
828 #endif // HAVE_IFPACK2_MPI
829  }
830 
831  void syncRecvViaCudaStream() {
832  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
833 #ifdef HAVE_IFPACK2_MPI
834  // 0. wait for receive async.
835 #if defined (KOKKOS_ENABLE_OPENMP)
836 #pragma omp parallel for
837 #endif
838  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
839  local_ordinal_type idx = i;
840  auto &stream_at_idx = stream.recv[idx];
841 
842  // 0.0. wait any
843  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
844 
845  // 0.0. add call back for waiting version (not working, what did i do wrong?)
846  //CUDA_SAFE_CALL(cudaStreamAddCallback(stream_at_idx, callback_wait, (void*)&reqs.recv[idx], 0));
847 
848  // 0.1. unpack data after data is moved into a device
849  copyViaCudaStream<ToMultiVector>(lids.recv, buffer.recv,
850  offset_host.recv(idx), offset_host.recv(idx+1),
851  remote_multivector, blocksize,
852  stream_at_idx);
853  }
854 
855  // 1. fire up all cuda events
856  Kokkos::fence();
857 
858  // 2. cleanup all open comm
859  waitall(reqs.send.size(), reqs.send.data());
860 #endif // HAVE_IFPACK2_MPI
861  }
862 #endif //defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
863 
864  // ======================================================================
865  // Generic version without using cuda stream
866  // - the only difference between the cuda and host architectures is the use of
867  //   team or range policies.
868  // ======================================================================
869  template<typename PackTag>
870  static
871  void copy(const local_ordinal_type_1d_view &lids_,
872  const impl_scalar_type_1d_view &buffer_,
873  const local_ordinal_type &ibeg_,
874  const local_ordinal_type &iend_,
875  const impl_scalar_type_2d_view_tpetra &multivector_,
876  const local_ordinal_type blocksize_) {
877  const local_ordinal_type num_vectors = multivector_.extent(1);
878  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
879  const local_ordinal_type idiff = iend_ - ibeg_;
880  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
881  if (is_cuda<execution_space>::value) {
882 #if defined(KOKKOS_ENABLE_CUDA)
883  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
884  local_ordinal_type vector_size(0);
885  if (blocksize_ <= 4) vector_size = 4;
886  else if (blocksize_ <= 8) vector_size = 8;
887  else if (blocksize_ <= 16) vector_size = 16;
888  else vector_size = 32;
889  const team_policy_type policy(idiff, 1, vector_size);
890  Kokkos::parallel_for
891  ("AsyncableImport::TeamPolicy::copy",
892  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
893  const local_ordinal_type i = member.league_rank();
894  Kokkos::parallel_for
895  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
896  auto aptr = abase + blocksize_*(i + idiff*j);
897  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
898  if (std::is_same<PackTag,ToBuffer>::value)
899  Kokkos::parallel_for
900  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
901  aptr[k] = bptr[k];
902  });
903  else
904  Kokkos::parallel_for
905  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
906  bptr[k] = aptr[k];
907  });
908  });
909  });
910 #endif
911  } else {
912 #if defined(__CUDA_ARCH__)
913  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
914 #else
915  {
916  const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
917  Kokkos::parallel_for
918  ("AsyncableImport::RangePolicy::copy",
919  policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
920  const local_ordinal_type i = ij%idiff;
921  const local_ordinal_type j = ij/idiff;
922  auto aptr = abase + blocksize_*(i + idiff*j);
923  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
924  auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
925  auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
926  memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
927  });
928  }
929 #endif
930  }
931  Kokkos::fence();
932  }
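  // Note on the buffer layout implied by the indexing above
  // (aptr = abase + blocksize_*(i + idiff*j)): within one neighbor's segment, all
  // idiff rows of vector 0 are stored first, then all rows of vector 1, and so on,
  // each (row, vector) chunk being blocksize_ scalars long. E.g. for idiff = 3 and
  // num_vectors = 2 the segment is
  //   [r0,v0][r1,v0][r2,v0] [r0,v1][r1,v1][r2,v1].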
933 
934  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
935  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
936 
937 #ifdef HAVE_IFPACK2_MPI
938  // constants and reallocate data buffers if necessary
939  const local_ordinal_type num_vectors = mv.extent(1);
940  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
941 
942  // receive async
943  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
944  irecv(comm,
945  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
946  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
947  pids.recv[i],
948  42,
949  &reqs.recv[i]);
950  }
951 
952  // send async
953  for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
954  copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
955  mv, blocksize);
956  isend(comm,
957  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
958  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
959  pids.send[i],
960  42,
961  &reqs.send[i]);
962  }
963 
964  // I find that issuing an Iprobe seems to nudge some MPIs into action,
965  // which helps with overlapped comm/comp performance.
966  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
967  int flag;
968  MPI_Status stat;
969  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
970  }
971 #endif
972  }
973 
974  void syncRecv() {
975  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
976 #ifdef HAVE_IFPACK2_MPI
977  // receive async.
978  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
979  local_ordinal_type idx = i;
980  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
981  copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
982  remote_multivector, blocksize);
983  }
984  // wait on the sends to match all Isends with a cleanup operation.
985  waitall(reqs.send.size(), reqs.send.data());
986 #endif
987  }
988 
989  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
990  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
991 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
992  asyncSendRecvViaCudaStream(mv);
993  syncRecvViaCudaStream();
994 #else
995  asyncSendRecv(mv);
996  syncRecv();
997 #endif
998  }
999 
1000  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
1001  };
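  // Illustrative caller sketch (hypothetical; not part of this header): the
  // intended sequence for overlapping the halo exchange with local compute is
  //
  //   importer->createDataBuffer(num_vectors);  // (re)allocate send/recv buffers
  //   importer->asyncSendRecv(X_owned);         // post Irecv, pack and Isend
  //   /* ... compute on owned data while messages are in flight ... */
  //   importer->syncRecv();                     // wait and unpack into remote MV
  //   const auto X_remote = importer->getRemoteMultiVectorLocalView();
  //
  // syncExchange(X_owned) simply performs asyncSendRecv followed by syncRecv when
  // no overlap is wanted.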
1002 
1006  template<typename MatrixType>
1007  Teuchos::RCP<AsyncableImport<MatrixType> >
1008  createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
1009  using impl_type = ImplType<MatrixType>;
1010  using tpetra_map_type = typename impl_type::tpetra_map_type;
1011  using local_ordinal_type = typename impl_type::local_ordinal_type;
1012  using global_ordinal_type = typename impl_type::global_ordinal_type;
1013  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1014 
1015  const auto g = A->getCrsGraph(); // tpetra crs graph object
1016  const auto blocksize = A->getBlockSize();
1017  const auto domain_map = g.getDomainMap();
1018  const auto column_map = g.getColMap();
1019 
1020  std::vector<global_ordinal_type> gids;
1021  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
1022  for (size_t i=0;i<column_map->getNodeNumElements();++i) {
1023  const global_ordinal_type gid = column_map->getGlobalElement(i);
1024  if (!domain_map->isNodeGlobalElement(gid)) {
1025  found_first = true;
1026  gids.push_back(gid);
1027  } else if (found_first) {
1028  separate_remotes = false;
1029  break;
1030  }
1031  if (!need_owned_permutation &&
1032  domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
1033  // The owned part of the domain and column maps are different
1034  // orderings. We *could* do a super efficient impl of this case in the
1035  // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
1036  // really, if a caller cares about speed, they wouldn't make different
1037  // local permutations like this. So we punt on the best impl and go for
1038  // a pretty good one: the permutation is done in place in
1039  // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
1040  // is the presumably worse memory access pattern of the input vector.
1041  need_owned_permutation = true;
1042  }
1043  }
1044 
1045  if (separate_remotes) {
1046  const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
1047  const auto parsimonious_col_map
1048  = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
1049  if (parsimonious_col_map->getGlobalNumElements() > 0) {
1050  // make the importer only if needed.
1051  local_ordinal_type_1d_view dm2cm;
1052  if (need_owned_permutation) {
1053  dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getNodeNumElements());
1054  const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
1055  for (size_t i=0;i<domain_map->getNodeNumElements();++i)
1056  dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
1057  Kokkos::deep_copy(dm2cm, dm2cm_host);
1058  }
1059  return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
1060  }
1061  }
1062  return Teuchos::null;
1063  }
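  // Illustrative dm2cm example (hypothetical GIDs): if a rank's domain map owns
  // global ids {10, 11, 12} in that order while the column map lists the owned
  // part as {11, 10, 12, <remotes>}, then need_owned_permutation is true and
  // dm2cm = {1, 0, 2}: entry i is the domain-map LID of the i'th owned column-map
  // entry, applied later in compute_b_minus_Rx for the owned part.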
1064 
1065  template<typename MatrixType>
1066  struct PartInterface {
1067  using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
1068  using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;
1069 
1070  PartInterface() = default;
1071  PartInterface(const PartInterface &b) = default;
1072 
1073  // Some terms:
1074  // The matrix A is split as A = D + R, where D is the matrix of tridiag
1075  // blocks and R is the remainder.
1076  // A part is roughly a synonym for a tridiag. The distinction is that a part
1077  // is the set of rows belonging to one tridiag and, equivalently, the off-diag
1078  // rows in R associated with that tridiag. In contrast, the term tridiag is
1079  // used to refer specifically to tridiag data, such as the pointer into the
1080  // tridiag data array.
1081  // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
1082  // tridiag, and partptr points to the beginning of each tridiag. This is the
1083  // LID space.
1084  // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
1085  // by this ordinal. This is the 'index' space.
1086  // A flat index is the mathematical index into an array. A pack index
1087  // accounts for SIMD packing.
1088 
1089  // Local row LIDs. Permutation from caller's index space to tridiag index
1090  // space.
1091  local_ordinal_type_1d_view lclrow;
1092  // partptr_ is the pointer array into lclrow_.
1093  local_ordinal_type_1d_view partptr; // np+1
1094  // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
1095  // is the start of the i'th pack.
1096  local_ordinal_type_1d_view packptr; // npack+1
1097  // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
1098  // an alias of partptr_ in the case of no overlap.
1099  local_ordinal_type_1d_view part2rowidx0; // np+1
1100  // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
1101  // it's the same as part2rowidx0_; if it's > 1, then the value is combined
1102  // with i % vector_length to get the location in the packed data.
1103  local_ordinal_type_1d_view part2packrowidx0; // np+1
1104  local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
1105  // rowidx2part_ maps the row index to the part index.
1106  local_ordinal_type_1d_view rowidx2part; // nr
1107  // True if lcl{row|col} is at most a constant away from row{idx|col}. In
1108  // practice, this knowledge is not particularly useful, as packing for batched
1109  // processing is done at the same time as the permutation from LID to index
1110  // space. But it's easy to detect, so it's recorded in case an optimization
1111  // can be made based on it.
1112  bool row_contiguous;
1113 
1114  local_ordinal_type max_partsz;
1115  };
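  // Illustrative worked example (hypothetical partition, vector_length = 2):
  // partitions = { {0,1,2}, {3,4} }, i.e. parts of 3 and 2 local rows, give
  //
  //   partptr          = {0, 3, 5}
  //   lclrow           = {0, 1, 2, 3, 4}
  //   part2rowidx0     = {0, 3, 5}        // alias of partptr (no overlap)
  //   part2packrowidx0 = {0, 0, 3}        // both parts share one SIMD pack
  //   packptr          = {0, 2}           // a single pack covering parts [0,2)
  //   rowidx2part      = {0, 0, 0, 1, 1}
  //   max_partsz       = 3
  //
  // The shorter part is padded to the pack's 3 packed rows; the padding is made
  // harmless by setTridiagsToIdentity further below.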
1116 
1120  template<typename MatrixType>
1121  PartInterface<MatrixType>
1122  createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1123  const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
1124  using impl_type = ImplType<MatrixType>;
1125  using local_ordinal_type = typename impl_type::local_ordinal_type;
1126  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1127 
1128  constexpr int vector_length = impl_type::vector_length;
1129 
1130  const auto comm = A->getRowMap()->getComm();
1131 
1132  PartInterface<MatrixType> interf;
1133 
1134  const bool jacobi = partitions.size() == 0;
1135  const local_ordinal_type A_n_lclrows = A->getNodeNumRows();
1136  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1137 
1138 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1139  local_ordinal_type nrows = 0;
1140  if (jacobi)
1141  nrows = nparts;
1142  else
1143  for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1144 
1145  TEUCHOS_TEST_FOR_EXCEPT_MSG
1146  (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1147  << "the same as getNodeNumRows: " << nrows << " vs " << A_n_lclrows);
1148 #endif
1149 
1150  // permutation vector
1151  std::vector<local_ordinal_type> p;
1152  if (jacobi) {
1153  interf.max_partsz = 1;
1154  } else {
1155  // reorder parts to maximize simd packing efficiency
1156  p.resize(nparts);
1157 
1158  typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1159  std::vector<size_idx_pair_type> partsz(nparts);
1160  for (local_ordinal_type i=0;i<nparts;++i)
1161  partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1162  std::sort(partsz.begin(), partsz.end(),
1163  [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
1164  return x.first > y.first;
1165  });
1166  for (local_ordinal_type i=0;i<nparts;++i)
1167  p[i] = partsz[i].second;
1168 
1169  interf.max_partsz = partsz[0].first;
1170  }
1171 
1172  // allocate parts
1173  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1174  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1175  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1176  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1177  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1178 
1179  // mirror to host and compute on host execution space
1180  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1181  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1182  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1183  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1184  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1185 
1186  // Determine parts.
1187  interf.row_contiguous = true;
1188  partptr(0) = 0;
1189  part2rowidx0(0) = 0;
1190  part2packrowidx0(0) = 0;
1191  local_ordinal_type pack_nrows = 0;
1192  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1193  const auto* part = jacobi ? NULL : &partitions[p[ip]];
1194  const local_ordinal_type ipnrows = jacobi ? 1 : part->size();
1195  TEUCHOS_ASSERT(ip == 0 || (jacobi || ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1196  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1197  get_msg_prefix(comm)
1198  << "partition " << p[ip]
1199  << " is empty, which is not allowed.");
1200  // assume no overlap.
1201  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1202  // Since parts are ordered in nonincreasing size, the size of the first
1203  // part in a pack is the size for all parts in the pack.
1204  if (ip % vector_length == 0) pack_nrows = ipnrows;
1205  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1206  const local_ordinal_type os = partptr(ip);
1207  for (local_ordinal_type i=0;i<ipnrows;++i) {
1208  const auto lcl_row = jacobi ? ip : (*part)[i];
1209  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1210  get_msg_prefix(comm)
1211  << "partitions[" << p[ip] << "]["
1212  << i << "] = " << lcl_row
1213  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1214  << "].");
1215  lclrow(os+i) = lcl_row;
1216  rowidx2part(os+i) = ip;
1217  if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1218  interf.row_contiguous = false;
1219  }
1220  partptr(ip+1) = os + ipnrows;
1221  }
1222 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1223  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1224 #endif
1225  if (lclrow(0) != 0) interf.row_contiguous = false;
1226 
1227  Kokkos::deep_copy(interf.partptr, partptr);
1228  Kokkos::deep_copy(interf.lclrow, lclrow);
1229 
1230  // assume no overlap. Thus:
1231  interf.part2rowidx0 = interf.partptr;
1232  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1233 
1234  interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
1235  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1236 
1237  { // Fill packptr.
1238  local_ordinal_type npacks = 0;
1239  for (local_ordinal_type ip=1;ip<=nparts;++ip)
1240  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1241  ++npacks;
1242  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1243  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1244  packptr(0) = 0;
1245  for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1246  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1247  packptr(k++) = ip;
1248  Kokkos::deep_copy(interf.packptr, packptr);
1249  }
1250 
1251  return interf;
1252  }
1253 
1257  template <typename MatrixType>
1258  struct BlockTridiags {
1259  using impl_type = ImplType<MatrixType>;
1260  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1261  using size_type_1d_view = typename impl_type::size_type_1d_view;
1262  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1263 
1264  // flat_td_ptr(i) is the index into flat-array values of the start of the
1265  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1266  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1267  // vector_length is the position in the pack.
1268  size_type_1d_view flat_td_ptr, pack_td_ptr;
1269  // List of local column indices into A from which to grab
1270  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1271  local_ordinal_type_1d_view A_colindsub;
1272  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1273  // tridiag's pack, and i % vector_length gives the position in the pack.
1274  vector_type_3d_view values;
1275 
1276  bool is_diagonal_only;
1277 
1278  BlockTridiags() = default;
1279  BlockTridiags(const BlockTridiags &b) = default;
1280 
1281  // Index into row-major block of a tridiag.
1282  template <typename idx_type>
1283  static KOKKOS_FORCEINLINE_FUNCTION
1284  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1285  // Given a row of a row-major tridiag, return the index of the first block
1286  // in that row.
1287  template <typename idx_type>
1288  static KOKKOS_FORCEINLINE_FUNCTION
1289  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1290  // Number of blocks in a tridiag having a given number of rows.
1291  template <typename idx_type>
1292  static KOKKOS_FORCEINLINE_FUNCTION
1293  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
1294  };
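  // Illustrative worked example: a tridiag with nrows = 3 block rows stores its
  // blocks row-major as  d0 u0 | l1 d1 u1 | l2 d2  (NumBlocks(3) = 7), so
  //
  //   IndexToRow: 0,1 -> 0    2,3,4 -> 1    5,6 -> 2
  //   RowToIndex: 0 -> 0      1 -> 2        2 -> 5
  //
  // Each interior row contributes a (sub, diag, super) triple; the first and last
  // rows omit the sub- and super-diagonal block respectively.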
1295 
1296 
1300  template<typename MatrixType>
1301  BlockTridiags<MatrixType>
1302  createBlockTridiags(const PartInterface<MatrixType> &interf) {
1303  using impl_type = ImplType<MatrixType>;
1304  using execution_space = typename impl_type::execution_space;
1305  using local_ordinal_type = typename impl_type::local_ordinal_type;
1306  using size_type = typename impl_type::size_type;
1307  using size_type_1d_view = typename impl_type::size_type_1d_view;
1308 
1309  constexpr int vector_length = impl_type::vector_length;
1310 
1311  BlockTridiags<MatrixType> btdm;
1312 
1313  const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;
1314 
1315  { // construct the flat index pointers into the tridiag values array.
1316  btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
1317  const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
1318  Kokkos::parallel_scan
1319  ("createBlockTridiags::RangePolicy::flat_td_ptr",
1320  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1321  if (final)
1322  btdm.flat_td_ptr(i) = update;
1323  if (i < ntridiags) {
1324  const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
1325  update += btdm.NumBlocks(nrows);
1326  }
1327  });
1328 
1329  const auto nblocks = Kokkos::create_mirror_view_and_copy
1330  (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
1331  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1332  }
1333 
1334  // And the packed index pointers.
1335  if (vector_length == 1) {
1336  btdm.pack_td_ptr = btdm.flat_td_ptr;
1337  } else {
1338  const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
1339  btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
1340  const Kokkos::RangePolicy<execution_space> policy(0,npacks);
1341  Kokkos::parallel_scan
1342  ("createBlockTridiags::RangePolicy::pack_td_ptr",
1343  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1344  const local_ordinal_type parti = interf.packptr(i);
1345  const local_ordinal_type parti_next = interf.packptr(i+1);
1346  if (final) {
1347  const size_type nblks = update;
1348  for (local_ordinal_type pti=parti;pti<parti_next;++pti)
1349  btdm.pack_td_ptr(pti) = nblks;
1350  const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1351  // last one
1352  if (i == npacks-1)
1353  btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
1354  }
1355  {
1356  const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1357  update += btdm.NumBlocks(nrows);
1358  }
1359  });
1360  }
1361 
1362  // values and A_colindsub are created in the symbolic phase
1363 
1364  return btdm;
1365  }
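  // Illustrative example of the two pointer arrays built above, using the same
  // hypothetical interface as before (parts of 3 and 2 rows, one pack):
  //
  //   flat_td_ptr = {0, 7, 11}   // NumBlocks(3) = 7, NumBlocks(2) = 4
  //   pack_td_ptr = {0, 0, 7}    // both tridiags start at packed offset 0 and
  //                              // share a pack of NumBlocks(3) = 7 packed blocks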
1366 
1367  // Set the tridiags to be I to the full pack block size. That way, if a
1368  // tridiag within a pack is shorter than the longest one, the extra blocks are
1369  // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1370  // in the packed multivector are 0, and the tridiag LU reflects the extra I
1371  // blocks, then the solve proceeds as though the extra blocks aren't
1372  // present. Since this extra work is part of the SIMD calls, it's not actually
1373  // extra work. Instead, it means we don't have to put checks or masks in, or
1374  // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1375  // since the numeric phase fills in only the used entries, leaving these I
1376  // blocks intact.
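// In equation form, a padded lane of the pack solves
//
//   [ T  0 ] [ x ]   [ b ]
//   [ 0  I ] [ y ] = [ 0 ]   =>   y = 0,
//
// so, provided the padded rows of the packed multivector are zero, the identity
// blocks carry zeros through the sweeps and never touch the lanes holding real data.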
1377  template<typename MatrixType>
1378  void
1379  setTridiagsToIdentity
1380  (const BlockTridiags<MatrixType>& btdm,
1381  const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1382  {
1383  using impl_type = ImplType<MatrixType>;
1384  using execution_space = typename impl_type::execution_space;
1385  using local_ordinal_type = typename impl_type::local_ordinal_type;
1386  using size_type_1d_view = typename impl_type::size_type_1d_view;
1387 
1388  const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
1389  const local_ordinal_type blocksize = btdm.values.extent(1);
1390 
1391  {
1392  const int vector_length = impl_type::vector_length;
1393  const int internal_vector_length = impl_type::internal_vector_length;
1394 
1395  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1396  using internal_vector_type = typename impl_type::internal_vector_type;
1397  using internal_vector_type_4d_view =
1398  typename impl_type::internal_vector_type_4d_view;
1399 
1400  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1401  const internal_vector_type_4d_view values
1402  (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1403  btdm.values.extent(0),
1404  btdm.values.extent(1),
1405  btdm.values.extent(2),
1406  vector_length/internal_vector_length);
1407  const local_ordinal_type vector_loop_size = values.extent(3);
1408 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1409  local_ordinal_type total_team_size(0);
1410  if (blocksize <= 5) total_team_size = 32;
1411  else if (blocksize <= 9) total_team_size = 64;
1412  else if (blocksize <= 12) total_team_size = 96;
1413  else if (blocksize <= 16) total_team_size = 128;
1414  else if (blocksize <= 20) total_team_size = 160;
1415  else total_team_size = 160;
1416  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1417  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1418 #else // Host architecture: team size is always one
1419  const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1420 #endif
1421  Kokkos::parallel_for
1422  ("setTridiagsToIdentity::TeamPolicy",
1423  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1424  const local_ordinal_type k = member.league_rank();
1425  const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
1426  const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
1427  const local_ordinal_type diff = iend - ibeg;
1428  const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1429  const btdm_scalar_type one(1);
1430  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1431  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1432  const local_ordinal_type i = ibeg + ii*3;
1433  for (local_ordinal_type j=0;j<blocksize;++j)
1434  values(i,j,j,v) = one;
1435  });
1436  });
1437  });
1438  }
1439  }
1440 
1444  template <typename MatrixType>
1445  struct AmD {
1446  using impl_type = ImplType<MatrixType>;
1447  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1448  using size_type_1d_view = typename impl_type::size_type_1d_view;
1449  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
1450 
1451  // rowptr points to the start of each row of A_colindsub.
1452  size_type_1d_view rowptr, rowptr_remote;
1453  // Indices into A's rows giving the blocks to extract. rowptr(i) points to
1454  // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
1455  // where g is A's graph, are the columns AmD uses. If seq_method_, then
1456  // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
1457  // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
1458  // contains the remote ones.
1459  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
1460 
1461  // Currently always true.
1462  bool is_tpetra_block_crs;
1463 
1464  // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
1465  impl_scalar_type_1d_view_tpetra tpetra_values;
1466 
1467  AmD() = default;
1468  AmD(const AmD &b) = default;
1469  };
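// A minimal traversal sketch (following how the symbolic phase below fills these
// views, i.e. A_colindsub holding offsets relative to the start of each row of
// A's graph g): the owned R blocks of local row lr are reached as
//
//   for (size_type k = rowptr(lr); k < rowptr(lr+1); ++k) {
//     const auto j   = g.row_map(lr) + A_colindsub(k); // entry position within row lr
//     const auto col = g.entries(j);                   // block column of the R entry
//     // the block's values start at tpetra_values(j*blocksize*blocksize)
//   }
//
// with rowptr_remote / A_colindsub_remote traversed the same way when
// communication and computation overlap.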
1470 
1474  template<typename MatrixType>
1475  void
1476  performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1477  const PartInterface<MatrixType> &interf,
1478  BlockTridiags<MatrixType> &btdm,
1479  AmD<MatrixType> &amd,
1480  const bool overlap_communication_and_computation) {
1481  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");
1482 
1483  using impl_type = ImplType<MatrixType>;
1484  using node_memory_space = typename impl_type::node_memory_space;
1485  using host_execution_space = typename impl_type::host_execution_space;
1486 
1487  using local_ordinal_type = typename impl_type::local_ordinal_type;
1488  using global_ordinal_type = typename impl_type::global_ordinal_type;
1489  using size_type = typename impl_type::size_type;
1490  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1491  using size_type_1d_view = typename impl_type::size_type_1d_view;
1492  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1493  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1494 
1495  constexpr int vector_length = impl_type::vector_length;
1496 
1497  const auto comm = A->getRowMap()->getComm();
1498  const auto& g = A->getCrsGraph();
1499  const auto blocksize = A->getBlockSize();
1500 
1501  // mirroring to host
1502  const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1503  const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1504  const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1505  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1506  const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1507 
1508  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1509 
1510  // find column to row map on host
1511  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getNodeNumCols());
1512  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1513  {
1514  const auto rowmap = g.getRowMap();
1515  const auto colmap = g.getColMap();
1516  const auto dommap = g.getDomainMap();
1517  TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1518 
1519 #if !defined(__CUDA_ARCH__)
1520  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1521  Kokkos::parallel_for
1522  ("performSymbolicPhase::RangePolicy::col2row",
1523  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1524  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1526  if (dommap->isNodeGlobalElement(gid)) {
1527  const local_ordinal_type lc = colmap->getLocalElement(gid);
1528 # if defined(BLOCKTRIDICONTAINER_DEBUG)
1529  TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1530  get_msg_prefix(comm) << "GID " << gid
1531  << " gives an invalid local column.");
1532 # endif
1533  col2row(lc) = lr;
1534  }
1535  });
1536 #endif
1537  }
1538 
1539  // construct the D and R graphs in A = D + R.
1540  {
1541  const auto& local_graph = g.getLocalGraph();
1542  const auto& local_graph_rowptr = local_graph.row_map;
1543  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1544  const auto& local_graph_colidx = local_graph.entries;
1545 
1546  //assume no overlap.
1547 
1548  Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1549  {
1550  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1551  Kokkos::parallel_for
1552  ("performSymbolicPhase::RangePolicy::lclrow2idx",
1553  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1554  lclrow2idx[lclrow(i)] = i;
1555  });
1556  }
1557 
1558  // count (block) nnzs in D and R.
1559  typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
1560  typename sum_reducer_type::value_type sum_reducer_value;
1561  {
1562  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1563  Kokkos::parallel_reduce
1564  // profiling interface does not work
1565  (//"performSymbolicPhase::RangePolicy::count_nnz",
1566  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1567  // LID -> index.
1568  const local_ordinal_type ri0 = lclrow2idx[lr];
1569  const local_ordinal_type pi0 = rowidx2part(ri0);
1570  for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1571  const local_ordinal_type lc = local_graph_colidx(j);
1572  const local_ordinal_type lc2r = col2row[lc];
1573  bool incr_R = false;
1574  do { // breakable
1575  if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1576  incr_R = true;
1577  break;
1578  }
1579  const local_ordinal_type ri = lclrow2idx[lc2r];
1580  const local_ordinal_type pi = rowidx2part(ri);
1581  if (pi != pi0) {
1582  incr_R = true;
1583  break;
1584  }
1585  // Test for being in the tridiag. This is done in index space. In
1586  // LID space, tridiag LIDs in a row are not necessarily related by
1587  // {-1, 0, 1}.
1588  if (ri0 + 1 >= ri && ri0 <= ri + 1)
1589  ++update.v[0]; // D_nnz
1590  else
1591  incr_R = true;
1592  } while (0);
1593  if (incr_R) {
1594  if (lc < nrows) ++update.v[1]; // R_nnz_owned
1595  else ++update.v[2]; // R_nnz_remote
1596  }
1597  }
1598  }, sum_reducer_type(sum_reducer_value));
1599  }
1600  size_type D_nnz = sum_reducer_value.v[0];
1601  size_type R_nnz_owned = sum_reducer_value.v[1];
1602  size_type R_nnz_remote = sum_reducer_value.v[2];
1603 
1604  if (!overlap_communication_and_computation) {
1605  R_nnz_owned += R_nnz_remote;
1606  R_nnz_remote = 0;
1607  }
1608 
1609  // construct the D graph.
1610  {
1611  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1612 
1613  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
1614  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
1615 
1616 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1617  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1618 #endif
1619 
1620  const local_ordinal_type nparts = partptr.extent(0) - 1;
1621  {
1622  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
1623  Kokkos::parallel_for
1624  ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
1625  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
1626  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
1627  local_ordinal_type offset = 0;
1628  for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
1629  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
1630  offset = 1;
1631  const local_ordinal_type lr0 = lclrow(ri0);
1632  const size_type j0 = local_graph_rowptr(lr0);
1633  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
1634  const local_ordinal_type lc = local_graph_colidx(j);
1635  const local_ordinal_type lc2r = col2row[lc];
1636  if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) continue;
1637  const local_ordinal_type ri = lclrow2idx[lc2r];
1638  const local_ordinal_type pi = rowidx2part(ri);
1639  if (pi != pi0) continue;
1640  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
1641  const local_ordinal_type row_entry = j - j0;
1642  D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
1643  }
1644  }
1645  });
1646  }
1647 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1648  for (size_t i=0;i<D_A_colindsub.extent(0);++i)
1649  TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1650 #endif
1651  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
1652 
1653  // Allocate values.
1654  {
1655  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
1656  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
1657  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
1658  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
1659  }
1660  }
1661 
1662  // Construct the R graph.
1663  {
1664  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
1665  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
1666 
1667  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
1668  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
1669 
1670  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
1671  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
1672 
1673  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
1674  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
1675 
1676  {
1677  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1678  Kokkos::parallel_for
1679  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
1680  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1681  const local_ordinal_type ri0 = lclrow2idx[lr];
1682  const local_ordinal_type pi0 = rowidx2part(ri0);
1683  const size_type j0 = local_graph_rowptr(lr);
1684  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1685  const local_ordinal_type lc = local_graph_colidx(j);
1686  const local_ordinal_type lc2r = col2row[lc];
1687  if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1688  const local_ordinal_type ri = lclrow2idx[lc2r];
1689  const local_ordinal_type pi = rowidx2part(ri);
1690  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
1691  continue;
1692  }
1693  }
1694  // exclusive scan will be performed later
1695  if (!overlap_communication_and_computation || lc < nrows) {
1696  ++R_rowptr(lr);
1697  } else {
1698  ++R_rowptr_remote(lr);
1699  }
1700  }
1701  });
1702  }
1703 
1704  // exclusive scan
1705  typedef ArrayValueType<size_type,2> update_type;
1706  {
1707  Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
1708  Kokkos::parallel_scan
1709  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
1710  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
1711  update_type &update,
1712  const bool &final) {
1713  update_type val;
1714  val.v[0] = R_rowptr(lr);
1715  if (overlap_communication_and_computation)
1716  val.v[1] = R_rowptr_remote(lr);
1717 
1718  if (final) {
1719  R_rowptr(lr) = update.v[0];
1720  if (overlap_communication_and_computation)
1721  R_rowptr_remote(lr) = update.v[1];
1722 
1723  if (lr < nrows) {
1724  const local_ordinal_type ri0 = lclrow2idx[lr];
1725  const local_ordinal_type pi0 = rowidx2part(ri0);
1726 
1727  size_type cnt_rowptr = R_rowptr(lr);
1728  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
1729 
1730  const size_type j0 = local_graph_rowptr(lr);
1731  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1732  const local_ordinal_type lc = local_graph_colidx(j);
1733  const local_ordinal_type lc2r = col2row[lc];
1734  if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1735  const local_ordinal_type ri = lclrow2idx[lc2r];
1736  const local_ordinal_type pi = rowidx2part(ri);
1737  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
1738  continue;
1739  }
1740  const local_ordinal_type row_entry = j - j0;
1741  if (!overlap_communication_and_computation || lc < nrows)
1742  R_A_colindsub(cnt_rowptr++) = row_entry;
1743  else
1744  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
1745  }
1746  }
1747  }
1748  update += val;
1749  });
1750  }
1751  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
1752  Kokkos::deep_copy(amd.rowptr, R_rowptr);
1753  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
1754  if (overlap_communication_and_computation) {
1755  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
1756  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
1757  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
1758  }
1759 
1760  // Allocate or view values.
1761  amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->
1762  template getValues<node_memory_space>());
1763  }
1764  }
1765  }
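// Recap of what the symbolic phase produces for the splitting A = D + R, where
// D is the part-local block tridiagonal and R everything else:
//  - btdm.flat_td_ptr, pack_td_ptr, A_colindsub, values : the D structure, with
//    values sized to the packed block count and, when vector_length > 1, padded
//    with identity blocks via setTridiagsToIdentity;
//  - amd.rowptr / A_colindsub (plus the *_remote variants when overlapping
//    communication and computation) : the R structure, as row-relative offsets
//    into A's own graph;
//  - amd.tpetra_values : an unmanaged view of A's block values, so R is indexed
//    in place rather than copied.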
1766 
1767 
1771  template<typename ArgActiveExecutionMemorySpace>
1772  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
1773 
1774  template<>
1775  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
1776  typedef KB::Mode::Serial mode_type;
1777 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
1778  typedef KB::Algo::Level3::CompactMKL algo_type;
1779 #else
1780  typedef KB::Algo::Level3::Blocked algo_type;
1781 #endif
1782  static int recommended_team_size(const int /* blksize */,
1783  const int /* vector_length */,
1784  const int /* internal_vector_length */) {
1785  return 1;
1786  }
1787 
1788  };
1789 
1790 #if defined(KOKKOS_ENABLE_CUDA)
1791  static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
1792  const int vector_length,
1793  const int internal_vector_length) {
1794  const int vector_size = vector_length/internal_vector_length;
1795  int total_team_size(0);
1796  if (blksize <= 5) total_team_size = 32;
1797  else if (blksize <= 9) total_team_size = 32; // 64
1798  else if (blksize <= 12) total_team_size = 96;
1799  else if (blksize <= 16) total_team_size = 128;
1800  else if (blksize <= 20) total_team_size = 160;
1801  else total_team_size = 160;
1802  return 2*total_team_size/vector_size;
1803  }
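// Worked example of the heuristic above, assuming blksize = 9, vector_length = 8,
// internal_vector_length = 4:
//
//   vector_size     = 8/4 = 2
//   total_team_size = 32            // blksize <= 9 branch
//   team size       = 2*32/2 = 32
//
// Note the extra factor of 2 relative to SolveTridiagsRecommendedCudaTeamSize
// further below, which returns total_team_size/vector_size for the same inputs.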
1804  template<>
1805  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
1806  typedef KB::Mode::Team mode_type;
1807  typedef KB::Algo::Level3::Unblocked algo_type;
1808  static int recommended_team_size(const int blksize,
1809  const int vector_length,
1810  const int internal_vector_length) {
1811  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1812  }
1813  };
1814  template<>
1815  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
1816  typedef KB::Mode::Team mode_type;
1817  typedef KB::Algo::Level3::Unblocked algo_type;
1818  static int recommended_team_size(const int blksize,
1819  const int vector_length,
1820  const int internal_vector_length) {
1821  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1822  }
1823  };
1824 #endif
1825 
1826  template<typename MatrixType>
1827  struct ExtractAndFactorizeTridiags {
1828  public:
1829  using impl_type = ImplType<MatrixType>;
1830  // a functor cannot declare both device_type and execution_space; doing so triggers a specialization error in Kokkos
1831  using execution_space = typename impl_type::execution_space;
1832  using memory_space = typename impl_type::memory_space;
1834  using local_ordinal_type = typename impl_type::local_ordinal_type;
1835  using size_type = typename impl_type::size_type;
1836  using impl_scalar_type = typename impl_type::impl_scalar_type;
1837  using magnitude_type = typename impl_type::magnitude_type;
1839  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1841  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1842  using size_type_1d_view = typename impl_type::size_type_1d_view;
1843  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
1845  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1846  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
1847  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1848  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
1849  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
1850  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
1851  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
1852 
1853  using internal_vector_type = typename impl_type::internal_vector_type;
1854  static constexpr int vector_length = impl_type::vector_length;
1855  static constexpr int internal_vector_length = impl_type::internal_vector_length;
1856 
1858  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1859  using member_type = typename team_policy_type::member_type;
1860 
1861  private:
1862  // part interface
1863  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
1864  const local_ordinal_type max_partsz;
1865  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
1866  using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::node_device_type>::size_type;
1867  using size_type_1d_view_tpetra = Kokkos::View<a_rowptr_value_type*,typename impl_type::node_device_type>;
1868  const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
1869  const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
1870  // block tridiags
1871  const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
1872  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
1873  const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
1874  const Unmanaged<btdm_scalar_type_4d_view> scalar_values;
1875  // shared information
1876  const local_ordinal_type blocksize, blocksize_square;
1877  // diagonal safety
1878  const magnitude_type tiny;
1879  const local_ordinal_type vector_loop_size;
1880  const local_ordinal_type vector_length_value;
1881 
1882  public:
1883  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
1884  const PartInterface<MatrixType> &interf_,
1885  const Teuchos::RCP<const block_crs_matrix_type> &A_,
1886  const magnitude_type& tiny_) :
1887  // interface
1888  partptr(interf_.partptr),
1889  lclrow(interf_.lclrow),
1890  packptr(interf_.packptr),
1891  max_partsz(interf_.max_partsz),
1892  // block crs matrix
1893  A_rowptr(A_->getCrsGraph().getLocalGraph().row_map),
1894  A_values(const_cast<block_crs_matrix_type*>(A_.get())->template getValues<memory_space>()),
1895  // block tridiags
1896  pack_td_ptr(btdm_.pack_td_ptr),
1897  flat_td_ptr(btdm_.flat_td_ptr),
1898  A_colindsub(btdm_.A_colindsub),
1899  internal_vector_values((internal_vector_type*)btdm_.values.data(),
1900  btdm_.values.extent(0),
1901  btdm_.values.extent(1),
1902  btdm_.values.extent(2),
1903  vector_length/internal_vector_length),
1904  scalar_values((btdm_scalar_type*)btdm_.values.data(),
1905  btdm_.values.extent(0),
1906  btdm_.values.extent(1),
1907  btdm_.values.extent(2),
1908  vector_length),
1909  blocksize(btdm_.values.extent(1)),
1910  blocksize_square(blocksize*blocksize),
1911  // diagonal weight to avoid zero pivots
1912  tiny(tiny_),
1913  vector_loop_size(vector_length/internal_vector_length),
1914  vector_length_value(vector_length) {}
1915 
1916  private:
1917 
1918  KOKKOS_INLINE_FUNCTION
1919  void
1920  extract(local_ordinal_type partidx,
1921  local_ordinal_type npacks) const {
1922  const size_type kps = pack_td_ptr(partidx);
1923  local_ordinal_type kfs[vector_length] = {};
1924  local_ordinal_type ri0[vector_length] = {};
1925  local_ordinal_type nrows[vector_length] = {};
1926 
1927  for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
1928  kfs[vi] = flat_td_ptr(partidx);
1929  ri0[vi] = partptr(partidx);
1930  nrows[vi] = partptr(partidx+1) - ri0[vi];
1931  }
1932  for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
1933  for (local_ordinal_type e=0;e<3;++e) {
1934  const impl_scalar_type* block[vector_length] = {};
1935  for (local_ordinal_type vi=0;vi<npacks;++vi) {
1936  const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
1937  block[vi] = &A_values(Aj*blocksize_square);
1938  }
1939  const size_type pi = kps + j;
1940  ++j;
1941  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
1942  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
1943  const auto idx = ii*blocksize + jj;
1944  auto& v = internal_vector_values(pi, ii, jj, 0);
1945  for (local_ordinal_type vi=0;vi<npacks;++vi)
1946  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
1947  }
1948  }
1949 
1950  if (nrows[0] == 1) break;
1951  if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
1952  for (local_ordinal_type vi=1;vi<npacks;++vi) {
1953  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
1954  npacks = vi;
1955  break;
1956  }
1957  }
1958  }
1959  }
1960  }
1961 
1962  KOKKOS_INLINE_FUNCTION
1963  void
1964  extract(const member_type &member,
1965  const local_ordinal_type &partidxbeg,
1966  const local_ordinal_type &npacks,
1967  const local_ordinal_type &vbeg) const {
1968  local_ordinal_type kfs_vals[internal_vector_length] = {};
1969  local_ordinal_type ri0_vals[internal_vector_length] = {};
1970  local_ordinal_type nrows_vals[internal_vector_length] = {};
1971 
1972  const size_type kps = pack_td_ptr(partidxbeg);
1973  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
1974  kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
1975  ri0_vals[vi] = partptr(partidxbeg+vi);
1976  nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
1977  }
1978 
1979  local_ordinal_type j_vals[internal_vector_length] = {};
1980  for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {
1981  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
1982  const local_ordinal_type nrows = nrows_vals[vi];
1983  if (tr < nrows) {
1984  auto &j = j_vals[vi];
1985  const local_ordinal_type kfs = kfs_vals[vi];
1986  const local_ordinal_type ri0 = ri0_vals[vi];
1987  const local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
1988  const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
1989  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
1990  const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
1991  const impl_scalar_type* block = &A_values(Aj*blocksize_square);
1992  const size_type pi = kps + j;
1993  Kokkos::parallel_for
1994  (Kokkos::TeamThreadRange(member,blocksize),
1995  [&](const local_ordinal_type &ii) {
1996  for (local_ordinal_type jj=0;jj<blocksize;++jj)
1997  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[ii*blocksize + jj]);
1998  });
1999  }
2000  }
2001  }
2002  }
2003  }
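// Lane-packing sketch for the two extract() overloads above: a pack interleaves
// several parts as SIMD lanes, so the block at packed index pi holds in lane vi
// the corresponding block of part (partidx + vi), e.g.
//
//   internal_vector_values(pi, ii, jj, 0)[vi]   // serial path
//   scalar_values(pi, ii, jj, v)                // team path, one lane per v
//
// A lane whose part has fewer rows simply stops being written once tr reaches
// that part's row count, leaving the identity padding from the symbolic phase intact.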
2004 
2005  template<typename AAViewType,
2006  typename WWViewType>
2007  KOKKOS_INLINE_FUNCTION
2008  void
2009  factorize(const member_type &member,
2010  const local_ordinal_type &i0,
2011  const local_ordinal_type &nrows,
2012  const local_ordinal_type &v,
2013  const AAViewType &AA,
2014  const WWViewType &WW) const {
2015  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
2016  <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2017  typedef default_mode_and_algo_type::mode_type default_mode_type;
2018  typedef default_mode_and_algo_type::algo_type default_algo_type;
2019 
2020  // constant
2021  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2022 
2023  // subview pattern
2024  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2025  KB::LU<member_type,
2026  default_mode_type,KB::Algo::LU::Unblocked>
2027  ::invoke(member, A , tiny);
2028  if (nrows > 1) {
2029  auto B = A;
2030  auto C = A;
2031  local_ordinal_type i = i0;
2032  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2033  B.assign_data( &AA(i+1,0,0,v) );
2034  KB::Trsm<member_type,
2035  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2036  default_mode_type,default_algo_type>
2037  ::invoke(member, one, A, B);
2038  C.assign_data( &AA(i+2,0,0,v) );
2039  KB::Trsm<member_type,
2040  KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2041  default_mode_type,default_algo_type>
2042  ::invoke(member, one, A, C);
2043  A.assign_data( &AA(i+3,0,0,v) );
2044  KB::Gemm<member_type,
2045  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2046  default_mode_type,default_algo_type>
2047  ::invoke(member, -one, C, B, one, A);
2048  KB::LU<member_type,
2049  default_mode_type,KB::Algo::LU::Unblocked>
2050  ::invoke(member, A, tiny);
2051  }
2052  } else {
2053  // for a single-row part (block Jacobi), invert the diagonal block here
2054  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2055  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2056  ::invoke(member, A, W);
2057  KB::SetIdentity<member_type,default_mode_type>
2058  ::invoke(member, A);
2059  member.team_barrier();
2060  KB::Trsm<member_type,
2061  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2062  default_mode_type,default_algo_type>
2063  ::invoke(member, one, W, A);
2064  KB::Trsm<member_type,
2065  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2066  default_mode_type,default_algo_type>
2067  ::invoke(member, one, W, A);
2068  }
2069  }
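// The nrows > 1 branch above is the standard block-tridiagonal LU recurrence,
// written against the storage order D_0 U_0 L_1 D_1 U_1 ... :
//
//   D_0    = L_0 U_0                      (in-place LU)
//   U_r   <- L_r^{-1} U_r                 (Trsm: Left / Lower / Unit)
//   L_r+1 <- L_r+1 U_r^{-1}               (Trsm: Right / Upper / NonUnit)
//   D_r+1 <- D_r+1 - L_r+1 U_r            (Gemm)
//   D_r+1  = L_r+1 U_r+1                  (in-place LU),  r = 0 .. nrows-2
//
// The nrows == 1 branch instead forms the explicit inverse of the single diagonal
// block (copy, SetIdentity, two Trsm's), so the solve phase can apply it with a
// single Gemv/Gemm.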
2070 
2071  public:
2072 
2073  struct ExtractAndFactorizeTag {};
2074 
2075  KOKKOS_INLINE_FUNCTION
2076  void
2077  operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
2078  // btdm is packed and sorted starting from the largest tridiag
2079  const local_ordinal_type packidx = member.league_rank();
2080 
2081  const local_ordinal_type partidx = packptr(packidx);
2082  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2083  const local_ordinal_type i0 = pack_td_ptr(partidx);
2084  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2085 
2086  internal_vector_scratch_type_3d_view
2087  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
2088  if (vector_loop_size == 1) {
2089  extract(partidx, npacks);
2090  factorize(member, i0, nrows, 0, internal_vector_values, WW);
2091  } else {
2092  Kokkos::parallel_for
2093  (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const local_ordinal_type &v) {
2094  const local_ordinal_type vbeg = v*internal_vector_length;
2095  if (vbeg < npacks)
2096  extract(member, partidx+vbeg, npacks, vbeg);
2097  factorize(member, i0, nrows, v, internal_vector_values, WW);
2098  });
2099  }
2100  }
2101 
2102  void run() {
2103  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2104  const local_ordinal_type team_size =
2105  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2106  recommended_team_size(blocksize, vector_length, internal_vector_length);
2107  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
2108  shmem_size(blocksize, blocksize, vector_loop_size);
2109 
2110  Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
2111  policy(packptr.extent(0)-1, team_size, vector_loop_size);
2112 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2113  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2114  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
2115 #else
2116  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
2117  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2118  policy, *this);
2119 #endif
2120  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2121  }
2122 
2123  };
2124 
2128  template<typename MatrixType>
2129  void
2130  performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
2131  const PartInterface<MatrixType> &interf,
2132  BlockTridiags<MatrixType> &btdm,
2133  const typename ImplType<MatrixType>::magnitude_type tiny) {
2134  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
2135  ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
2136  function.run();
2137  }
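// A usage sketch of the two phases (illustrative only; createPartInterface and
// the partitions argument are assumed from the surrounding container code):
//
//   const auto interf = createPartInterface<MatrixType>(A, partitions);
//   auto btdm = createBlockTridiags<MatrixType>(interf);
//   AmD<MatrixType> amd;
//   performSymbolicPhase<MatrixType>(A, interf, btdm, amd, overlap);
//   performNumericPhase<MatrixType>(A, interf, btdm, /*tiny=*/tiny);
//
// The symbolic phase runs once per structure; the numeric phase re-runs whenever
// A's values change.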
2138 
2142  template<typename MatrixType>
2143  struct MultiVectorConverter {
2144  public:
2145  using impl_type = ImplType<MatrixType>;
2146  using execution_space = typename impl_type::execution_space;
2147  using memory_space = typename impl_type::memory_space;
2148 
2149  using local_ordinal_type = typename impl_type::local_ordinal_type;
2150  using impl_scalar_type = typename impl_type::impl_scalar_type;
2151  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2152  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
2153  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2154  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2155  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2156  static constexpr int vector_length = impl_type::vector_length;
2157 
2158  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
2159 
2160  private:
2161  // part interface
2162  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2163  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2164  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2165  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
2166  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2167  const local_ordinal_type blocksize;
2168  const local_ordinal_type num_vectors;
2169 
2170  // packed multivector output (or input)
2171  vector_type_3d_view packed_multivector;
2172  impl_scalar_type_2d_view_tpetra scalar_multivector;
2173 
2174  template<typename TagType>
2175  KOKKOS_INLINE_FUNCTION
2176  void copy_multivectors(const local_ordinal_type &j,
2177  const local_ordinal_type &vi,
2178  const local_ordinal_type &pri,
2179  const local_ordinal_type &ri0) const {
2180  for (local_ordinal_type col=0;col<num_vectors;++col)
2181  for (local_ordinal_type i=0;i<blocksize;++i)
2182  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2183  }
2184 
2185  public:
2186 
2187  MultiVectorConverter(const PartInterface<MatrixType> &interf,
2188  const vector_type_3d_view &pmv)
2189  : partptr(interf.partptr),
2190  packptr(interf.packptr),
2191  part2packrowidx0(interf.part2packrowidx0),
2192  part2rowidx0(interf.part2rowidx0),
2193  lclrow(interf.lclrow),
2194  blocksize(pmv.extent(1)),
2195  num_vectors(pmv.extent(2)),
2196  packed_multivector(pmv) {}
2197 
2198  // TODO: modify this routine similarly to the team-level functions
2199  inline
2200  void
2201  operator() (const local_ordinal_type &packidx) const {
2202  local_ordinal_type partidx = packptr(packidx);
2203  local_ordinal_type npacks = packptr(packidx+1) - partidx;
2204  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2205 
2206  local_ordinal_type ri0[vector_length] = {};
2207  local_ordinal_type nrows[vector_length] = {};
2208  for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
2209  ri0[v] = part2rowidx0(partidx);
2210  nrows[v] = part2rowidx0(partidx+1) - ri0[v];
2211  }
2212  for (local_ordinal_type j=0;j<nrows[0];++j) {
2213  local_ordinal_type cnt = 1;
2214  for (;cnt<npacks && j!= nrows[cnt];++cnt);
2215  npacks = cnt;
2216  const local_ordinal_type pri = pri0 + j;
2217  for (local_ordinal_type col=0;col<num_vectors;++col)
2218  for (local_ordinal_type i=0;i<blocksize;++i)
2219  for (local_ordinal_type v=0;v<npacks;++v)
2220  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
2221  }
2222  }
2223 
2224  KOKKOS_INLINE_FUNCTION
2225  void
2226  operator() (const member_type &member) const {
2227  const local_ordinal_type packidx = member.league_rank();
2228  const local_ordinal_type partidx_begin = packptr(packidx);
2229  const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
2230  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
2231  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
2232  const local_ordinal_type partidx = partidx_begin + v;
2233  const local_ordinal_type ri0 = part2rowidx0(partidx);
2234  const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
2235 
2236  if (nrows == 1) {
2237  const local_ordinal_type pri = pri0;
2238  for (local_ordinal_type col=0;col<num_vectors;++col) {
2239  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
2240  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
2241  });
2242  }
2243  } else {
2244  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
2245  const local_ordinal_type pri = pri0 + j;
2246  for (local_ordinal_type col=0;col<num_vectors;++col)
2247  for (local_ordinal_type i=0;i<blocksize;++i)
2248  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2249  });
2250  }
2251  });
2252  }
2253 
2254  void run(const impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
2255  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2256  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter");
2257 
2258  scalar_multivector = scalar_multivector_;
2259  if (is_cuda<execution_space>::value) {
2260 #if defined(KOKKOS_ENABLE_CUDA)
2261  const local_ordinal_type vl = vector_length;
2262  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
2263  Kokkos::parallel_for
2264  ("MultiVectorConverter::TeamPolicy", policy, *this);
2265 #endif
2266  } else {
2267 #if defined(__CUDA_ARCH__)
2268  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
2269 #else
2270  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
2271  Kokkos::parallel_for
2272  ("MultiVectorConverter::RangePolicy", policy, *this);
2273 #endif
2274  }
2275  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2276  }
2277  };
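// Packed-multivector layout sketch for the converter above: with pri the packed
// row index, i the point row within a block, col the vector, and v the SIMD lane
// (one part per lane),
//
//   packed_multivector(pri, i, col)[v]
//     == scalar_multivector(blocksize*lclrow(ri0_v + j) + i, col)
//
// where ri0_v is the first row index of the part in lane v and pri = pri0 + j.
// The reverse copy back into the flat Tpetra layout is done by
// SolveTridiags::copyToFlatMultiVector below.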
2278 
2282  template<typename ArgActiveExecutionMemorySpace>
2283  struct SolveTridiagsDefaultModeAndAlgo;
2284 
2285  template<>
2286  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2287  typedef KB::Mode::Serial mode_type;
2288  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2289 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2290  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
2291 #else
2292  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
2293 #endif
2294  static int recommended_team_size(const int /* blksize */,
2295  const int /* vector_length */,
2296  const int /* internal_vector_length */) {
2297  return 1;
2298  }
2299  };
2300 
2301 #if defined(KOKKOS_ENABLE_CUDA)
2302  static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
2303  const int vector_length,
2304  const int internal_vector_length) {
2305  const int vector_size = vector_length/internal_vector_length;
2306  int total_team_size(0);
2307  if (blksize <= 5) total_team_size = 32;
2308  else if (blksize <= 9) total_team_size = 32; // 64
2309  else if (blksize <= 12) total_team_size = 96;
2310  else if (blksize <= 16) total_team_size = 128;
2311  else if (blksize <= 20) total_team_size = 160;
2312  else total_team_size = 160;
2313  return total_team_size/vector_size;
2314  }
2315 
2316  template<>
2317  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2318  typedef KB::Mode::Team mode_type;
2319  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2320  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2321  static int recommended_team_size(const int blksize,
2322  const int vector_length,
2323  const int internal_vector_length) {
2324  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2325  }
2326  };
2327  template<>
2328  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2329  typedef KB::Mode::Team mode_type;
2330  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2331  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2332  static int recommended_team_size(const int blksize,
2333  const int vector_length,
2334  const int internal_vector_length) {
2335  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2336  }
2337  };
2338 #endif
2339 
2340  template<typename MatrixType>
2341  struct SolveTridiags {
2342  public:
2343  using impl_type = ImplType<MatrixType>;
2344  using execution_space = typename impl_type::execution_space;
2345 
2346  using local_ordinal_type = typename impl_type::local_ordinal_type;
2347  using size_type = typename impl_type::size_type;
2348  using impl_scalar_type = typename impl_type::impl_scalar_type;
2349  using magnitude_type = typename impl_type::magnitude_type;
2350  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2351  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2353  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2354  using size_type_1d_view = typename impl_type::size_type_1d_view;
2356  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2357  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2358  //using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2359 
2360  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2361 
2362  using internal_vector_type =typename impl_type::internal_vector_type;
2363  static constexpr int vector_length = impl_type::vector_length;
2364  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2365 
2367  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
2368  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2369 
2371  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2372  using member_type = typename team_policy_type::member_type;
2373 
2374  private:
2375  // part interface
2376  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2377  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2378  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2379  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2380 
2381  // block tridiags
2382  const ConstUnmanaged<size_type_1d_view> pack_td_ptr;
2383 
2384  // block tridiags values
2385  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
2386  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
2387 
2388  const local_ordinal_type vector_loop_size;
2389 
2390  // copy to multivectors : damping factor and Y_scalar_multivector
2391  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
2392 #if defined(__CUDA_ARCH__)
2393  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2394 #else
2395  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2396 #endif
2397  const impl_scalar_type df;
2398  const bool compute_diff;
2399 
2400  public:
2401  SolveTridiags(const PartInterface<MatrixType> &interf,
2402  const BlockTridiags<MatrixType> &btdm,
2403  const vector_type_3d_view &pmv,
2404  const impl_scalar_type damping_factor,
2405  const bool is_norm_manager_active)
2406  :
2407  // interface
2408  partptr(interf.partptr),
2409  packptr(interf.packptr),
2410  part2packrowidx0(interf.part2packrowidx0),
2411  lclrow(interf.lclrow),
2412  // block tridiags and multivector
2413  pack_td_ptr(btdm.pack_td_ptr),
2414  D_internal_vector_values((internal_vector_type*)btdm.values.data(),
2415  btdm.values.extent(0),
2416  btdm.values.extent(1),
2417  btdm.values.extent(2),
2418  vector_length/internal_vector_length),
2419  X_internal_vector_values((internal_vector_type*)pmv.data(),
2420  pmv.extent(0),
2421  pmv.extent(1),
2422  pmv.extent(2),
2423  vector_length/internal_vector_length),
2424  vector_loop_size(vector_length/internal_vector_length),
2425  Y_scalar_multivector(),
2426  Z_scalar_vector(),
2427  df(damping_factor),
2428  compute_diff(is_norm_manager_active)
2429  {}
2430 
2431  public:
2432 
2434  KOKKOS_INLINE_FUNCTION
2435  void
2436  copyToFlatMultiVector(const member_type &member,
2437  const local_ordinal_type partidxbeg, // partidx for v = 0
2438  const local_ordinal_type npacks,
2439  const local_ordinal_type pri0,
2440  const local_ordinal_type v, // index with a loop of vector_loop_size
2441  const local_ordinal_type blocksize,
2442  const local_ordinal_type num_vectors) const {
2443  const local_ordinal_type vbeg = v*internal_vector_length;
2444  if (vbeg < npacks) {
2445  local_ordinal_type ri0_vals[internal_vector_length] = {};
2446  local_ordinal_type nrows_vals[internal_vector_length] = {};
2447  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2448  const local_ordinal_type partidx = partidxbeg+vv;
2449  ri0_vals[vi] = partptr(partidx);
2450  nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
2451  }
2452 
2453  impl_scalar_type z_partial_sum(0);
2454  if (nrows_vals[0] == 1) {
2455  const local_ordinal_type j=0, pri=pri0;
2456  {
2457  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2458  const local_ordinal_type ri0 = ri0_vals[vi];
2459  const local_ordinal_type nrows = nrows_vals[vi];
2460  if (j < nrows) {
2461  Kokkos::parallel_for
2462  (Kokkos::TeamThreadRange(member, blocksize),
2463  [&](const local_ordinal_type &i) {
2464  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2465  for (local_ordinal_type col=0;col<num_vectors;++col) {
2466  impl_scalar_type &y = Y_scalar_multivector(row,col);
2467  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2468  y += df*yd;
2469 
2470  {//if (compute_diff) {
2471  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2472  z_partial_sum += yd_abs*yd_abs;
2473  }
2474  }
2475  });
2476  }
2477  }
2478  }
2479  } else {
2480  Kokkos::parallel_for
2481  (Kokkos::TeamThreadRange(member, nrows_vals[0]),
2482  [&](const local_ordinal_type &j) {
2483  const local_ordinal_type pri = pri0 + j;
2484  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2485  const local_ordinal_type ri0 = ri0_vals[vi];
2486  const local_ordinal_type nrows = nrows_vals[vi];
2487  if (j < nrows) {
2488  for (local_ordinal_type col=0;col<num_vectors;++col) {
2489  for (local_ordinal_type i=0;i<blocksize;++i) {
2490  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2491  impl_scalar_type &y = Y_scalar_multivector(row,col);
2492  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2493  y += df*yd;
2494 
2495  {//if (compute_diff) {
2496  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2497  z_partial_sum += yd_abs*yd_abs;
2498  }
2499  }
2500  }
2501  }
2502  }
2503  });
2504  }
2505  //if (compute_diff)
2506  Z_scalar_vector(member.league_rank()) += z_partial_sum;
2507  }
2508  }
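// In update form, copyToFlatMultiVector applies to each owned point row
//
//   y <- y + df*(x - y)  ==  (1 - df)*y + df*x     // damped update
//   z  += |x - y|^2                                // per-team squared difference
//
// where x is the packed solve result and df the damping factor; the accumulated
// z feeds the norm-based convergence check when the norm manager is active.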
2509 
2513  template<typename WWViewType>
2514  KOKKOS_INLINE_FUNCTION
2515  void
2516  solveSingleVector(const member_type &member,
2517  const local_ordinal_type &blocksize,
2518  const local_ordinal_type &i0,
2519  const local_ordinal_type &r0,
2520  const local_ordinal_type &nrows,
2521  const local_ordinal_type &v,
2522  const WWViewType &WW) const {
2523  typedef SolveTridiagsDefaultModeAndAlgo
2524  <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2525  typedef default_mode_and_algo_type::mode_type default_mode_type;
2526  typedef default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2527 
2528  // base pointers
2529  auto A = D_internal_vector_values.data();
2530  auto X = X_internal_vector_values.data();
2531 
2532  // constant
2533  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2534  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2535  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2536 
2537  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2538  const local_ordinal_type astep = D_internal_vector_values.stride_0();
2539  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2540  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2541  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2542  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2543 
2544  // for multiple rhs
2545  //const local_ordinal_type xs0 = num_vectors*vector_length; //X_scalar_values.stride_1();
2546  //const local_ordinal_type xs1 = vector_length; //X_scalar_values.stride_2();
2547 
2548  // move to starting point
2549  A += i0*astep + v;
2550  X += r0*xstep + v;
2551 
2552  //for (local_ordinal_type col=0;col<num_vectors;++col)
2553  if (nrows > 1) {
2554  // solve Lx = x
2555  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2556  (default_mode_type,default_algo_type,
2557  member,
2558  KB::Diag::Unit,
2559  blocksize,blocksize,
2560  one,
2561  A, as0, as1,
2562  X, xs0);
2563 
2564  for (local_ordinal_type tr=1;tr<nrows;++tr) {
2565  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2566  (default_mode_type,default_algo_type,
2567  member,
2568  blocksize, blocksize,
2569  -one,
2570  A+2*astep, as0, as1,
2571  X, xs0,
2572  one,
2573  X+1*xstep, xs0);
2574 
2575  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2576  (default_mode_type,default_algo_type,
2577  member,
2578  KB::Diag::Unit,
2579  blocksize,blocksize,
2580  one,
2581  A+3*astep, as0, as1,
2582  X+1*xstep, xs0);
2583 
2584  A += 3*astep;
2585  X += 1*xstep;
2586  }
2587 
2588  // solve Ux = x
2589  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2590  (default_mode_type,default_algo_type,
2591  member,
2592  KB::Diag::NonUnit,
2593  blocksize, blocksize,
2594  one,
2595  A, as0, as1,
2596  X, xs0);
2597 
2598  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2599  A -= 3*astep;
2600  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2601  (default_mode_type,default_algo_type,
2602  member,
2603  blocksize, blocksize,
2604  -one,
2605  A+1*astep, as0, as1,
2606  X, xs0,
2607  one,
2608  X-1*xstep, xs0);
2609 
2610  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2611  (default_mode_type,default_algo_type,
2612  member,
2613  KB::Diag::NonUnit,
2614  blocksize, blocksize,
2615  one,
2616  A, as0, as1,
2617  X-1*xstep,xs0);
2618 
2619  X -= 1*xstep;
2620  }
2621  // for multiple rhs
2622  //X += xs1;
2623  } else {
2624  const local_ordinal_type ws0 = WW.stride_0();
2625  auto W = WW.data() + v;
2626  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2627  (default_mode_type,
2628  member, blocksize, X, xs0, W, ws0);
2629  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2630  (default_mode_type,default_algo_type,
2631  member,
2632  blocksize, blocksize,
2633  one,
2634  A, as0, as1,
2635  W, xs0,
2636  zero,
2637  X, xs0);
2638  }
2639  }
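// The pointer walk above performs block forward/backward substitution on the
// factored tridiag (storage order D_0 U_0 L_1 D_1 U_1 ...), one right-hand side
// at a time:
//
//   forward :  x_0 <- L_0^{-1} x_0
//              x_r <- L_r^{-1}(x_r - L~_r x_{r-1}),   r = 1 .. nrows-1
//   backward:  x_{nrows-1} <- U_{nrows-1}^{-1} x_{nrows-1}
//              x_r <- U_r^{-1}(x_r - U~_r x_{r+1}),   r = nrows-2 .. 0
//
// with L~_r, U~_r the off-diagonal blocks prepared by factorize(). The nrows == 1
// branch multiplies by the explicitly inverted diagonal block instead.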
2640 
2641  template<typename WWViewType>
2642  KOKKOS_INLINE_FUNCTION
2643  void
2644  solveMultiVector(const member_type &member,
2645  const local_ordinal_type &/* blocksize */,
2646  const local_ordinal_type &i0,
2647  const local_ordinal_type &r0,
2648  const local_ordinal_type &nrows,
2649  const local_ordinal_type &v,
2650  const WWViewType &WW) const {
2651  typedef SolveTridiagsDefaultModeAndAlgo
2652  <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2653  typedef default_mode_and_algo_type::mode_type default_mode_type;
2654  typedef default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2655 
2656  // constant
2657  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2658  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2659 
2660  // subview pattern
2661  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2662  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2663  auto X2 = X1;
2664 
2665  local_ordinal_type i = i0, r = r0;
2666 
2667 
2668  if (nrows > 1) {
2669  // solve Lx = x
2670  KB::Trsm<member_type,
2671  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2672  default_mode_type,default_algo_type>
2673  ::invoke(member, one, A, X1);
2674  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2675  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2676  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2677  KB::Gemm<member_type,
2678  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2679  default_mode_type,default_algo_type>
2680  ::invoke(member, -one, A, X1, one, X2);
2681  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2682  KB::Trsm<member_type,
2683  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2684  default_mode_type,default_algo_type>
2685  ::invoke(member, one, A, X2);
2686  X1.assign_data( X2.data() );
2687  }
2688 
2689  // solve Ux = x
2690  KB::Trsm<member_type,
2691  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2692  default_mode_type,default_algo_type>
2693  ::invoke(member, one, A, X1);
2694  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2695  i -= 3;
2696  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2697  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2698  KB::Gemm<member_type,
2699  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2700  default_mode_type,default_algo_type>
2701  ::invoke(member, -one, A, X1, one, X2);
2702  A.assign_data( &D_internal_vector_values(i,0,0,v) );
2703  KB::Trsm<member_type,
2704  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2705  default_mode_type,default_algo_type>
2706  ::invoke(member, one, A, X2);
2707  X1.assign_data( X2.data() );
2708  }
2709  } else {
2710  // matrix is already inverted
2711  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2712  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2713  ::invoke(member, X1, W);
2714  KB::Gemm<member_type,
2715  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2716  default_mode_type,default_algo_type>
2717  ::invoke(member, one, A, W, zero, X1);
2718  }
2719  }
2720 
2721  template<int B> struct SingleVectorTag {};
2722  template<int B> struct MultiVectorTag {};
2723 
2724  template<int B>
2725  KOKKOS_INLINE_FUNCTION
2726  void
2727  operator() (const SingleVectorTag<B> &, const member_type &member) const {
2728  const local_ordinal_type packidx = member.league_rank();
2729  const local_ordinal_type partidx = packptr(packidx);
2730  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2731  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2732  const local_ordinal_type i0 = pack_td_ptr(partidx);
2733  const local_ordinal_type r0 = part2packrowidx0(partidx);
2734  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2735  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2736  const local_ordinal_type num_vectors = 1;
2737  internal_vector_scratch_type_3d_view
2738  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
2739  Kokkos::single(Kokkos::PerTeam(member), [&]() {
2740  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2741  });
2742  Kokkos::parallel_for
2743  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2744  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
2745  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2746  });
2747  }
2748 
2749  template<int B>
2750  KOKKOS_INLINE_FUNCTION
2751  void
2752  operator() (const MultiVectorTag<B> &, const member_type &member) const {
2753  const local_ordinal_type packidx = member.league_rank();
2754  const local_ordinal_type partidx = packptr(packidx);
2755  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2756  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2757  const local_ordinal_type i0 = pack_td_ptr(partidx);
2758  const local_ordinal_type r0 = part2packrowidx0(partidx);
2759  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2760  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2761  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2762 
2763  internal_vector_scratch_type_3d_view
2764  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
2765  Kokkos::single(Kokkos::PerTeam(member), [&]() {
2766  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2767  });
2768  Kokkos::parallel_for
2769  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2770  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
2771  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2772  });
2773  }
2774 
2775  void run(const impl_scalar_type_2d_view_tpetra &Y,
2776  const impl_scalar_type_1d_view &Z) {
2777  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2778  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SolveTridiags");
2779 
2781  this->Y_scalar_multivector = Y;
2782  this->Z_scalar_vector = Z;
2783 
2784  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2785  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
2786 
2787  const local_ordinal_type team_size =
2788  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2789  recommended_team_size(blocksize, vector_length, internal_vector_length);
2790  const int per_team_scratch = internal_vector_scratch_type_3d_view
2791  ::shmem_size(blocksize, num_vectors, vector_loop_size);
2792 
2793 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2794 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2795  if (num_vectors == 1) { \
2796  const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2797  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2798  Kokkos::parallel_for \
2799  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2800  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2801  } else { \
2802  const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2803  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2804  Kokkos::parallel_for \
2805  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
2806  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2807  } break
2808 #else
2809 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2810  if (num_vectors == 1) { \
2811  Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2812  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2813  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
2814  Kokkos::parallel_for \
2815  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2816  policy, *this); \
2817  } else { \
2818  Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2819  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2820  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
2821  Kokkos::parallel_for \
2822  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
2823  policy, *this); \
2824  } break
2825 #endif
2826  switch (blocksize) {
2827  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
2828  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
2829  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
2830  case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
2831  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
2832  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
2833  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
2834  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
2835  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
2836  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
2837  }
2838 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
2839 
2840  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2841  }
2842  };
2843 
2847  static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
2848  const int team_size) {
2849  int total_team_size(0);
2850  if (blksize <= 5) total_team_size = 32;
2851  else if (blksize <= 9) total_team_size = 32; // 64
2852  else if (blksize <= 12) total_team_size = 96;
2853  else if (blksize <= 16) total_team_size = 128;
2854  else if (blksize <= 20) total_team_size = 160;
2855  else total_team_size = 160;
2856  return total_team_size/team_size;
2857  }
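  // For example, with blksize = 10 and team_size = 8 this yields total_team_size = 96
  // and a recommended vector size of 96/8 = 12; with blksize = 4 and team_size = 8 it
  // yields 32/8 = 4.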
2858 
2859  template<typename MatrixType>
2860  struct ComputeResidualVector {
2861  public:
2862  using impl_type = ImplType<MatrixType>;
2863  using node_device_type = typename impl_type::node_device_type;
2864  using execution_space = typename impl_type::execution_space;
2865  using memory_space = typename impl_type::memory_space;
2866 
2867  using local_ordinal_type = typename impl_type::local_ordinal_type;
2868  using size_type = typename impl_type::size_type;
2869  using impl_scalar_type = typename impl_type::impl_scalar_type;
2870  using magnitude_type = typename impl_type::magnitude_type;
2871  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2872  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2874  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2875  using size_type_1d_view = typename impl_type::size_type_1d_view;
2876  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2877  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
2878  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
2879  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2880  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2881  static constexpr int vector_length = impl_type::vector_length;
2882 
2884  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
2885 
2886  // enum for max blocksize
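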
2887  enum : int { max_blocksize = 32 };
2888 
2889  private:
2890  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
2891  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
2892  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
2893  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
2894  Unmanaged<vector_type_3d_view> y_packed;
2895  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
2896 
2897  // AmD information
2898  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
2899  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
2900  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
2901 
2902  // block crs graph information
2903  // for cuda (kokkos crs graph uses a different size_type from size_t)
2904  using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,node_device_type>::size_type;
2905  const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,node_device_type> > A_rowptr;
2906  const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
2907 
2908  // blocksize
2909  const local_ordinal_type blocksize_requested;
2910 
2911  // part interface
2912  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2913  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
2914  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
2915  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2916  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2917  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
2918  const bool is_dm2cm_active;
2919 
2920  public:
2921  template<typename LocalCrsGraphType>
2922  ComputeResidualVector(const AmD<MatrixType> &amd,
2923  const LocalCrsGraphType &graph,
2924  const local_ordinal_type &blocksize_requested_,
2925  const PartInterface<MatrixType> &interf,
2926  const local_ordinal_type_1d_view &dm2cm_)
2927  : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
2928  colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
2929  tpetra_values(amd.tpetra_values),
2930  A_rowptr(graph.row_map),
2931  A_colind(graph.entries),
2932  blocksize_requested(blocksize_requested_),
2933  part2packrowidx0(interf.part2packrowidx0),
2934  part2rowidx0(interf.part2rowidx0),
2935  rowidx2part(interf.rowidx2part),
2936  partptr(interf.partptr),
2937  lclrow(interf.lclrow),
2938  dm2cm(dm2cm_),
2939  is_dm2cm_active(dm2cm_.span() > 0)
2940  {}
2941 
2942  inline
2943  void
2944  SerialGemv(const local_ordinal_type &blocksize,
2945  const impl_scalar_type * const KOKKOS_RESTRICT AA,
2946  const impl_scalar_type * const KOKKOS_RESTRICT xx,
2947  /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
2948  for (local_ordinal_type k0=0;k0<blocksize;++k0) {
2949  impl_scalar_type val = 0;
2950  const local_ordinal_type offset = k0*blocksize;
2951 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
2952 # pragma ivdep
2953 #endif
2954 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
2955 # pragma unroll
2956 #endif
2957  for (local_ordinal_type k1=0;k1<blocksize;++k1)
2958  val += AA[offset+k1]*xx[k1];
2959  yy[k0] -= val;
2960  }
2961  }
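  // SerialGemv computes yy -= AA*xx for one blocksize x blocksize block stored in
  // row-major order (AA[k0*blocksize + k1] is entry (k0,k1)); it serves the
  // RangePolicy (non-team) residual kernels below.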
2962 
2963  template<typename bbViewType, typename yyViewType>
2964  KOKKOS_INLINE_FUNCTION
2965  void
2966  VectorCopy(const member_type &member,
2967  const local_ordinal_type &blocksize,
2968  const bbViewType &bb,
2969  const yyViewType &yy) const {
2970  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
2971  yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
2972  });
2973  }
2974 
2975  template<typename AAViewType, typename xxViewType, typename yyViewType>
2976  KOKKOS_INLINE_FUNCTION
2977  void
2978  TeamVectorGemv(const member_type &member,
2979  const local_ordinal_type &blocksize,
2980  const AAViewType &AA,
2981  const xxViewType &xx,
2982  const yyViewType &yy) const {
2983  Kokkos::parallel_for
2984  (Kokkos::TeamThreadRange(member, blocksize),
2985  [&](const local_ordinal_type &k0) {
2986  impl_scalar_type val = 0;
2987  Kokkos::parallel_for
2988  (Kokkos::ThreadVectorRange(member, blocksize),
2989  [&](const local_ordinal_type &k1) {
2990  val += AA(k0,k1)*xx(k1);
2991  });
2992  Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
2993  });
2994  }
2995 
2996  template<typename AAViewType, typename xxViewType, typename yyViewType>
2997  KOKKOS_INLINE_FUNCTION
2998  void
2999  VectorGemv(const member_type &member,
3000  const local_ordinal_type &blocksize,
3001  const AAViewType &AA,
3002  const xxViewType &xx,
3003  const yyViewType &yy) const {
3004  Kokkos::parallel_for
3005  (Kokkos::ThreadVectorRange(member, blocksize),
3006  [&](const local_ordinal_type &k0) {
3007  impl_scalar_type val(0);
3008  for (local_ordinal_type k1=0;k1<blocksize;++k1) {
3009  val += AA(k0,k1)*xx(k1);
3010  }
3011  Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3012  });
3013  }
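  // Both team Gemv helpers accumulate yy -= AA*xx using atomic updates on yy.
  // TeamVectorGemv parallelizes rows over TeamThreadRange and columns over
  // ThreadVectorRange, while VectorGemv parallelizes rows over ThreadVectorRange and
  // loops over columns serially; the team kernels below call VectorGemv.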
3014 
3015  // template<typename AAViewType, typename xxViewType, typename yyViewType>
3016  // KOKKOS_INLINE_FUNCTION
3017  // void
3018  // VectorGemv(const member_type &member,
3019  // const local_ordinal_type &blocksize,
3020  // const AAViewType &AA,
3021  // const xxViewType &xx,
3022  // const yyViewType &yy) const {
3023  // for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3024  // impl_scalar_type val = 0;
3025  // Kokkos::parallel_for
3026  // (Kokkos::ThreadVectorRange(member, blocksize),
3027  // [&](const local_ordinal_type &k1) {
3028  // val += AA(k0,k1)*xx(k1);
3029  // });
3030  // Kokkos::atomic_fetch_add(&yy(k0), -val);
3031  // }
3032  // }
3033 
3034  struct SeqTag {};
3035 
3036  inline
3037  void
3038  operator() (const SeqTag &, const local_ordinal_type& i) const {
3039  const local_ordinal_type blocksize = blocksize_requested;
3040  const local_ordinal_type blocksize_square = blocksize*blocksize;
3041 
3042  // constants
3043  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3044  const local_ordinal_type num_vectors = y.extent(1);
3045  const local_ordinal_type row = i*blocksize;
3046  for (local_ordinal_type col=0;col<num_vectors;++col) {
3047  // y := b
3048  impl_scalar_type *yy = &y(row, col);
3049  const impl_scalar_type * const bb = &b(row, col);
3050  memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
3051 
3052  // y -= Rx
3053  const size_type A_k0 = A_rowptr[i];
3054  for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
3055  const size_type j = A_k0 + colindsub[k];
3056  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3057  const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
3058  SerialGemv(blocksize,AA,xx,yy);
3059  }
3060  }
3061  }
3062 
3063  KOKKOS_INLINE_FUNCTION
3064  void
3065  operator() (const SeqTag &, const member_type &member) const {
3066 
3067  // constants
3068  const local_ordinal_type blocksize = blocksize_requested;
3069  const local_ordinal_type blocksize_square = blocksize*blocksize;
3070 
3071  const local_ordinal_type lr = member.league_rank();
3072  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3073  const local_ordinal_type num_vectors = y.extent(1);
3074 
3075  // subview pattern
3076  auto bb = Kokkos::subview(b, block_range, 0);
3077  auto xx = bb;
3078  auto yy = Kokkos::subview(y, block_range, 0);
3079  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3080 
3081  const local_ordinal_type row = lr*blocksize;
3082  for (local_ordinal_type col=0;col<num_vectors;++col) {
3083  // y := b
3084  yy.assign_data(&y(row, col));
3085  bb.assign_data(&b(row, col));
3086  if (member.team_rank() == 0)
3087  VectorCopy(member, blocksize, bb, yy);
3088  member.team_barrier();
3089 
3090  // y -= Rx
3091  const size_type A_k0 = A_rowptr[lr];
3092  Kokkos::parallel_for
3093  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3094  [&](const local_ordinal_type &k) {
3095  const size_type j = A_k0 + colindsub[k];
3096  A_block.assign_data( &tpetra_values(j*blocksize_square) );
3097  xx.assign_data( &x(A_colind[j]*blocksize, col) );
3098  VectorGemv(member, blocksize, A_block, xx, yy);
3099  });
3100  }
3101  }
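  // SeqTag is served by two overloads: the inline RangePolicy form (one block row per
  // index, memcpy + SerialGemv) and this TeamPolicy form, which performs the same
  // y := b; y -= R*x update with VectorCopy and VectorGemv so that a team can cooperate
  // on a block row; run() below selects the TeamPolicy form on the CUDA path.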
3102 
3103  template<int B>
3104  struct AsyncTag {};
3105 
3106  template<int B>
3107  inline
3108  void
3109  operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
3110  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3111  const local_ordinal_type blocksize_square = blocksize*blocksize;
3112 
3113  // constants
3114  const local_ordinal_type partidx = rowidx2part(rowidx);
3115  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3116  const local_ordinal_type v = partidx % vector_length;
3117 
3118  const local_ordinal_type num_vectors = y_packed.extent(2);
3119  const local_ordinal_type num_local_rows = lclrow.extent(0);
3120 
3121  // temporary buffer for y flat
3122  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
3123 
3124  const local_ordinal_type lr = lclrow(rowidx);
3125  const local_ordinal_type row = lr*blocksize;
3126  for (local_ordinal_type col=0;col<num_vectors;++col) {
3127  // y := b
3128  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3129 
3130  // y -= Rx
3131  const size_type A_k0 = A_rowptr[lr];
3132  for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
3133  const size_type j = A_k0 + colindsub[k];
3134  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3135  const local_ordinal_type A_colind_at_j = A_colind[j];
3136  if (A_colind_at_j < num_local_rows) {
3137  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3138  const impl_scalar_type * const xx = &x(loc*blocksize, col);
3139  SerialGemv(blocksize, AA,xx,yy);
3140  } else {
3141  const auto loc = A_colind_at_j - num_local_rows;
3142  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3143  SerialGemv(blocksize, AA,xx_remote,yy);
3144  }
3145  }
3146  // move yy to y_packed
3147  for (local_ordinal_type k=0;k<blocksize;++k)
3148  y_packed(pri, k, col)[v] = yy[k];
3149  }
3150  }
3151 
3152  template<int B>
3153  KOKKOS_INLINE_FUNCTION
3154  void
3155  operator() (const AsyncTag<B> &, const member_type &member) const {
3156  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3157  const local_ordinal_type blocksize_square = blocksize*blocksize;
3158 
3159  // constants
3160  const local_ordinal_type rowidx = member.league_rank();
3161  const local_ordinal_type partidx = rowidx2part(rowidx);
3162  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3163  const local_ordinal_type v = partidx % vector_length;
3164 
3165  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3166  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3167  const local_ordinal_type num_local_rows = lclrow.extent(0);
3168 
3169  // subview pattern
3170  auto bb = Kokkos::subview(b, block_range, 0);
3171  auto xx = Kokkos::subview(x, block_range, 0);
3172  auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
3173  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3174  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3175 
3176  const local_ordinal_type lr = lclrow(rowidx);
3177  const local_ordinal_type row = lr*blocksize;
3178  for (local_ordinal_type col=0;col<num_vectors;++col) {
3179  // y := b
3180  bb.assign_data(&b(row, col));
3181  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3182  if (member.team_rank() == 0)
3183  VectorCopy(member, blocksize, bb, yy);
3184  member.team_barrier();
3185 
3186  // y -= Rx
3187  const size_type A_k0 = A_rowptr[lr];
3188  Kokkos::parallel_for
3189  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3190  [&](const local_ordinal_type &k) {
3191  const size_type j = A_k0 + colindsub[k];
3192  A_block.assign_data( &tpetra_values(j*blocksize_square) );
3193 
3194  const local_ordinal_type A_colind_at_j = A_colind[j];
3195  if (A_colind_at_j < num_local_rows) {
3196  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3197  xx.assign_data( &x(loc*blocksize, col) );
3198  VectorGemv(member, blocksize, A_block, xx, yy);
3199  } else {
3200  const auto loc = A_colind_at_j - num_local_rows;
3201  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3202  VectorGemv(member, blocksize, A_block, xx_remote, yy);
3203  }
3204  });
3205  }
3206  }
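  // AsyncTag computes y_packed := b - R*(x, x_remote) one block row at a time: column
  // indices below lclrow.extent(0) address owned rows of x (permuted through dm2cm when
  // active), larger indices address x_remote. Results land at pack row index pri in SIMD
  // lane v = partidx % vector_length.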
3207 
3208  template <int P, int B> struct OverlapTag {};
3209 
3210  template<int P, int B>
3211  inline
3212  void
3213  operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
3214  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3215  const local_ordinal_type blocksize_square = blocksize*blocksize;
3216 
3217  // constants
3218  const local_ordinal_type partidx = rowidx2part(rowidx);
3219  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3220  const local_ordinal_type v = partidx % vector_length;
3221 
3222  const local_ordinal_type num_vectors = y_packed.extent(2);
3223  const local_ordinal_type num_local_rows = lclrow.extent(0);
3224 
3225  // temporary buffer for y flat
3226  impl_scalar_type yy[max_blocksize] = {};
3227 
3228  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3229  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3230 
3231  const local_ordinal_type lr = lclrow(rowidx);
3232  const local_ordinal_type row = lr*blocksize;
3233  for (local_ordinal_type col=0;col<num_vectors;++col) {
3234  if (P == 0) {
3235  // y := b
3236  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3237  } else {
3238  // y (temporary) := 0
3239  memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
3240  }
3241 
3242  // y -= Rx
3243  const size_type A_k0 = A_rowptr[lr];
3244  for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
3245  const size_type j = A_k0 + colindsub_used[k];
3246  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3247  const local_ordinal_type A_colind_at_j = A_colind[j];
3248  if (P == 0) {
3249  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3250  const impl_scalar_type * const xx = &x(loc*blocksize, col);
3251  SerialGemv(blocksize,AA,xx,yy);
3252  } else {
3253  const auto loc = A_colind_at_j - num_local_rows;
3254  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3255  SerialGemv(blocksize,AA,xx_remote,yy);
3256  }
3257  }
3258  // move yy to y_packed
3259  if (P == 0) {
3260  for (local_ordinal_type k=0;k<blocksize;++k)
3261  y_packed(pri, k, col)[v] = yy[k];
3262  } else {
3263  for (local_ordinal_type k=0;k<blocksize;++k)
3264  y_packed(pri, k, col)[v] += yy[k];
3265  }
3266  }
3267  }
3268 
3269  template<int P, int B>
3270  KOKKOS_INLINE_FUNCTION
3271  void
3272  operator() (const OverlapTag<P,B> &, const member_type &member) const {
3273  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3274  const local_ordinal_type blocksize_square = blocksize*blocksize;
3275 
3276  // constants
3277  const local_ordinal_type rowidx = member.league_rank();
3278  const local_ordinal_type partidx = rowidx2part(rowidx);
3279  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3280  const local_ordinal_type v = partidx % vector_length;
3281 
3282  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3283  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3284  const local_ordinal_type num_local_rows = lclrow.extent(0);
3285 
3286  // subview pattern
3287  auto bb = Kokkos::subview(b, block_range, 0);
3288  auto xx = bb; //Kokkos::subview(x, block_range, 0);
3289  auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0);
3290  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3291  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3292  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3293  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3294 
3295  const local_ordinal_type lr = lclrow(rowidx);
3296  const local_ordinal_type row = lr*blocksize;
3297  for (local_ordinal_type col=0;col<num_vectors;++col) {
3298  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3299  if (P == 0) {
3300  // y := b
3301  bb.assign_data(&b(row, col));
3302  if (member.team_rank() == 0)
3303  VectorCopy(member, blocksize, bb, yy);
3304  member.team_barrier();
3305  }
3306 
3307  // y -= Rx
3308  const size_type A_k0 = A_rowptr[lr];
3309  Kokkos::parallel_for
3310  (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
3311  [&](const local_ordinal_type &k) {
3312  const size_type j = A_k0 + colindsub_used[k];
3313  A_block.assign_data( &tpetra_values(j*blocksize_square) );
3314 
3315  const local_ordinal_type A_colind_at_j = A_colind[j];
3316  if (P == 0) {
3317  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3318  xx.assign_data( &x(loc*blocksize, col) );
3319  VectorGemv(member, blocksize, A_block, xx, yy);
3320  } else {
3321  const auto loc = A_colind_at_j - num_local_rows;
3322  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3323  VectorGemv(member, blocksize, A_block, xx_remote, yy);
3324  }
3325  });
3326  }
3327  }
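  // OverlapTag splits the same residual into two passes selected by P: P == 0 walks the
  // owned entries (rowptr/colindsub) and initializes y := b first, while P == 1 walks the
  // remote entries (rowptr_remote/colindsub_remote) and accumulates into the already
  // initialized y_packed values, so the remote contribution can be applied after
  // communication finishes when computation and communication overlap.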
3328 
3329  // y = b - Rx; seq method
3330  template<typename MultiVectorLocalViewTypeY,
3331  typename MultiVectorLocalViewTypeB,
3332  typename MultiVectorLocalViewTypeX>
3333  void run(const MultiVectorLocalViewTypeY &y_,
3334  const MultiVectorLocalViewTypeB &b_,
3335  const MultiVectorLocalViewTypeX &x_) {
3336  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3337  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");
3338 
3339  y = y_; b = b_; x = x_;
3340  if (is_cuda<execution_space>::value) {
3341 #if defined(KOKKOS_ENABLE_CUDA)
3342  const local_ordinal_type blocksize = blocksize_requested;
3343  const local_ordinal_type team_size = 8;
3344  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3345  const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3346  Kokkos::parallel_for
3347  ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3348 #endif
3349  } else {
3350 #if defined(__CUDA_ARCH__)
3351  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
3352 #else
3353  const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
3354  Kokkos::parallel_for
3355  ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
3356 #endif
3357  }
3358  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3359  }
3360 
3361  // y = b - R (x , x_remote)
3362  template<typename MultiVectorLocalViewTypeB,
3363  typename MultiVectorLocalViewTypeX,
3364  typename MultiVectorLocalViewTypeX_Remote>
3365  void run(const vector_type_3d_view &y_packed_,
3366  const MultiVectorLocalViewTypeB &b_,
3367  const MultiVectorLocalViewTypeX &x_,
3368  const MultiVectorLocalViewTypeX_Remote &x_remote_) {
3369  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3370  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");
3371 
3372  b = b_; x = x_; x_remote = x_remote_;
3373  if (is_cuda<execution_space>::value) {
3374 #if defined(KOKKOS_ENABLE_CUDA)
3375  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3376  y_packed_.extent(0),
3377  y_packed_.extent(1),
3378  y_packed_.extent(2),
3379  vector_length);
3380 #endif
3381  } else {
3382  y_packed = y_packed_;
3383  }
3384 
3385  if (is_cuda<execution_space>::value) {
3386 #if defined(KOKKOS_ENABLE_CUDA)
3387  const local_ordinal_type blocksize = blocksize_requested;
3388  const local_ordinal_type team_size = 8;
3389  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3390  // local_ordinal_type vl_power_of_two = 1;
3391  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3392  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3393  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3394 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3395  const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3396  policy(rowidx2part.extent(0), team_size, vector_size); \
3397  Kokkos::parallel_for \
3398  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3399  policy, *this); } break
3400  switch (blocksize_requested) {
3401  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3402  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3403  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3404  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3405  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3406  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3407  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3408  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3409  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3410  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3411  }
3412 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3413 #endif
3414  } else {
3415 #if defined(__CUDA_ARCH__)
3416  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
3417 #else
3418 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3419  const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
3420  Kokkos::parallel_for \
3421  ("ComputeResidual::RangePolicy::run<AsyncTag>", \
3422  policy, *this); } break
3423  switch (blocksize_requested) {
3424  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3425  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3426  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3427  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3428  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3429  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3430  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3431  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3432  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3433  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3434  }
3435 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3436 #endif
3437  }
3438  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3439  }
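  // On the CUDA path the packed result is aliased as a scalar 4-D view (the SIMD lane
  // becomes the last extent) before launching the team kernels; the macro/switch pair
  // instantiates AsyncTag<B> for a set of common block sizes and falls back to the
  // run-time block size via B == 0.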
3440 
3441  // y = b - R (y , y_remote)
3442  template<typename MultiVectorLocalViewTypeB,
3443  typename MultiVectorLocalViewTypeX,
3444  typename MultiVectorLocalViewTypeX_Remote>
3445  void run(const vector_type_3d_view &y_packed_,
3446  const MultiVectorLocalViewTypeB &b_,
3447  const MultiVectorLocalViewTypeX &x_,
3448  const MultiVectorLocalViewTypeX_Remote &x_remote_,
3449  const bool compute_owned) {
3450  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3451  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");
3452 
3453  b = b_; x = x_; x_remote = x_remote_;
3454  if (is_cuda<execution_space>::value) {
3455 #if defined(KOKKOS_ENABLE_CUDA)
3456  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3457  y_packed_.extent(0),
3458  y_packed_.extent(1),
3459  y_packed_.extent(2),
3460  vector_length);
3461 #endif
3462  } else {
3463  y_packed = y_packed_;
3464  }
3465 
3466  if (is_cuda<execution_space>::value) {
3467 #if defined(KOKKOS_ENABLE_CUDA)
3468  const local_ordinal_type blocksize = blocksize_requested;
3469  const local_ordinal_type team_size = 8;
3470  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3471  // local_ordinal_type vl_power_of_two = 1;
3472  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3473  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3474  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3475 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3476  if (compute_owned) { \
3477  const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3478  policy(rowidx2part.extent(0), team_size, vector_size); \
3479  Kokkos::parallel_for \
3480  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3481  } else { \
3482  const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3483  policy(rowidx2part.extent(0), team_size, vector_size); \
3484  Kokkos::parallel_for \
3485  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3486  } break
3487  switch (blocksize_requested) {
3488  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3489  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3490  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3491  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3492  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3493  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3494  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3495  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3496  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3497  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3498  }
3499 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3500 #endif
3501  } else {
3502 #if defined(__CUDA_ARCH__)
3503  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
3504 #else
3505 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3506  if (compute_owned) { \
3507  const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
3508  policy(0, rowidx2part.extent(0)); \
3509  Kokkos::parallel_for \
3510  ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
3511  } else { \
3512  const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
3513  policy(0, rowidx2part.extent(0)); \
3514  Kokkos::parallel_for \
3515  ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
3516  } break
3517 
3518  switch (blocksize_requested) {
3519  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3520  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3521  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3522  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3523  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3524  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3525  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3526  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3527  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3528  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3529  }
3530 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3531 #endif
3532  }
3533  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3534  }
3535  };
3536 
3537  template<typename MatrixType>
3538  void reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
3539  /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
3540  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3541  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ReduceVector");
3542 
3543  using impl_type = ImplType<MatrixType>;
3544  using local_ordinal_type = typename impl_type::local_ordinal_type;
3545  using impl_scalar_type = typename impl_type::impl_scalar_type;
3546 #if 0
3547  const auto norm2 = KokkosBlas::nrm1(zz);
3548 #else
3549  impl_scalar_type norm2(0);
3550  Kokkos::parallel_reduce
3551  ("ReduceMultiVector::Device",
3552  Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
3553  KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
3554  update += zz(i);
3555  }, norm2);
3556 #endif
3557  vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);
3558 
3559  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3560  }
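  // reduceVector sums the entries of zz on the device and stores the absolute value of
  // that sum in vals[0]; the KokkosBlas::nrm1 alternative is compiled out via #if 0.
  // applyInverseJacobi below passes norm_manager.getBuffer() as vals.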
3561 
3565  template<typename MatrixType>
3566  struct NormManager {
3567  public:
3568  using impl_type = ImplType<MatrixType>;
3569  using host_execution_space = typename impl_type::host_execution_space;
3570  using magnitude_type = typename impl_type::magnitude_type;
3571 
3572  private:
3573  bool collective_;
3574  int sweep_step_, sweep_step_upper_bound_;
3575 #ifdef HAVE_IFPACK2_MPI
3576  MPI_Request mpi_request_;
3577  MPI_Comm comm_;
3578 #endif
3579  magnitude_type work_[3];
3580 
3581  public:
3582  NormManager() = default;
3583  NormManager(const NormManager &b) = default;
3584  NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
3585  sweep_step_ = 1;
3586  sweep_step_upper_bound_ = 1;
3587  collective_ = comm->getSize() > 1;
3588  if (collective_) {
3589 #ifdef HAVE_IFPACK2_MPI
3590  const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
3591  TEUCHOS_ASSERT( ! mpi_comm.is_null());
3592  comm_ = *mpi_comm->getRawMpiComm();
3593 #endif
3594  }
3595  const magnitude_type zero(0), minus_one(-1);
3596  work_[0] = zero;
3597  work_[1] = zero;
3598  work_[2] = minus_one;
3599  }
3600 
3601  // Check the norm every sweep_step sweeps.
3602  void setCheckFrequency(const int sweep_step) {
3603  TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
3604  sweep_step_upper_bound_ = sweep_step;
3605  sweep_step_ = 1;
3606  }
3607 
3608  // Get the buffer into which to store rank-local squared norms.
3609  magnitude_type* getBuffer() { return &work_[0]; }
3610 
3611  // Call MPI_Iallreduce to find the global squared norms.
3612  void ireduce(const int sweep, const bool force = false) {
3613  if ( ! force && sweep % sweep_step_) return;
3614 
3615  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");
3616 
3617  work_[1] = work_[0];
3618  auto send_data = &work_[1];
3619  auto recv_data = &work_[0];
3620  if (collective_) {
3621 #ifdef HAVE_IFPACK2_MPI
3622 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3623  MPI_Iallreduce(send_data, recv_data, 1,
3624  Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3625  MPI_SUM, comm_, &mpi_request_);
3626 # else
3627  MPI_Allreduce (send_data, recv_data, 1,
3628  Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3629  MPI_SUM, comm_);
3630 # endif
3631 #endif
3632  }
3633  }
3634 
3635  // Check if the norm-based termination criterion is met. tol2 is the
3636  // tolerance squared. Sweep is the sweep index. If not every iteration is
3637  // being checked, this function immediately returns false. If a check must
3638  // be done at this iteration, it waits for the reduction triggered by
3639  // ireduce to complete, then checks the global norm against the tolerance.
3640  bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
3641  // early return
3642  if (sweep <= 0) return false;
3643 
3644  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");
3645 
3646  TEUCHOS_ASSERT(sweep >= 1);
3647  if ( ! force && (sweep - 1) % sweep_step_) return false;
3648  if (collective_) {
3649 #ifdef HAVE_IFPACK2_MPI
3650 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3651  MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
3652 # else
3653  // Do nothing.
3654 # endif
3655 #endif
3656  }
3657  bool r_val = false;
3658  if (sweep == 1) {
3659  work_[2] = work_[0];
3660  } else {
3661  r_val = (work_[0] < tol2*work_[2]);
3662  }
3663 
3664  // adjust sweep step
3665  const auto adjusted_sweep_step = 2*sweep_step_;
3666  if (adjusted_sweep_step < sweep_step_upper_bound_) {
3667  sweep_step_ = adjusted_sweep_step;
3668  } else {
3669  sweep_step_ = sweep_step_upper_bound_;
3670  }
3671  return r_val;
3672  }
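  // On the first sweep checkDone records the initial squared norm in work_[2]; on later
  // sweeps it tests work_[0] < tol2*work_[2]. Each call doubles sweep_step_ up to
  // sweep_step_upper_bound_, so norm checks become geometrically less frequent.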
3673 
3674  // After termination has occurred, finalize the norms for use in
3675  // getNorms0()/getNormsFinal().
3676  void finalize () {
3677  work_[0] = std::sqrt(work_[0]); // after converged
3678  if (work_[2] >= 0)
3679  work_[2] = std::sqrt(work_[2]); // first norm
3680  // if work_[2] is minus one, then norm is not requested.
3681  }
3682 
3683  // Report norms to the caller.
3684  const magnitude_type getNorms0 () const { return work_[2]; }
3685  const magnitude_type getNormsFinal () const { return work_[0]; }
3686  };
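  // Typical use, as in applyInverseJacobi below: fill getBuffer() with the rank-local
  // squared norm (reduceVector), call ireduce(sweep) to launch or perform the global sum,
  // call checkDone(sweep, tol*tol) at the next check to test convergence, and call
  // finalize() once iteration stops so that getNorms0()/getNormsFinal() report square
  // roots.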
3687 
3691  template<typename MatrixType>
3692  int
3693  applyInverseJacobi(// importer
3694  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
3695  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
3696  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
3697  const bool overlap_communication_and_computation,
3698  // tpetra interface
3699  const typename ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
3700  /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
3701  /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
3702  /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
3703  // local object interface
3704  const PartInterface<MatrixType> &interf, // mesh interface
3705  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
3706  const AmD<MatrixType> &amd, // R = A - D
3707  /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
3708  /* */ NormManager<MatrixType> &norm_manager,
3709  // preconditioner parameters
3710  const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
3711  /* */ bool is_y_zero,
3712  const int max_num_sweeps,
3713  const typename ImplType<MatrixType>::magnitude_type tol,
3714  const int check_tol_every) {
3715  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");
3716 
3717  using impl_type = ImplType<MatrixType>;
3718  using node_memory_space = typename impl_type::node_memory_space;
3719  using local_ordinal_type = typename impl_type::local_ordinal_type;
3720  using size_type = typename impl_type::size_type;
3721  using impl_scalar_type = typename impl_type::impl_scalar_type;
3722  using magnitude_type = typename impl_type::magnitude_type;
3723  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3724  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
3725  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3726  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3727 
3728  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3729 
3730  // either tpetra importer or async importer must be active
3731  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
3732  "Neither Tpetra importer nor Async importer is null.");
3733  // max number of sweeps should be positive number
3734  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
3735  "Maximum number of sweeps must be >= 1.");
3736 
3737  // const parameters
3738  const bool is_seq_method_requested = !tpetra_importer.is_null();
3739  const bool is_async_importer_active = !async_importer.is_null();
3740  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
3741  const magnitude_type tolerance = tol*tol;
3742  const local_ordinal_type blocksize = btdm.values.extent(1);
3743  const local_ordinal_type num_vectors = Y.getNumVectors();
3744  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
3745 
3746  const impl_scalar_type zero(0.0);
3747 
3748  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
3749  "The seq method for applyInverseJacobi, " <<
3750  "which in any case is for developer use only, " <<
3751  "does not support norm-based termination.");
3752 
3753  // if more workspace is needed, resize it
3754  const size_type work_span_required = num_blockrows*num_vectors*blocksize;
3755  if (work.span() < work_span_required)
3756  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
3757 
3758  // construct W
3759  const local_ordinal_type W_size = interf.packptr.extent(0)-1;
3760  if (local_ordinal_type(W.extent(0)) < W_size)
3761  W = impl_scalar_type_1d_view("W", W_size);
3762 
3763  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
3764  {
3765  if (is_seq_method_requested) {
3766  if (Z.getNumVectors() != Y.getNumVectors())
3767  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
3768  } else {
3769  if (is_async_importer_active) {
3770  // create comm data buffer and keep it here
3771  async_importer->createDataBuffer(num_vectors);
3772  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
3773  }
3774  }
3775  }
3776 
3777  // wrap the workspace with 3d view
3778  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
3779  const auto XX = X.template getLocalView<node_memory_space>();
3780  const auto YY = Y.template getLocalView<node_memory_space>();
3781  const auto ZZ = Z.template getLocalView<node_memory_space>();
3782  if (is_y_zero) Kokkos::deep_copy(YY, zero);
3783 
3784  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
3785  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
3786  damping_factor, is_norm_manager_active);
3787 
3788  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
3789  ComputeResidualVector<MatrixType>
3790  compute_residual_vector(amd, A->getCrsGraph().getLocalGraph(), blocksize, interf,
3791  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
3792 
3793  // set how often the norm manager checks convergence
3794  if (is_norm_manager_active)
3795  norm_manager.setCheckFrequency(check_tol_every);
3796 
3797  // iterate
3798  int sweep = 0;
3799  for (;sweep<max_num_sweeps;++sweep) {
3800  {
3801  if (is_y_zero) {
3802  // pmv := x(lclrow)
3803  multivector_converter.run(XX);
3804  } else {
3805  if (is_seq_method_requested) {
3806  // SEQ METHOD IS FOR TESTING ONLY
3807 
3808  // y := x - R y
3809  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
3810  compute_residual_vector.run(YY, XX, ZZ);
3811 
3812  // pmv := y(lclrow).
3813  multivector_converter.run(YY);
3814  } else {
3815  // fused y := x - R y and pmv := y(lclrow);
3816  // real use case does not use overlap comp and comm
3817  if (overlap_communication_and_computation || !is_async_importer_active) {
3818  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
3819  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
3820  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
3821  if (is_async_importer_active) async_importer->cancel();
3822  break;
3823  }
3824  if (is_async_importer_active) {
3825  async_importer->syncRecv();
3826  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
3827  }
3828  } else {
3829  if (is_async_importer_active)
3830  async_importer->syncExchange(YY);
3831  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
3832  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
3833  }
3834  }
3835  }
3836  }
3837 
3838  // pmv := inv(D) pmv.
3839  {
3840  solve_tridiags.run(YY, W);
3841  }
3842  {
3843  if (is_norm_manager_active) {
3844  // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
3845  reduceVector<MatrixType>(W, norm_manager.getBuffer());
3846  if (sweep + 1 == max_num_sweeps) {
3847  norm_manager.ireduce(sweep, true);
3848  norm_manager.checkDone(sweep + 1, tolerance, true);
3849  } else {
3850  norm_manager.ireduce(sweep);
3851  }
3852  }
3853  }
3854  is_y_zero = false;
3855  }
3856 
3857  // sqrt the norms for the caller's use.
3858  if (is_norm_manager_active) norm_manager.finalize();
3859 
3860  return sweep;
3861  }
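  // Each sweep above performs (1) the residual b - R*y into the packed multivector via
  // ComputeResidualVector (fused with the pack copy), (2) the block tridiagonal solve and
  // damped update via SolveTridiags::run, and (3) an optional norm check through
  // NormManager; the return value is the number of sweeps actually performed.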
3862 
3863 
3864  template<typename MatrixType>
3865  struct ImplObject {
3866  using impl_type = ImplType<MatrixType>;
3867  using part_interface_type = PartInterface<MatrixType>;
3868  using block_tridiags_type = BlockTridiags<MatrixType>;
3869  using amd_type = AmD<MatrixType>;
3870  using norm_manager_type = NormManager<MatrixType>;
3871  using async_import_type = AsyncableImport<MatrixType>;
3872 
3873  // distributed objects
3874  Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
3875  Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
3876  Teuchos::RCP<async_import_type> async_importer;
3877  bool overlap_communication_and_computation;
3878 
3879  // copy of Y (mutable to penetrate const)
3880  mutable typename impl_type::tpetra_multivector_type Z;
3881  mutable typename impl_type::impl_scalar_type_1d_view W;
3882 
3883  // local objects
3884  part_interface_type part_interface;
3885  block_tridiags_type block_tridiags; // D
3886  amd_type a_minus_d; // R = A - D
3887  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
3888  mutable norm_manager_type norm_manager;
3889  };
3890 
3891  } // namespace BlockTriDiContainerDetails
3892 
3893 } // namespace Ifpack2
3894 
3895 #endif