Ifpack2_BlockTriDiContainer_impl.hpp
// @HEADER
// *****************************************************************************
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
//
// Copyright 2009 NTESS and the Ifpack2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP

//#define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
//#define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF

#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#include <Kokkos_ArithTraits.hpp>
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_SetIdentity_Decl.hpp>
#include <KokkosBatched_SetIdentity_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>

#include <memory>

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"
#include "Ifpack2_BlockComputeResidualAndSolve.hpp"

//#include <KokkosBlas2_gemv.hpp>

// TODO: expose this through a CMake variable (or enable the flag only when necessary)
//#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
//#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif

// Not 100% sure about MPI 3 support on CUDA.
#if MPI_VERSION >= 3
#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
#endif

// ::: Experiments :::
// Define either pinned memory or CUDA memory for MPI buffers.
// If both macros are disabled, the Tpetra memory space is used, which is UVM
// space for CUDA. If the pinned-memory macro is defined, pinned host memory is
// used instead of a device pointer. Pinned memory is enabled by default.
#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
//#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI

// If defined, all views are allocated on CUDA space instead of CUDA UVM space.
#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE

// If defined, btdm_scalar_type is used (if impl_scalar_type is double,
// btdm_scalar_type is float).
#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
#endif

// If defined, multiple execution space instances are used.
#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES

namespace Ifpack2 {

namespace BlockTriDiContainerDetails {

namespace KB = KokkosBatched;

using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;

template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
                                          MemoryTraitsType::is_random_access |
                                          flag>;

template <typename ViewType>
using Unmanaged = Kokkos::View<typename ViewType::data_type,
                               typename ViewType::array_layout,
                               typename ViewType::device_type,
                               MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
template <typename ViewType>
using Atomic = Kokkos::View<typename ViewType::data_type,
                            typename ViewType::array_layout,
                            typename ViewType::device_type,
                            MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
template <typename ViewType>
using Const = Kokkos::View<typename ViewType::const_data_type,
                           typename ViewType::array_layout,
                           typename ViewType::device_type,
                           typename ViewType::memory_traits>;
template <typename ViewType>
using ConstUnmanaged = Const<Unmanaged<ViewType> >;

template <typename ViewType>
using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;

template <typename ViewType>
using Scratch = Kokkos::View<typename ViewType::data_type,
                             typename ViewType::array_layout,
                             typename ViewType::execution_space::scratch_memory_space,
                             MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;

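// Illustrative sketch (assumed view type, not part of the original source):
// these aliases change only the view traits, never the underlying data.
//
//   using v_type = Kokkos::View<double*, Kokkos::DefaultExecutionSpace>;
//   v_type a(do_not_initialize_tag("a"), 100);
//   Unmanaged<v_type> ua(a.data(), a.extent(0)); // wraps a's allocation, no ref-count
//   ConstUnmanaged<v_type> ca(ua);               // read-only alias of the same data
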
template<typename T> struct BlockTridiagScalarType { typedef T type; };
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
template<> struct BlockTridiagScalarType<double> { typedef float type; };
//template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
#endif

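// Example (illustration only): with the small-scalar option enabled, the
// block tridiagonal factors of a double-precision matrix are stored and
// factored in single precision:
//
//   static_assert(std::is_same<BlockTridiagScalarType<double>::type,
//                              float>::value, "holds only under the macro above");
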
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());

#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
  { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
#else
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
#endif

template<typename MatrixType>
Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();

  // It is OK here to use the graph of the A_crs matrix with a block size of 1.
  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::rcp(new tpetra_import_type(src, tgt));
}

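// Usage sketch (illustrative; the matrix A and its RCP are assumptions, not
// part of this header):
//
//   auto importer = createBlockCrsTpetraImporter<MatrixType>(A);
//   // The importer maps the point-expanded domain map to the point-expanded
//   // column map, i.e. a matrix with block size b imports b dofs per node.
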
// Partial replacement for forward-mode MultiVector::doImport.
// Permits overlapped communication and computation, but also supports
// synchronous operation. Overlapped comm/comp can give quite poor performance
// on some platforms, so we cannot simply use it unconditionally.

template<typename MatrixType>
struct AsyncableImport {
public:
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

private:
#if !defined(HAVE_IFPACK2_MPI)
  typedef int MPI_Request;
  typedef int MPI_Comm;
#endif
  using scalar_type = typename impl_type::scalar_type;

  static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int waitany(int count, MPI_Request* reqs, int* index) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
    return 0;
#endif
  }

  static int waitall(int count, MPI_Request* reqs) {
#ifdef HAVE_IFPACK2_MPI
    // MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE
    // (MPI_STATUS_IGNORE is only valid for the single-request wait calls).
    return MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE);
#else
    return 0;
#endif
  }

public:
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;

  using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;

  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;

#if defined(KOKKOS_ENABLE_CUDA)
  using impl_scalar_type_1d_view =
    typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
                              Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
# elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
                              Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
# else // no experimental macros are defined
                              typename impl_type::impl_scalar_type_1d_view,
# endif
                              typename impl_type::impl_scalar_type_1d_view>::type;
#else
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
#endif
  using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type*,Kokkos::HostSpace>;
  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

#ifdef HAVE_IFPACK2_MPI
  MPI_Comm comm;
#endif

  impl_scalar_type_2d_view_tpetra remote_multivector;
  local_ordinal_type blocksize;

  template<typename T>
  struct SendRecvPair {
    T send, recv;
  };

  // (s)end and (r)eceive data:
  SendRecvPair<int_1d_view_host> pids;                     // mpi ranks
  SendRecvPair<std::vector<MPI_Request> > reqs;            // MPI_Request is a pointer type; it cannot live in a Kokkos view
  SendRecvPair<size_type_1d_view> offset;                  // offsets into the local id list and data buffer
  SendRecvPair<size_type_1d_view_host> offset_host;        // offsets into the local id list and data buffer
  SendRecvPair<local_ordinal_type_1d_view> lids;           // local id list
  SendRecvPair<impl_scalar_type_1d_view> buffer;           // data buffer
  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host; // data buffer

  local_ordinal_type_1d_view dm2cm; // permutation

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  using exec_instance_1d_std_vector = std::vector<execution_space>;
  exec_instance_1d_std_vector exec_instances;
#endif

  // for cuda
public:
  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                       const size_type_1d_view &offs) {
    // wrap lens in a host view and deep copy it to device
    Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
    const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);

    // exclusive scan
    const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
    const local_ordinal_type lens_size = lens_device.extent(0);
    Kokkos::parallel_scan
      ("AsyncableImport::RangePolicy::setOffsetValues",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
        if (final)
          offs(i) = update;
        update += (i < lens_size ? lens_device[i] : 0);
      });
  }

  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
                           const size_type_1d_view_host &offs) {
    // exclusive scan over lens, computed directly on the host
    offs(0) = 0;
    for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
      offs(i) = offs(i-1) + lens[i-1];
    }
  }

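  // Worked example (illustration only): for message lengths lens = {3, 1, 2}
  // and offs of extent 4, this scan (and its device twin setOffsetValues)
  // yields offs = {0, 3, 4, 6}: entry i is where message i starts in the
  // packed buffer, and the final entry is the total packed length.
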
private:
  void createMpiRequests(const tpetra_import_type &import) {
    Tpetra::Distributor &distributor = import.getDistributor();

    // copy pids from distributor
    const auto pids_from = distributor.getProcsFrom();
    pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
    memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));

    const auto pids_to = distributor.getProcsTo();
    pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
    memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));

    // mpi requests
    reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
    reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));

    // construct offsets
#if 0
    const auto lengths_to = distributor.getLengthsTo();
    offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValues(lengths_to, offset.send);
    offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);

    setOffsetValues(lengths_from, offset.recv);
    offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
#else
    const auto lengths_to = distributor.getLengthsTo();
    offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValuesHost(lengths_to, offset_host.send);
    //offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);

    setOffsetValuesHost(lengths_from, offset_host.recv);
    //offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
#endif
  }

  void createSendRecvIDs(const tpetra_import_type &import) {
    // For each remote PID, the list of LIDs to receive.
    const auto remote_lids = import.getRemoteLIDs();
    const local_ordinal_type_1d_view_host
      remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
    lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
    Kokkos::deep_copy(lids.recv, remote_lids_view_host);

    // For each export PID, the list of LIDs to send.
    auto epids = import.getExportPIDs();
    auto elids = import.getExportLIDs();
    TEUCHOS_ASSERT(epids.size() == elids.size());
    lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
    auto lids_send_host = Kokkos::create_mirror_view(lids.send);

    // naive search (not sure if pids or epids are sorted)
    for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
      const auto pid_send_value = pids.send[i];
      for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
        if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
      TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
    }
    Kokkos::deep_copy(lids.send, lids_send_host);
  }

  void createExecutionSpaceInstances() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
    // The following call creates 8 streams:
    exec_instances =
      Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
#endif
  }

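  // Note (illustration only): the eight instances above map to independent
  // device streams; the pack/copy/send work below is round-robined over them
  // with the index i % 8, so up to eight messages can be staged concurrently.
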
public:
  // for cuda, all tag types are public
  struct ToBuffer {};
  struct ToMultiVector {};

  AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
                   const Teuchos::RCP<const tpetra_map_type>& tgt_map,
                   const local_ordinal_type blocksize_,
                   const local_ordinal_type_1d_view dm2cm_) {
    blocksize = blocksize_;
    dm2cm = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
    comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
#endif
    const tpetra_import_type import(src_map, tgt_map);

    createMpiRequests(import);
    createSendRecvIDs(import);
    createExecutionSpaceInstances();
  }

  void createDataBuffer(const local_ordinal_type &num_vectors) {
    const size_type extent_0 = lids.recv.extent(0)*blocksize;
    const size_type extent_1 = num_vectors;
    if (remote_multivector.extent(0) == extent_0 &&
        remote_multivector.extent(1) == extent_1) {
      // skip
    } else {
      remote_multivector =
        impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);

      const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
      const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;

      buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
      buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);

      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
        buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
      }
    }
  }

  void cancel () {
#ifdef HAVE_IFPACK2_MPI
    waitall(reqs.recv.size(), reqs.recv.data());
    waitall(reqs.send.size(), reqs.send.data());
#endif
  }

  // ======================================================================
  // Async version using execution space instances
  // ======================================================================

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  template<typename PackTag>
  static
  void copy(const local_ordinal_type_1d_view &lids_,
            const impl_scalar_type_1d_view &buffer_,
            const local_ordinal_type ibeg_,
            const local_ordinal_type iend_,
            const impl_scalar_type_2d_view_tpetra &multivector_,
            const local_ordinal_type blocksize_,
            const execution_space &exec_instance_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize*ibeg_;

    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    local_ordinal_type vector_size(0);
    if (blocksize_ <= 4) vector_size = 4;
    else if (blocksize_ <= 8) vector_size = 8;
    else if (blocksize_ <= 16) vector_size = 16;
    else vector_size = 32;

    const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
    const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
    Kokkos::parallel_for
      (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
       Kokkos::Experimental::require(policy, work_item_property),
       KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
        const local_ordinal_type i = member.league_rank();
        Kokkos::parallel_for
          (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
            auto aptr = abase + blocksize_*(i + idiff*j);
            auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
            if (std::is_same<PackTag,ToBuffer>::value)
              Kokkos::parallel_for
                (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                  aptr[k] = bptr[k];
                });
            else
              Kokkos::parallel_for
                (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                  bptr[k] = aptr[k];
                });
          });
      });
  }
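  // Buffer layout sketch (illustration only): with blocksize_ = b and
  // num_vectors = nv, the packed segment for rows [ibeg_, iend_) places the
  // b dofs of local row index i, vector j, at
  //   abase[b*((i - ibeg_) + (iend_ - ibeg_)*j) + k],  k = 0..b-1,
  // so each (row, vector) block is contiguous and is moved by a single
  // ThreadVectorRange loop above.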

  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants and reallocate data buffers if necessary
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize*num_vectors;

    // 0. post receive async
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
      else {
        irecv(comm,
              reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
    }

    execution_space().fence();

    // 1. async memcpy
    for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
      // 1.0. enqueue pack buffer
      if (i<8) exec_instances[i%8].fence();
      copy<ToBuffer>(lids.send, buffer.send,
                     offset_host.send(i), offset_host.send(i+1),
                     mv, blocksize,
                     //execution_space());
                     exec_instances[i%8]);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        //if (i<8) exec_instances[i%8].fence();
        const local_ordinal_type num_vectors = mv.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;

        Kokkos::deep_copy(exec_instances[i%8],
                          Kokkos::subview(buffer_host.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                            offset_host.send(i)*mv_blocksize,
                                            offset_host.send(i+1)*mv_blocksize)),
                          Kokkos::subview(buffer.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                            offset_host.send(i)*mv_blocksize,
                                            offset_host.send(i+1)*mv_blocksize)));
      }
    }
    //execution_space().fence();
    for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
      // 1.1. sync the stream and isend
      if (i<8) exec_instances[i%8].fence();
      if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
      else {
        isend(comm,
              reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
    }

    // 2. poke communication
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void syncRecvVar1() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // 0. wait for receive async.
    for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
      local_ordinal_type idx = i;

      // 0.0. wait any
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);

      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;

        Kokkos::deep_copy(
          Kokkos::subview(buffer.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)),
          Kokkos::subview(buffer_host.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)));
      }

      // 0.1. unpack data after data is moved into a device
      copy<ToMultiVector>(lids.recv, buffer.recv,
                          offset_host.recv(idx), offset_host.recv(idx+1),
                          remote_multivector, blocksize,
                          exec_instances[idx%8]);
    }

    // 1. fire up all cuda events
    Kokkos::fence();

    // 2. cleanup all open comm
    waitall(reqs.send.size(), reqs.send.data());
#endif // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
#endif // defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)

  // ======================================================================
  // Generic version without using execution space instances
  // - the only difference between the device and host paths is the use of
  //   team or range policies.
  // ======================================================================
  template<typename PackTag>
  static
  void copy(const local_ordinal_type_1d_view &lids_,
            const impl_scalar_type_1d_view &buffer_,
            const local_ordinal_type &ibeg_,
            const local_ordinal_type &iend_,
            const impl_scalar_type_2d_view_tpetra &multivector_,
            const local_ordinal_type blocksize_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize*ibeg_;
    if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      local_ordinal_type vector_size(0);
      if (blocksize_ <= 4) vector_size = 4;
      else if (blocksize_ <= 8) vector_size = 8;
      else if (blocksize_ <= 16) vector_size = 16;
      else vector_size = 32;
      const team_policy_type policy(idiff, 1, vector_size);
      Kokkos::parallel_for
        ("AsyncableImport::TeamPolicy::copy",
         policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
          const local_ordinal_type i = member.league_rank();
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
              auto aptr = abase + blocksize_*(i + idiff*j);
              auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
              if (std::is_same<PackTag,ToBuffer>::value)
                Kokkos::parallel_for
                  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                    aptr[k] = bptr[k];
                  });
              else
                Kokkos::parallel_for
                  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                    bptr[k] = aptr[k];
                  });
            });
        });
    } else {
      const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
      Kokkos::parallel_for
        ("AsyncableImport::RangePolicy::copy",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
          const local_ordinal_type i = ij%idiff;
          const local_ordinal_type j = ij/idiff;
          auto aptr = abase + blocksize_*(i + idiff*j);
          auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
          auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
          auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
          memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
        });
    }
  }

  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants and reallocate data buffers if necessary
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize*num_vectors;

    // receive async
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
      else {
        irecv(comm,
              reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
    }

    // send async
    for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
      copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
                     mv, blocksize);
      Kokkos::fence();
      if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
      else {
        Kokkos::deep_copy(
          Kokkos::subview(buffer_host.send,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.send(i)*mv_blocksize,
                            offset_host.send(i+1)*mv_blocksize)),
          Kokkos::subview(buffer.send,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.send(i)*mv_blocksize,
                            offset_host.send(i+1)*mv_blocksize)));
        isend(comm,
              reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
    }

    // I find that issuing an Iprobe seems to nudge some MPIs into action,
    // which helps with overlapped comm/comp performance.
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void syncRecvVar0() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // receive async.
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      local_ordinal_type idx = i;
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;
        Kokkos::deep_copy(
          Kokkos::subview(buffer.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)),
          Kokkos::subview(buffer_host.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)));
      }
      copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
                          remote_multivector, blocksize);
    }
    // wait on the sends to match all Isends with a cleanup operation.
    waitall(reqs.send.size(), reqs.send.data());
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    asyncSendRecvVar1(mv);
#else
    asyncSendRecvVar0(mv);
#endif
#else
    asyncSendRecvVar0(mv);
#endif
  }

  void syncRecv() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    syncRecvVar1();
#else
    syncRecvVar0();
#endif
#else
    syncRecvVar0();
#endif
  }

  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
    asyncSendRecv(mv);
    syncRecv();
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
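  // Usage sketch (hypothetical caller; names are assumptions, not part of
  // this header):
  //
  //   importer->createDataBuffer(num_vectors);
  //   importer->asyncSendRecv(mv);  // post irecv/isend and pack on device
  //   /* ... compute on the owned rows while messages are in flight ... */
  //   importer->syncRecv();         // waitany + unpack, then waitall on sends
  //
  // syncExchange above is simply the non-overlapped composition of the two.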

  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
};

template <typename ViewType1, typename ViewType2>
struct are_same_struct {
  ViewType1 keys1;
  ViewType2 keys2;

  are_same_struct(ViewType1 keys1_, ViewType2 keys2_) : keys1(keys1_), keys2(keys2_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(int i, unsigned int& count) const {
    if (keys1(i) != keys2(i)) count++;
  }
};

template <typename ViewType1, typename ViewType2>
bool are_same (ViewType1 keys1, ViewType2 keys2) {
  unsigned int are_same_ = 0;

  Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
                          are_same_struct(keys1, keys2),
                          are_same_);
  return are_same_==0;
}

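// Note (observation, not in the original source): are_same compares only the
// leading keys1.extent(0) entries, which is exactly what the caller below
// needs; the domain-map gids must form a prefix of the column-map gids for
// the remote columns to be separable.
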
template<typename MatrixType>
Teuchos::RCP<AsyncableImport<MatrixType> >
createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
  using global_indices_array_device_type = Kokkos::View<const global_ordinal_type*, typename tpetra_map_type::device_type>;

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();

  // It is OK here to use the graph of the A_crs matrix with a block size of 1.
  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto domain_map = g.getDomainMap();
  const auto column_map = g.getColMap();

  std::vector<global_ordinal_type> gids;

  Kokkos::Subview<global_indices_array_device_type, std::pair<int,int>> column_map_global_iD_last;

  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
  {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);

    global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
    global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();

    if(are_same(domain_map_global_iD, column_map_global_iD)) {
      // this should be the most likely path
      separate_remotes = true;
      need_owned_permutation = false;

      column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
                                                  std::pair<int,int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
    }
    else {
      // This loop is relatively expensive
      for (size_t i=0;i<column_map->getLocalNumElements();++i) {
        const global_ordinal_type gid = column_map->getGlobalElement(i);
        if (!domain_map->isNodeGlobalElement(gid)) {
          found_first = true;
          gids.push_back(gid);
        } else if (found_first) {
          separate_remotes = false;
          break;
        }
        if (!found_first && !need_owned_permutation &&
            domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
          // The owned part of the domain and column maps are different
          // orderings. We *could* do a super efficient impl of this case in the
          // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
          // really, if a caller cares about speed, they wouldn't make different
          // local permutations like this. So we punt on the best impl and go for
          // a pretty good one: the permutation is done in place in
          // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
          // is the presumably worse memory access pattern of the input vector.
          need_owned_permutation = true;
        }
      }
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }

  if (separate_remotes) {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
    const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
    const auto parsimonious_col_map
      = need_owned_permutation ?
      Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())):
      Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
    if (parsimonious_col_map->getGlobalNumElements() > 0) {
      // make the importer only if needed.
      local_ordinal_type_1d_view dm2cm;
      if (need_owned_permutation) {
        dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
        const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
        for (size_t i=0;i<domain_map->getLocalNumElements();++i)
          dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
        Kokkos::deep_copy(dm2cm, dm2cm_host);
      }
      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
      return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
    }
  }
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::null;
}

// Heuristic flop-count model (in arbitrary units) used to pick the number of
// sub-parts per part for the Schur-complement line splitting below.
template<typename local_ordinal_type>
local_ordinal_type costTRSM(const local_ordinal_type block_size) {
  return block_size*block_size;
}

template<typename local_ordinal_type>
local_ordinal_type costGEMV(const local_ordinal_type block_size) {
  return 2*block_size*block_size;
}

template<typename local_ordinal_type>
local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
  return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length-1) * costGEMV(block_size);
}

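// Worked example (illustration only): for block_size = 4 and
// subline_length = 10, costTRSM = 16 and costGEMV = 32, so
// costTriDiagSolve = 2*10*16 + 2*9*32 = 896: a forward and a backward sweep,
// each with one block triangular solve per row and one block GEMV per
// off-diagonal coupling.
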
template<typename local_ordinal_type>
local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
                                  const local_ordinal_type num_teams,
                                  const local_ordinal_type line_length,
                                  const local_ordinal_type block_size,
                                  const local_ordinal_type n_subparts_per_part) {
  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part-1) * 2) / n_subparts_per_part);
  if (subline_length < 1) {
    return INT_MAX;
  }

  const local_ordinal_type p_n_lines = ceil(double(num_parts)/num_teams);
  const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part)*num_parts/num_teams);
  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part-1)*num_parts/num_teams);

  const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
  const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part-1)*2,block_size);
  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length,block_size);
  const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);

  if (n_subparts_per_part == 1) {
    return p_costApplyAinv;
  }
  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
}

template<typename local_ordinal_type>
local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
                                         const local_ordinal_type num_teams,
                                         const local_ordinal_type line_length,
                                         const local_ordinal_type block_size) {
  local_ordinal_type n_subparts_per_part_0 = 1;
  local_ordinal_type flop_0 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0);
  local_ordinal_type flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0+1);
  while (flop_0 > flop_1) {
    flop_0 = flop_1;
    flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, (++n_subparts_per_part_0)+1);
  }
  return n_subparts_per_part_0;
}

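// The search above increases n_subparts_per_part while the modeled cost
// strictly decreases and returns the first local minimum; e.g. (illustration
// only) for modeled costs {896, 700, 640, 660, ...} at 1, 2, 3, 4 sub-parts,
// it returns 3.
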
template<typename ArgActiveExecutionMemorySpace>
struct SolveTridiagsDefaultModeAndAlgo;

template<typename MatrixType>
BlockHelperDetails::PartInterface<MatrixType>
createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                    const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                    const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type> > &partitions,
                    const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
  IFPACK2_BLOCKHELPER_TIMER("createPartInterface", createPartInterface);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
  using size_type = typename impl_type::size_type;

  auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);

  TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
  const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
  constexpr int vector_length = impl_type::vector_length;
  constexpr int internal_vector_length = impl_type::internal_vector_length;

  const auto comm = A->getRowMap()->getComm();

  BlockHelperDetails::PartInterface<MatrixType> interf;

  const bool jacobi = partitions.size() == 0;
  const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();

  typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
  std::vector<size_idx_pair_type> partsz(nparts);

  if (!jacobi) {
    // sort (part size, part index) pairs in decreasing order of size
    for (local_ordinal_type i=0;i<nparts;++i)
      partsz[i] = size_idx_pair_type(partitions[i].size(), i);
    std::sort(partsz.begin(), partsz.end(),
              [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
                return x.first > y.first;
              });
  }

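  // Example (illustration only): for partition sizes {3, 5, 4}, partsz after
  // the sort is {(5,1), (4,2), (3,0)}, so parts are visited largest first and
  // the permutation p built below becomes {1, 2, 0}.
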
  local_ordinal_type n_subparts_per_part;
  if (n_subparts_per_part_in == -1) {
    // A value of -1 means the user lets the algorithm decide the number of
    // subparts automatically.
    using execution_space = typename impl_type::execution_space;

    const int line_length = partsz[0].first;

    const local_ordinal_type team_size =
      SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
      recommended_team_size(blocksize, vector_length, internal_vector_length);

    const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));

    n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
#endif
  }
  else {
    n_subparts_per_part = n_subparts_per_part_in;
  }

  // Total number of sub lines:
  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
  // Total number of sub lines plus the Schur complement blocks.
  // For a given line, 2 sub lines imply one Schur complement, 3 sub lines
  // imply two Schur complements, etc.
  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1);

#if defined(BLOCKTRIDICONTAINER_DEBUG)
  local_ordinal_type nrows = 0;
  if (jacobi)
    nrows = nparts;
  else
    for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();

  TEUCHOS_TEST_FOR_EXCEPT_MSG
    (nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
     << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
#endif

  // permutation vector
  std::vector<local_ordinal_type> p;
  if (jacobi) {
    interf.max_partsz = 1;
    interf.max_subpartsz = 0;
    interf.n_subparts_per_part = 1;
    interf.nparts = nparts;
  } else {
    // reorder parts to maximize simd packing efficiency
    p.resize(nparts);

    for (local_ordinal_type i=0;i<nparts;++i)
      p[i] = partsz[i].second;

    interf.max_partsz = partsz[0].first;

    constexpr local_ordinal_type connection_length = 2;
    const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
    const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

    interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
    interf.n_subparts_per_part = n_subparts_per_part;
    interf.nparts = nparts;
  }

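  // Worked example (illustration only): with max_partsz = 10,
  // n_subparts_per_part = 3, and connection_length = 2, the interior sub-line
  // length is (10 - 2*2)/3 = 2, the last sub line gets 10 - 2*(2+2) = 2 rows,
  // and max_subpartsz = 2.
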
  // allocate parts
  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);

  // mirror to host and compute on host execution space
  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);

  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);

  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);

  // Determine parts.
  interf.row_contiguous = true;
  partptr(0) = 0;
  part2rowidx0(0) = 0;
  part2packrowidx0(0) = 0;
  local_ordinal_type pack_nrows = 0;
  local_ordinal_type pack_nrows_sub = 0;
  if (jacobi) {
    IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
    // Jacobi (all lines have length 1) means that A_n_lclrows == nparts,
    // so the mapping between parts and rows is trivial.
    // Note: we can leave interf.row_contiguous = true, since for all i: lclrow(i) == i
    for (local_ordinal_type i=0; i <= nparts; ++i) {
      part2rowidx0(i) = i;
      partptr(i) = i;
    }
    for (local_ordinal_type i=0; i < nparts; ++i) {
      rowidx2part(i) = i;
      lclrow(i) = i;
    }
    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      // assume no overlap
      if (ip % vector_length == 0) pack_nrows = 1;
      part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
    }
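    // Example (illustration only): with vector_length = 4 and nparts = 6,
    // the packs are {0,1,2,3} and {4,5}, and part2packrowidx0 becomes
    // {0,0,0,0,1,1,2}: the packed row counter advances only at pack
    // boundaries and after the final part.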
    part2rowidx0_sub(0) = 0;
    partptr_sub(0, 0) = 0;

    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      constexpr local_ordinal_type ipnrows = 1;
      const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);

      TEUCHOS_TEST_FOR_EXCEPTION
        (full_line_length != ipnrows, std::logic_error,
         "In the part " << ip );

      constexpr local_ordinal_type connection_length = 2;

      if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
        TEUCHOS_TEST_FOR_EXCEPTION
          (true, std::logic_error,
           "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");

      const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
      const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

      if (ip % vector_length == 0) pack_nrows_sub = ipnrows;

      for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
        const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
        const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
        if (local_sub_ip != n_subparts_per_part-1) {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
          }
          else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
          }
          partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
          partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
          partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
          part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
          printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
#endif
        }
        else {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
          }
          else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
          }
          partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
#endif
        }
      }
    }

#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
    std::cout << "partptr_sub = " << std::endl;
    for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
      for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
        std::cout << partptr_sub(i,j) << " ";
      }
      std::cout << std::endl;
    }
    std::cout << "partptr_sub end" << std::endl;
#endif

    {
      local_ordinal_type npacks = ceil(float(nparts)/vector_length);

      local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
      for (local_ordinal_type ip=0;ip<ip_max;++ip) {
        part2packrowidx0_sub(ip, 0) = 0;
      }
      for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
        if (ipack != 0) {
          local_ordinal_type ip_min = ipack*vector_length;
          ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
          for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
            part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
          }
        }

        for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
          local_ordinal_type ip_min = ipack*vector_length;
          ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;

          const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);

          constexpr local_ordinal_type connection_length = 2;

          const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
          const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

          if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
          if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
          if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;

          part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;

          for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
            part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
          }
        }
      }

      Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  } else {
1317  IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
1318  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1319  const auto* part = &partitions[p[ip]];
1320  const local_ordinal_type ipnrows = part->size();
1321  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1322  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1323  BlockHelperDetails::get_msg_prefix(comm)
1324  << "partition " << p[ip]
1325  << " is empty, which is not allowed.");
1326  //assume No overlap.
1327  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1328  // Since parts are ordered in decreasing size, the size of the first
1329  // part in a pack is the size for all parts in the pack.
1330  if (ip % vector_length == 0) pack_nrows = ipnrows;
1331  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1332  const local_ordinal_type offset = partptr(ip);
1333  for (local_ordinal_type i=0;i<ipnrows;++i) {
1334  const auto lcl_row = (*part)[i];
1335  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1336  BlockHelperDetails::get_msg_prefix(comm)
1337  << "partitions[" << p[ip] << "]["
1338  << i << "] = " << lcl_row
1339  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1340  << "].");
1341  lclrow(offset+i) = lcl_row;
1342  rowidx2part(offset+i) = ip;
1343  if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1344  interf.row_contiguous = false;
1345  }
1346  partptr(ip+1) = offset + ipnrows;
1347 
1348 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1349  printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1350  printf("partptr(%d+1) = %d\n", ip, partptr(ip+1));
1351 #endif
1352  }
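 // Worked example: with vector_length = 4 and parts ordered by decreasing
 // size, e.g. nparts = 6 with part sizes {10, 9, 9, 8, 7, 7}, parts 0-3 form
 // pack 0 with pack_nrows = 10 and parts 4-5 form pack 1 with pack_nrows = 7.
 // The packed row offset only advances at pack boundaries, so
 //   part2packrowidx0 = [0, 0, 0, 0, 10, 10, 17]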
1353 
1354  part2rowidx0_sub(0) = 0;
1355  partptr_sub(0, 0) = 0;
1356  //const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1357 
1358  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1359  const auto* part = &partitions[p[ip]];
1360  const local_ordinal_type ipnrows = part->size();
1361  const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1362 
1363  TEUCHOS_TEST_FOR_EXCEPT_MSG
1364  (full_line_length != ipnrows, std::logic_error,
1365  "In the part " << ip );
1366 
1367  constexpr local_ordinal_type connection_length = 2;
1368 
1369  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1370  TEUCHOS_TEST_FOR_EXCEPT_MSG
1371  (true, std::logic_error,
1372  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1373 
1374  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1375  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1376 
1377  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1378 
1379  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1380  const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1381  const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1382  if (local_sub_ip != n_subparts_per_part-1) {
1383  if (local_sub_ip != 0) {
1384  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1385  }
1386  else if (ip != 0) {
1387  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1388  }
1389  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1390  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1391  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1392 
1393  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1394  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1395 
1396 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1397  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1398  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", schur_ip, partptr_sub(schur_ip, 0), connection_length);
1399 #endif
1400  }
1401  else {
1402  if (local_sub_ip != 0) {
1403  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1404  }
1405  else if (ip != 0) {
1406  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1407  }
1408  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1409 
1410  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1411 
1412 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1413  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1414 #endif
1415  }
1416  }
1417  }
1418 
1419  {
1420  local_ordinal_type npacks = ceil(float(nparts)/vector_length);
1421 
1422  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1423  for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1424  part2packrowidx0_sub(ip, 0) = 0;
1425  }
1426  for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1427  if (ipack != 0) {
1428  local_ordinal_type ip_min = ipack*vector_length;
1429  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1430  for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1431  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1432  }
1433  }
1434 
1435  for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1436  local_ordinal_type ip_min = ipack*vector_length;
1437  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1438 
1439  const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1440 
1441  constexpr local_ordinal_type connection_length = 2;
1442 
1443  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1444  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1445 
1446  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1447  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1448  if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1449 
1450  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1451 
1452  for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1453  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1454  }
1455  }
1456  }
1457 
1458  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1459  }
1460  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1461  }
1462 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1463  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1464 #endif
1465  if (lclrow(0) != 0) interf.row_contiguous = false;
1466 
1467  Kokkos::deep_copy(interf.partptr, partptr);
1468  Kokkos::deep_copy(interf.lclrow, lclrow);
1469 
1470  Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1471 
1472  //assume No overlap. Thus:
1473  interf.part2rowidx0 = interf.partptr;
1474  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1475 
1476  interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1477  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1478 
1479  { // Fill packptr.
1480  IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
1481  local_ordinal_type npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1482  npacks = 0;
1483  for (local_ordinal_type ip=1;ip<=nparts;++ip) //n_sub_parts_and_schur
1484  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1485  ++npacks;
1486 
1487  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1488  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1489  packptr(0) = 0;
1490  for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1491  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1492  packptr(k++) = ip;
1493 
1494  Kokkos::deep_copy(interf.packptr, packptr);
1495 
1496  local_ordinal_type npacks_per_subpart = ceil(float(nparts)/vector_length);
1497  npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1498 
1499  interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart*n_subparts_per_part);
1500  interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart,n_subparts_per_part-1);
1501 
1502  const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1503  const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1504 
1505 
1506  // Fill packindices_sub and packindices_schur
1507  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part-1;++local_sub_ip) {
1508  for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1509  packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1510  packindices_schur(local_pack_ip,local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1511  }
1512  }
1513 
1514  for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1515  packindices_sub((n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip;
1516  }
1517 
1518 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1519  std::cout << "packindices_sub = " << std::endl;
1520  for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1521  std::cout << packindices_sub(i) << " ";
1522  }
1523  std::cout << std::endl;
1524  std::cout << "packindices_sub end" << std::endl;
1525 
1526  std::cout << "packindices_schur = " << std::endl;
1527  for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1528  for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1529  std::cout << packindices_schur(i,j) << " ";
1530  }
1531  std::cout << std::endl;
1532  }
1533 
1534  std::cout << "packindices_schur end" << std::endl;
1535 #endif
1536 
1537  Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1538  Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1539 
1540  interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1541  const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1542  packptr_sub(0) = 0;
1543  for (local_ordinal_type k=0;k<npacks + 1;++k)
1544  packptr_sub(k) = packptr(k%npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1545 
1546  Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1547  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1548  }
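 // Worked example: packindices_sub and packindices_schur interleave sub-line
 // packs (even groups of npacks_per_subpart) with Schur packs (odd groups).
 // For n_subparts_per_part = 3 and npacks_per_subpart = 2 the loops above
 // produce
 //   packindices_sub   = [0, 1, 4, 5, 8, 9]
 //   packindices_schur = [[2, 6],
 //                        [3, 7]]
 // i.e. pack k of sub-line s lives at 2*s*npacks_per_subpart + k, and the
 // Schur pack between sub-lines s and s+1 follows npacks_per_subpart later.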
1549  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1550 
1551  return interf;
1552  }
1553 
1557  template <typename MatrixType>
1558  struct BlockTridiags {
1559  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1560  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1561  using size_type_1d_view = typename impl_type::size_type_1d_view;
1562  using size_type_2d_view = typename impl_type::size_type_2d_view;
1563  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1564  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1565  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1566 
1567  // flat_td_ptr(i) is the index into flat-array values of the start of the
1568  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1569  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1570  // vector_length is the position in the pack.
1571  size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1572  // List of local column indices into A from which to grab
1573  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1574  local_ordinal_type_1d_view A_colindsub;
1575  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1576  // tridiag's pack, and i % vector_length gives the position in the pack.
1577  vector_type_3d_view values;
1578  // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1579  // Schur's pack, and i % vector_length gives the position in the pack.
1580  vector_type_3d_view values_schur;
1581  // inv(A_00)*A_01 block values.
1582  vector_type_4d_view e_values;
1583 
1584  // The following are for fused block Jacobi only.
1585  // For block row i, diag_offset(i)...diag_offset(i + bs^2)
1586  // is the range of scalars for the diagonal block.
1587  size_type_1d_view diag_offsets;
1588  // For fused residual+solve block Jacobi case,
1589  // this contains the diagonal block inverses in flat, local row indexing:
1590  // d_inv(row, :, :) gives the row-major block for row.
1591  btdm_scalar_type_3d_view d_inv;
1592 
1593  bool is_diagonal_only;
1594 
1595  BlockTridiags() = default;
1596  BlockTridiags(const BlockTridiags &b) = default;
1597 
1598  // Index into row-major block of a tridiag.
1599  template <typename idx_type>
1600  static KOKKOS_FORCEINLINE_FUNCTION
1601  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1602  // Given a row of a row-major tridiag, return the index of the first block
1603  // in that row.
1604  template <typename idx_type>
1605  static KOKKOS_FORCEINLINE_FUNCTION
1606  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1607  // Number of blocks in a tridiag having a given number of rows.
1608  template <typename idx_type>
1609  static KOKKOS_FORCEINLINE_FUNCTION
1610  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
1611  // Number of blocks associated to a Schur complement having a given number of rows.
1612  template <typename idx_type>
1613  static KOKKOS_FORCEINLINE_FUNCTION
1614  idx_type NumBlocksSchur (const idx_type& nrows) { return nrows > 0 ? 3*nrows + 2 : 0; }
1615  };
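 // Illustrative sketch, not used by the implementation: standalone constexpr
 // replicas of the indexing helpers above, to make the row-major tridiag
 // layout concrete. Blocks are stored as [D0, U0, L1, D1, U1, ..., D_{n-1}]:
 // row r > 0 begins at its subdiagonal block, index 3*r - 1; the diagonal
 // block sits at 3*r and the superdiagonal at 3*r + 1.
 namespace BlockTriDiIndexingSketch {
   constexpr int RowToIndex (int row) { return row > 0 ? 3*row - 1 : 0; }
   constexpr int IndexToRow (int ind) { return (ind + 1) / 3; }
   constexpr int NumBlocks (int nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
   // A 3-row tridiag stores D0,U0, L1,D1,U1, L2,D2 -> 7 blocks.
   static_assert(NumBlocks(3) == 7, "3 rows give 7 blocks");
   static_assert(RowToIndex(2) == 5 && IndexToRow(5) == 2, "row 2 starts at block 5");
 }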
1616 
1617 
1621  template<typename MatrixType>
1622  BlockTridiags<MatrixType>
1623  createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1624  IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
1625  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1626  using execution_space = typename impl_type::execution_space;
1627  using local_ordinal_type = typename impl_type::local_ordinal_type;
1628  using size_type = typename impl_type::size_type;
1629  using size_type_2d_view = typename impl_type::size_type_2d_view;
1630 
1631  constexpr int vector_length = impl_type::vector_length;
1632 
1632  BlockTridiags<MatrixType> btdm;
1634 
1635  const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1636 
1637  { // construct the flat index pointers into the tridiag values array.
1638  btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1639  const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part );
1640  Kokkos::parallel_scan
1641  ("createBlockTridiags::RangePolicy::flat_td_ptr",
1642  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1643  const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part);
1644  const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1645 
1646  if (final) {
1647  btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1648  }
1649  if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) {
1650  const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0);
1651  if (local_subpartidx % 2 == 0)
1652  update += btdm.NumBlocks(nrows);
1653  else
1654  update += btdm.NumBlocksSchur(nrows);
1655  }
1656  });
1657 
1658  const auto nblocks = Kokkos::create_mirror_view_and_copy
1659  (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1));
1660  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1661  }
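 // Worked example of the scan above: each part owns 2*n_subparts_per_part
 // columns of flat_td_ptr, alternating NumBlocks for sub-line tridiags (even
 // columns) with NumBlocksSchur for connections (odd columns); the final
 // column only records the running total. For n_subparts_per_part = 2 with
 // 2-row sub-lines and 2-row connections, NumBlocks(2) = 4 and
 // NumBlocksSchur(2) = 8, so
 //   flat_td_ptr(0, :) = [0, 4, 12, 16]
 // and part 1 starts where part 0 ended, at block offset 16.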
1662 
1663  // And the packed index pointers.
1664  if (vector_length == 1) {
1665  btdm.pack_td_ptr = btdm.flat_td_ptr;
1666  } else {
1667  //const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1668 
1669  local_ordinal_type npacks_per_subpart = 0;
1670  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1671  Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1672  for (local_ordinal_type ip=1;ip<=interf.nparts;++ip) //n_sub_parts_and_schur
1673  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1674  ++npacks_per_subpart;
1675 
1676  btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1677  const Kokkos::RangePolicy<execution_space> policy(0,npacks_per_subpart);
1678 
1679  Kokkos::parallel_for
1680  ("createBlockTridiags::RangePolicy::pack_td_ptr",
1681  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1682  for (local_ordinal_type j = 0; j < 2*interf.n_subparts_per_part; ++j) {
1683  const local_ordinal_type pack_id = ( j == 2*interf.n_subparts_per_part-1 ) ? i+(j-1)*npacks_per_subpart : i+j*npacks_per_subpart;
1684  const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id+1) - interf.packptr_sub(pack_id);
1685 
1686  const local_ordinal_type parti = interf.packptr_sub(pack_id);
1687  const local_ordinal_type partidx = parti%interf.nparts;
1688 
1689  for (local_ordinal_type pti=0;pti<nparts_in_pack;++pti) {
1690  btdm.pack_td_ptr(partidx+pti, j) = btdm.flat_td_ptr(i, j);
1691  }
1692  }
1693  });
1694  }
1695 
1696  btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1697 
1698  const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1699  constexpr local_ordinal_type connection_length = 2;
1700 
1701  host_pack_td_ptr_schur(0,0) = 0;
1702  for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1703  if (i % vector_length == 0) {
1704  if (i != 0)
1705  host_pack_td_ptr_schur(i,0) = host_pack_td_ptr_schur(i-1,host_pack_td_ptr_schur.extent(1)-1);
1706  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part-1; ++j) {
1707  host_pack_td_ptr_schur(i,j+1) = host_pack_td_ptr_schur(i,j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part-2 ? 1 : 0);
1708  }
1709  }
1710  else {
1711  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1712  host_pack_td_ptr_schur(i,j) = host_pack_td_ptr_schur(i-1,j);
1713  }
1714  }
1715  }
1716 
1717  Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
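 // Worked example: each Schur system between consecutive sub-lines has
 // connection_length = 2 block rows, hence NumBlocks(2) = 4 tridiag blocks,
 // plus one extra coupling block per interior neighbor (the (j != 0) and
 // (j != n_subparts_per_part-2) terms above). For n_subparts_per_part = 3
 // this gives the per-pack offsets
 //   host_pack_td_ptr_schur(0, :) = [0, 5, 10]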
1718 
1719 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1720  const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1721  std::cout << "flat_td_ptr = " << std::endl;
1722  for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1723  for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1724  std::cout << host_flat_td_ptr(i,j) << " ";
1725  }
1726  std::cout << std::endl;
1727  }
1728  std::cout << "flat_td_ptr end" << std::endl;
1729 
1730  const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1731 
1732  std::cout << "pack_td_ptr = " << std::endl;
1733  for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1734  for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1735  std::cout << host_pack_td_ptr(i,j) << " ";
1736  }
1737  std::cout << std::endl;
1738  }
1739  std::cout << "pack_td_ptr end" << std::endl;
1740 
1741 
1742  std::cout << "pack_td_ptr_schur = " << std::endl;
1743  for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1744  for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1745  std::cout << host_pack_td_ptr_schur(i,j) << " ";
1746  }
1747  std::cout << std::endl;
1748  }
1749  std::cout << "pack_td_ptr_schur end" << std::endl;
1750 #endif
1751 
1752  // values and A_colindsub are created in the symbolic phase
1753  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1754 
1755  return btdm;
1756  }
1757 
1758  // Set the tridiags to be I to the full pack block size. That way, if a
1759  // tridiag within a pack is shorter than the longest one, the extra blocks are
1760  // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1761  in the packed multivector are 0, and the tridiag LU reflects the extra I
1762  // blocks, then the solve proceeds as though the extra blocks aren't
1763  // present. Since this extra work is part of the SIMD calls, it's not actually
1764  // extra work. Instead, it means we don't have to put checks or masks in, or
1765  // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1766  // since the numeric phase fills in only the used entries, leaving these I
1767  // blocks intact.
1768  template<typename MatrixType>
1769  void
1770  setTridiagsToIdentity
1771  (const BlockTridiags<MatrixType>& btdm,
1772  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1773  {
1774  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1775  using execution_space = typename impl_type::execution_space;
1776  using local_ordinal_type = typename impl_type::local_ordinal_type;
1777  using size_type_2d_view = typename impl_type::size_type_2d_view;
1778 
1779  const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1780  const local_ordinal_type blocksize = btdm.values.extent(1);
1781 
1782  {
1783  const int vector_length = impl_type::vector_length;
1784  const int internal_vector_length = impl_type::internal_vector_length;
1785 
1786  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1787  using internal_vector_type = typename impl_type::internal_vector_type;
1788  using internal_vector_type_4d_view =
1789  typename impl_type::internal_vector_type_4d_view;
1790 
1791  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1792  const internal_vector_type_4d_view values
1793  (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1794  btdm.values.extent(0),
1795  btdm.values.extent(1),
1796  btdm.values.extent(2),
1797  vector_length/internal_vector_length);
1798  const local_ordinal_type vector_loop_size = values.extent(3);
1799 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1800  local_ordinal_type total_team_size(0);
1801  if (blocksize <= 5) total_team_size = 32;
1802  else if (blocksize <= 9) total_team_size = 64;
1803  else if (blocksize <= 12) total_team_size = 96;
1804  else if (blocksize <= 16) total_team_size = 128;
1805  else if (blocksize <= 20) total_team_size = 160;
1806  else total_team_size = 160;
1807  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1808  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1809 #elif defined(KOKKOS_ENABLE_HIP)
1810  // FIXME: HIP
1811  // These settings might be completely wrong
1812  // will have to do some experiments to decide
1813  // what makes sense on AMD GPUs
1814  local_ordinal_type total_team_size(0);
1815  if (blocksize <= 5) total_team_size = 32;
1816  else if (blocksize <= 9) total_team_size = 64;
1817  else if (blocksize <= 12) total_team_size = 96;
1818  else if (blocksize <= 16) total_team_size = 128;
1819  else if (blocksize <= 20) total_team_size = 160;
1820  else total_team_size = 160;
1821  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1822  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1823 #elif defined(KOKKOS_ENABLE_SYCL)
1824  // SYCL: FIXME
1825  local_ordinal_type total_team_size(0);
1826  if (blocksize <= 5) total_team_size = 32;
1827  else if (blocksize <= 9) total_team_size = 64;
1828  else if (blocksize <= 12) total_team_size = 96;
1829  else if (blocksize <= 16) total_team_size = 128;
1830  else if (blocksize <= 20) total_team_size = 160;
1831  else total_team_size = 160;
1832  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1833  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1834 #else
1835  // Host architecture: team size is always one
1836  const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1837 #endif
1838  Kokkos::parallel_for
1839  ("setTridiagsToIdentity::TeamPolicy",
1840  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1841  const local_ordinal_type k = member.league_rank();
1842  const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0);
1843  const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1);
1844 
1845  const local_ordinal_type diff = iend - ibeg;
1846  const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1847  const btdm_scalar_type one(1);
1848  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1849  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1850  const local_ordinal_type i = ibeg + ii*3;
1851  for (local_ordinal_type j=0;j<blocksize;++j) {
1852  values(i,j,j,v) = one;
1853  }
1854  });
1855  });
1856  });
1857  }
1858  }
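 // In effect, the identity padding makes a short tridiag in a pack solve a
 // harmless extended system,
 //   [T 0; 0 I] [x; y] = [b; 0]  =>  x = T^{-1} b, y = 0,
 // so the padded SIMD lanes carry zeros through factorization and solve with
 // no branches, masks, or NaN handling.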
1859 
1863  template<typename MatrixType>
1864  void
1865  performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1866  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1867  const BlockHelperDetails::PartInterface<MatrixType> &interf,
1868  BlockTridiags<MatrixType> &btdm,
1869  BlockHelperDetails::AmD<MatrixType> &amd,
1870  const bool overlap_communication_and_computation,
1871  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
1872  bool useSeqMethod,
1873  bool use_fused_jacobi) {
1874  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);
1875 
1876  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1877 
1878  using execution_space = typename impl_type::execution_space;
1879  using host_execution_space = typename impl_type::host_execution_space;
1880 
1881  using local_ordinal_type = typename impl_type::local_ordinal_type;
1882  using global_ordinal_type = typename impl_type::global_ordinal_type;
1883  using size_type = typename impl_type::size_type;
1884  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1885  using size_type_1d_view = typename impl_type::size_type_1d_view;
1886  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1887  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1888  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
1889  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1890  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1891 
1892  constexpr int vector_length = impl_type::vector_length;
1893 
1894  const auto comm = A->getRowMap()->getComm();
1895 
1896  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1897  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1898 
1899  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
1900  TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1901  const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows()/g->getLocalNumRows();
1902 
1903  // mirroring to host
1904  const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1905  const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1906  const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1907  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1908  const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1909 
1910  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1911 
1912  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());
1913 
1914  // find column to row map on host
1915 
1916  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1917  {
1918  const auto rowmap = g->getRowMap();
1919  const auto colmap = g->getColMap();
1920  const auto dommap = g->getDomainMap();
1921  TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1922 
1923 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1924  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1925  Kokkos::parallel_for
1926  ("performSymbolicPhase::RangePolicy::col2row",
1927  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1928  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1929  TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
1930  if (dommap->isNodeGlobalElement(gid)) {
1931  const local_ordinal_type lc = colmap->getLocalElement(gid);
1932 # if defined(BLOCKTRIDICONTAINER_DEBUG)
1933  TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1934  BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
1935  << " gives an invalid local column.");
1936 # endif
1937  col2row(lc) = lr;
1938  }
1939  });
1940 #endif
1941  }
1942 
1943  // construct the D and R graphs in A = D + R.
1944  {
1945  const auto local_graph = g->getLocalGraphHost();
1946  const auto local_graph_rowptr = local_graph.row_map;
1947  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1948  const auto local_graph_colidx = local_graph.entries;
1949 
1950  //assume no overlap.
1951 
1952  Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1953  {
1954  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1955  Kokkos::parallel_for
1956  ("performSymbolicPhase::RangePolicy::lclrow2idx",
1957  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1958  lclrow2idx[lclrow(i)] = i;
1959  });
1960  }
1961 
1962  // count (block) nnzs in D and R.
1963  typedef BlockHelperDetails::sum_reducer_type<size_type,3,host_execution_space> sum_reducer_type;
1964  typename sum_reducer_type::value_type sum_reducer_value;
1965  {
1966  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1967  Kokkos::parallel_reduce
1968  // profiling interface does not work
1969  (//"performSymbolicPhase::RangePolicy::count_nnz",
1970  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1971  // LID -> index.
1972  const local_ordinal_type ri0 = lclrow2idx[lr];
1973  const local_ordinal_type pi0 = rowidx2part(ri0);
1974  for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1975  const local_ordinal_type lc = local_graph_colidx(j);
1976  const local_ordinal_type lc2r = col2row[lc];
1977  bool incr_R = false;
1978  do { // breakable
1979  if (lc2r == (local_ordinal_type) -1) {
1980  incr_R = true;
1981  break;
1982  }
1983  const local_ordinal_type ri = lclrow2idx[lc2r];
1984  const local_ordinal_type pi = rowidx2part(ri);
1985  if (pi != pi0) {
1986  incr_R = true;
1987  break;
1988  }
1989  // Test for being in the tridiag. This is done in index space. In
1990  // LID space, tridiag LIDs in a row are not necessarily related by
1991  // {-1, 0, 1}.
1992  if (ri0 + 1 >= ri && ri0 <= ri + 1)
1993  ++update.v[0]; // D_nnz
1994  else
1995  incr_R = true;
1996  } while (0);
1997  if (incr_R) {
1998  if (lc < nrows) ++update.v[1]; // R_nnz_owned
1999  else ++update.v[2]; // R_nnz_remote
2000  }
2001  }
2002  }, sum_reducer_type(sum_reducer_value));
2003  }
2004  size_type D_nnz = sum_reducer_value.v[0];
2005  size_type R_nnz_owned = sum_reducer_value.v[1];
2006  size_type R_nnz_remote = sum_reducer_value.v[2];
2007 
2008  if (!overlap_communication_and_computation) {
2009  R_nnz_owned += R_nnz_remote;
2010  R_nnz_remote = 0;
2011  }
2012 
2013  // construct the D_00 graph.
2014  {
2015  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2016 
2017  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
2018  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2019 
2020 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2021  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2022 #endif
2023 
2024  const local_ordinal_type nparts = partptr.extent(0) - 1;
2025 
2026  {
2027  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2028  Kokkos::parallel_for
2029  ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2030  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
2031  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2032  local_ordinal_type offset = 0;
2033  for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
2034  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2035  offset = 1;
2036  const local_ordinal_type lr0 = lclrow(ri0);
2037  const size_type j0 = local_graph_rowptr(lr0);
2038  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
2039  const local_ordinal_type lc = local_graph_colidx(j);
2040  const local_ordinal_type lc2r = col2row[lc];
2041  if (lc2r == (local_ordinal_type) -1) continue;
2042  const local_ordinal_type ri = lclrow2idx[lc2r];
2043  const local_ordinal_type pi = rowidx2part(ri);
2044  if (pi != pi0) continue;
2045  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
2046  const local_ordinal_type row_entry = j - j0;
2047  D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry;
2048  }
2049  }
2050  });
2051  }
2052 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2053  for (size_t i=0;i<D_A_colindsub.extent(0);++i)
2054  TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2055 #endif
2056  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2057 
2058  // Allocate values.
2059  {
2060  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1);
2061  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2062  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2063 
2064  if (interf.n_subparts_per_part > 1) {
2065  const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1);
2066  const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2067  btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2068  }
2069 
2070  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2071  }
2072  }
2073 
2074  // Construct the R graph.
2075  {
2076  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2077  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2078 
2079  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2080  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2081 
2082  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2083  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2084 
2085  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2086  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2087 
2088  {
2089  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
2090  Kokkos::parallel_for
2091  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2092  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2093  const local_ordinal_type ri0 = lclrow2idx[lr];
2094  const local_ordinal_type pi0 = rowidx2part(ri0);
2095  const size_type j0 = local_graph_rowptr(lr);
2096  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2097  const local_ordinal_type lc = local_graph_colidx(j);
2098  const local_ordinal_type lc2r = col2row[lc];
2099  if (lc2r != (local_ordinal_type) -1) {
2100  const local_ordinal_type ri = lclrow2idx[lc2r];
2101  const local_ordinal_type pi = rowidx2part(ri);
2102  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2103  continue;
2104  }
2105  }
2106  // exclusive scan will be performed later
2107  if (!overlap_communication_and_computation || lc < nrows) {
2108  ++R_rowptr(lr);
2109  } else {
2110  ++R_rowptr_remote(lr);
2111  }
2112  }
2113  });
2114  }
2115 
2116  // exclusive scan
2117  typedef BlockHelperDetails::ArrayValueType<size_type,2> update_type;
2118  {
2119  Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
2120  Kokkos::parallel_scan
2121  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2122  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
2123  update_type &update,
2124  const bool &final) {
2125  update_type val;
2126  val.v[0] = R_rowptr(lr);
2127  if (overlap_communication_and_computation)
2128  val.v[1] = R_rowptr_remote(lr);
2129 
2130  if (final) {
2131  R_rowptr(lr) = update.v[0];
2132  if (overlap_communication_and_computation)
2133  R_rowptr_remote(lr) = update.v[1];
2134 
2135  if (lr < nrows) {
2136  const local_ordinal_type ri0 = lclrow2idx[lr];
2137  const local_ordinal_type pi0 = rowidx2part(ri0);
2138 
2139  size_type cnt_rowptr = R_rowptr(lr);
2140  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
2141 
2142  const size_type j0 = local_graph_rowptr(lr);
2143  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2144  const local_ordinal_type lc = local_graph_colidx(j);
2145  const local_ordinal_type lc2r = col2row[lc];
2146  if (lc2r != (local_ordinal_type) -1) {
2147  const local_ordinal_type ri = lclrow2idx[lc2r];
2148  const local_ordinal_type pi = rowidx2part(ri);
2149  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2150  continue;
2151  }
2152  const local_ordinal_type row_entry = j - j0;
2153  if (!overlap_communication_and_computation || lc < nrows)
2154  R_A_colindsub(cnt_rowptr++) = row_entry;
2155  else
2156  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2157  }
2158  }
2159  }
2160  update += val;
2161  });
2162  }
2163  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
2164  Kokkos::deep_copy(amd.rowptr, R_rowptr);
2165  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2166  if (overlap_communication_and_computation) {
2167  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
2168  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2169  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2170  }
2171 
2172  // Allocate or view values.
2173  if (hasBlockCrsMatrix)
2174  amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst());
2175  else {
2176  amd.tpetra_values = (const_cast<crs_matrix_type*>(A_crs.get()))->getLocalValuesDevice (Tpetra::Access::ReadWrite);
2177  }
2178  }
2179 
2180  // Allocate view for E and initialize the values with B:
2181 
2182  if (interf.n_subparts_per_part > 1)
2183  btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2184  }
2185  // Precompute offsets of each A and x entry to speed up residual.
2186  // Applies if all of these are true:
2187  // - hasBlockCrsMatrix
2188  // - execution_space is a GPU
2189  // - !useSeqMethod (since this uses a different scheme for indexing A,x)
2190  //
2191  // Reading A, x take up to 4 and 6 levels of indirection respectively,
2192  // but precomputing the offsets reduces it to 2 for both (get index, then value)
2193  if(BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix)
2194  {
2195  bool is_async_importer_active = !async_importer.is_null();
2196  local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2197  bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2198  BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2199  }
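 // Illustrative sketch (member names hypothetical): the precomputation
 // conceptually replaces the per-entry pointer chase
 //   value = A_values[ block_offset( rowptr(lr) + A_colindsub(j) ) + ij ]
 // with one flat offset fetched per entry,
 //   value = A_values[ precomputed_offset(j) + ij ]
 // cutting the A and x reads to two dependent loads each (index, then value).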
2200 
2201  // If using fused block Jacobi path, allocate diagonal inverses here (d_inv) and find diagonal offsets.
2202  if(use_fused_jacobi) {
2203  btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize);
2204  auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2205  auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2206  btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2207  }
2208  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2209  }
2210 
2211 
2215  template<typename ArgActiveExecutionMemorySpace>
2216  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
2217 
2218  template<>
2219  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2220  typedef KB::Mode::Serial mode_type;
2221 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2222  typedef KB::Algo::Level3::CompactMKL algo_type;
2223 #else
2224  typedef KB::Algo::Level3::Blocked algo_type;
2225 #endif
2226  static int recommended_team_size(const int /* blksize */,
2227  const int /* vector_length */,
2228  const int /* internal_vector_length */) {
2229  return 1;
2230  }
2231 
2232  };
2233 
2234 #if defined(KOKKOS_ENABLE_CUDA)
2235  static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
2236  const int vector_length,
2237  const int internal_vector_length) {
2238  const int vector_size = vector_length/internal_vector_length;
2239  int total_team_size(0);
2240  if (blksize <= 5) total_team_size = 32;
2241  else if (blksize <= 9) total_team_size = 32; // 64
2242  else if (blksize <= 12) total_team_size = 96;
2243  else if (blksize <= 16) total_team_size = 128;
2244  else if (blksize <= 20) total_team_size = 160;
2245  else total_team_size = 160;
2246  return 2*total_team_size/vector_size;
2247  }
2248  template<>
2249  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2250  typedef KB::Mode::Team mode_type;
2251  typedef KB::Algo::Level3::Unblocked algo_type;
2252  static int recommended_team_size(const int blksize,
2253  const int vector_length,
2254  const int internal_vector_length) {
2255  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2256  }
2257  };
2258  template<>
2259  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2260  typedef KB::Mode::Team mode_type;
2261  typedef KB::Algo::Level3::Unblocked algo_type;
2262  static int recommended_team_size(const int blksize,
2263  const int vector_length,
2264  const int internal_vector_length) {
2265  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2266  }
2267  };
2268 #endif
2269 
2270 #if defined(KOKKOS_ENABLE_HIP)
2271  static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
2272  const int vector_length,
2273  const int internal_vector_length) {
2274  const int vector_size = vector_length/internal_vector_length;
2275  int total_team_size(0);
2276  if (blksize <= 5) total_team_size = 32;
2277  else if (blksize <= 9) total_team_size = 32; // 64
2278  else if (blksize <= 12) total_team_size = 96;
2279  else if (blksize <= 16) total_team_size = 128;
2280  else if (blksize <= 20) total_team_size = 160;
2281  else total_team_size = 160;
2282  return 2*total_team_size/vector_size;
2283  }
2284  template<>
2285  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2286  typedef KB::Mode::Team mode_type;
2287  typedef KB::Algo::Level3::Unblocked algo_type;
2288  static int recommended_team_size(const int blksize,
2289  const int vector_length,
2290  const int internal_vector_length) {
2291  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2292  }
2293  };
2294  template<>
2295  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2296  typedef KB::Mode::Team mode_type;
2297  typedef KB::Algo::Level3::Unblocked algo_type;
2298  static int recommended_team_size(const int blksize,
2299  const int vector_length,
2300  const int internal_vector_length) {
2301  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2302  }
2303  };
2304 #endif
2305 
2306 #if defined(KOKKOS_ENABLE_SYCL)
2307  static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
2308  const int vector_length,
2309  const int internal_vector_length) {
2310  const int vector_size = vector_length/internal_vector_length;
2311  int total_team_size(0);
2312  if (blksize <= 5) total_team_size = 32;
2313  else if (blksize <= 9) total_team_size = 32; // 64
2314  else if (blksize <= 12) total_team_size = 96;
2315  else if (blksize <= 16) total_team_size = 128;
2316  else if (blksize <= 20) total_team_size = 160;
2317  else total_team_size = 160;
2318  return 2*total_team_size/vector_size;
2319  }
2320  template<>
2321  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2322  typedef KB::Mode::Team mode_type;
2323  typedef KB::Algo::Level3::Unblocked algo_type;
2324  static int recommended_team_size(const int blksize,
2325  const int vector_length,
2326  const int internal_vector_length) {
2327  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2328  }
2329  };
2330  template<>
2331  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2332  typedef KB::Mode::Team mode_type;
2333  typedef KB::Algo::Level3::Unblocked algo_type;
2334  static int recommended_team_size(const int blksize,
2335  const int vector_length,
2336  const int internal_vector_length) {
2337  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2338  }
2339  };
2340 #endif
2341 
2342  template<typename impl_type, typename WWViewType>
2343  KOKKOS_INLINE_FUNCTION
2344  void
2345  solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2346  const typename impl_type::local_ordinal_type &/* blocksize */,
2347  const typename impl_type::local_ordinal_type &i0,
2348  const typename impl_type::local_ordinal_type &r0,
2349  const typename impl_type::local_ordinal_type &nrows,
2350  const typename impl_type::local_ordinal_type &v,
2351  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2352  const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2353  const WWViewType &WW,
2354  const bool skip_first_pass=false) {
2355  using execution_space = typename impl_type::execution_space;
2356  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2357  using member_type = typename team_policy_type::member_type;
2358  using local_ordinal_type = typename impl_type::local_ordinal_type;
2359 
2360  typedef SolveTridiagsDefaultModeAndAlgo
2361  <typename execution_space::memory_space> default_mode_and_algo_type;
2362 
2363  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2364  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2365 
2366  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2367 
2368  // constant
2369  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2370  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2371 
2372  // subview pattern
2373  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2374  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2375  auto X2 = X1;
2376 
2377  local_ordinal_type i = i0, r = r0;
2378 
2379 
2380  if (nrows > 1) {
2381  // solve Lx = x
2382  if (skip_first_pass) {
2383  i += (nrows-2) * 3;
2384  r += (nrows-2);
2385  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2386  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2387  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2388  KB::Trsm<member_type,
2389  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2390  default_mode_type,default_algo_type>
2391  ::invoke(member, one, A, X2);
2392  X1.assign_data( X2.data() );
2393  i+=3;
2394  }
2395  else {
2396  KB::Trsm<member_type,
2397  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2398  default_mode_type,default_algo_type>
2399  ::invoke(member, one, A, X1);
2400  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2401  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2402  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2403  member.team_barrier();
2404  KB::Gemm<member_type,
2405  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2406  default_mode_type,default_algo_type>
2407  ::invoke(member, -one, A, X1, one, X2);
2408  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2409  KB::Trsm<member_type,
2410  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2411  default_mode_type,default_algo_type>
2412  ::invoke(member, one, A, X2);
2413  X1.assign_data( X2.data() );
2414  }
2415  }
2416 
2417  // solve Ux = x
2418  KB::Trsm<member_type,
2419  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2420  default_mode_type,default_algo_type>
2421  ::invoke(member, one, A, X1);
2422  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2423  i -= 3;
2424  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2425  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2426  member.team_barrier();
2427  KB::Gemm<member_type,
2428  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2429  default_mode_type,default_algo_type>
2430  ::invoke(member, -one, A, X1, one, X2);
2431 
2432  A.assign_data( &D_internal_vector_values(i,0,0,v) );
2433  KB::Trsm<member_type,
2434  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2435  default_mode_type,default_algo_type>
2436  ::invoke(member, one, A, X2);
2437  X1.assign_data( X2.data() );
2438  }
2439  } else {
2440  // matrix is already inverted
2441  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2442  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2443  ::invoke(member, X1, W);
2444  member.team_barrier();
2445  KB::Gemm<member_type,
2446  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2447  default_mode_type,default_algo_type>
2448  ::invoke(member, one, A, W, zero, X1);
2449  }
2450 
2451  }
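 // The loops above apply block forward/backward substitution on the packed
 // LU factors,
 //   forward  (r = 1..nrows-1): x_r <- L_rr^{-1} (x_r - L_{r,r-1} x_{r-1})
 //   backward (r = nrows-2..0): x_r <- U_rr^{-1} (x_r - U_{r,r+1} x_{r+1})
 // with Trsm supplying the triangular solves and Gemm the off-diagonal
 // updates; the nrows == 1 branch multiplies by the stored inverse instead.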
2452 
2453  template<typename impl_type, typename WWViewType, typename XViewType>
2454  KOKKOS_INLINE_FUNCTION
2455  void
2456  solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2457  const typename impl_type::local_ordinal_type &blocksize,
2458  const typename impl_type::local_ordinal_type &i0,
2459  const typename impl_type::local_ordinal_type &r0,
2460  const typename impl_type::local_ordinal_type &nrows,
2461  const typename impl_type::local_ordinal_type &v,
2462  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2463  const XViewType &X_internal_vector_values, //Unmanaged<typename impl_type::internal_vector_type_4d_view>
2464  const WWViewType &WW) {
2465  using execution_space = typename impl_type::execution_space;
2466  //using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2467  //using member_type = typename team_policy_type::member_type;
2468  using local_ordinal_type = typename impl_type::local_ordinal_type;
2469 
2470  typedef SolveTridiagsDefaultModeAndAlgo
2471  <typename execution_space::memory_space> default_mode_and_algo_type;
2472 
2473  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2474  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2475 
2476  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2477 
2478  // base pointers
2479  auto A = D_internal_vector_values.data();
2480  auto X = X_internal_vector_values.data();
2481 
2482  // constant
2483  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2484  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2485  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2486 
2487  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2488  const local_ordinal_type astep = D_internal_vector_values.stride_0();
2489  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2490  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2491  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2492  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2493 
2494  // move to starting point
2495  A += i0*astep + v;
2496  X += r0*xstep + v;
2497 
2498  //for (local_ordinal_type col=0;col<num_vectors;++col)
2499  if (nrows > 1) {
2500  // solve Lx = x
2501  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2502  (default_mode_type,default_algo_type,
2503  member,
2504  KB::Diag::Unit,
2505  blocksize,blocksize,
2506  one,
2507  A, as0, as1,
2508  X, xs0);
2509 
2510  for (local_ordinal_type tr=1;tr<nrows;++tr) {
2511  member.team_barrier();
2512  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2513  (default_mode_type,default_algo_type,
2514  member,
2515  blocksize, blocksize,
2516  -one,
2517  A+2*astep, as0, as1,
2518  X, xs0,
2519  one,
2520  X+1*xstep, xs0);
2521  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2522  (default_mode_type,default_algo_type,
2523  member,
2524  KB::Diag::Unit,
2525  blocksize,blocksize,
2526  one,
2527  A+3*astep, as0, as1,
2528  X+1*xstep, xs0);
2529 
2530  A += 3*astep;
2531  X += 1*xstep;
2532  }
2533 
2534  // solve Ux = x
2535  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2536  (default_mode_type,default_algo_type,
2537  member,
2538  KB::Diag::NonUnit,
2539  blocksize, blocksize,
2540  one,
2541  A, as0, as1,
2542  X, xs0);
2543 
2544  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2545  A -= 3*astep;
2546  member.team_barrier();
2547  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2548  (default_mode_type,default_algo_type,
2549  member,
2550  blocksize, blocksize,
2551  -one,
2552  A+1*astep, as0, as1,
2553  X, xs0,
2554  one,
2555  X-1*xstep, xs0);
2556  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2557  (default_mode_type,default_algo_type,
2558  member,
2559  KB::Diag::NonUnit,
2560  blocksize, blocksize,
2561  one,
2562  A, as0, as1,
2563  X-1*xstep,xs0);
2564  X -= 1*xstep;
2565  }
2566  // for multiple rhs
2567  //X += xs1;
2568  } else {
2569  const local_ordinal_type ws0 = WW.stride_0();
2570  auto W = WW.data() + v;
2571  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2572  (default_mode_type,
2573  member, blocksize, X, xs0, W, ws0);
2574  member.team_barrier();
2575  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2576  (default_mode_type,default_algo_type,
2577  member,
2578  blocksize, blocksize,
2579  one,
2580  A, as0, as1,
2581  W, xs0,
2582  zero,
2583  X, xs0);
2584  }
2585  }
2586 
2587  template<typename local_ordinal_type, typename ViewType>
2588  void writeBTDValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2589 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2590  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2591  std::ofstream myfile;
2592  myfile.open (fileName);
2593 
2594  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2595  local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2596  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2597  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2598 
2599  const local_ordinal_type block_size = scalar_values.extent(1);
2600 
2601  const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size;
2602  const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2603 
2604  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2605 
2606  myfile << "%%MatrixMarket matrix coordinate real general"<< std::endl;
2607  myfile << "%%nnz = " << nnz;
2608  myfile << " block size = " << block_size;
2609  myfile << " number of blocks = " << n_blocks;
2610  myfile << " number of parts = " << n_parts;
2611  myfile << " number of blocks per part = " << n_blocks_per_part;
2612  myfile << " number of rows = " << n_rows ;
2613  myfile << " number of cols = " << n_rows;
2614  myfile << " number of packs = " << n_packs << std::endl;
2615 
2616  myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2617 
2618  local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2619  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2620  for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2621  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2622  for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2623  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2624  if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2625  continue;
2626  if (i_block_in_part % 3 == 0) {
2627  current_row_offset = i_block_in_part/3 * block_size;
2628  current_col_offset = i_block_in_part/3 * block_size;
2629  }
2630  else if (i_block_in_part % 3 == 1) {
2631  current_row_offset = (i_block_in_part-1)/3 * block_size;
2632  current_col_offset = ((i_block_in_part-1)/3+1) * block_size;
2633  }
2634  else if (i_block_in_part % 3 == 2) {
2635  current_row_offset = ((i_block_in_part-2)/3+1) * block_size;
2636  current_col_offset = (i_block_in_part-2)/3 * block_size;
2637  }
2638  current_row_offset += current_part_idx * n_rows_per_part;
2639  current_col_offset += current_part_idx * n_rows_per_part;
2640  for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2641  for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2642  current_row = current_row_offset + i_in_block + 1;
2643  current_col = current_col_offset + j_in_block + 1;
2644  myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2645  }
2646  }
2647  }
2648  }
2649  }
2650 
2651  myfile.close();
2652 #endif
2653  }
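// The coordinate lines above follow the 1-based MatrixMarket convention, one
// "row col value" triple per block entry. A minimal standalone sketch for a
// single row-major block (hypothetical helper, not part of Ifpack2), guarded
// by the same macro as the writer above:
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
inline void writeBlockCoordinateSketch(std::ofstream &out, const double *block,
                                       const int block_size,
                                       const int row_offset, const int col_offset) {
  for (int i = 0; i < block_size; ++i)
    for (int j = 0; j < block_size; ++j)
      out << (row_offset + i + 1) << " " << (col_offset + j + 1) << " "
          << block[i * block_size + j] << std::endl;
}
#endif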
2654 
2655  template<typename local_ordinal_type, typename ViewType>
2656  void write4DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2657 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2658  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2659  std::ofstream myfile;
2660  myfile.open (fileName);
2661 
2662  const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2663  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2664  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2665 
2666  const local_ordinal_type block_size = scalar_values.extent(1);
2667  const local_ordinal_type n_cols = scalar_values.extent(2);
2668 
2669  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2670  const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2671 
2672  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2673 
2674 
2675  myfile << "%%MatrixMarket matrix array real general"<< std::endl;
2676  myfile << "%%block size = " << block_size;
2677  myfile << " number of blocks = " << n_blocks;
2678  myfile << " number of parts = " << n_parts;
2679  myfile << " number of blocks per part = " << n_blocks_per_part;
2680  myfile << " number of rows = " << n_rows ;
2681  myfile << " number of cols = " << n_cols;
2682  myfile << " number of packs = " << n_packs << std::endl;
2683 
2684  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2685 
2686  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2687  (void) current_row_offset;
2688  (void) current_part_idx;
2689  for (local_ordinal_type j_in_block=0;j_in_block<n_cols;++j_in_block) {
2690  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2691  for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2692  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2693  for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2694  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2695 
2696  if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2697  continue;
2698  for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2699  myfile << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2700  }
2701  }
2702  }
2703  }
2704  }
2705  myfile.close();
2706 #endif
2707  }
2708 
2709  template<typename local_ordinal_type, typename ViewType>
2710  void write5DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2711 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2712  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2713  std::ofstream myfile;
2714  myfile.open (fileName);
2715 
2716  const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2717  const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack;
2718  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2719 
2720  const local_ordinal_type block_size = scalar_values.extent(2);
2721  const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2722  const local_ordinal_type n_cols = n_blocks_cols * block_size;
2723 
2724  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2725  const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2726 
2727  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2728 
2729  myfile << "%%MatrixMarket matrix array real general"<< std::endl;
2730  myfile << "%%block size = " << block_size;
2731  myfile << " number of blocks = " << n_blocks;
2732  myfile << " number of parts = " << n_parts;
2733  myfile << " number of blocks per part = " << n_blocks_per_part;
2734  myfile << " number of rows = " << n_rows ;
2735  myfile << " number of cols = " << n_cols;
2736  myfile << " number of packs = " << n_packs << std::endl;
2737 
2738  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2739 
2740  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2741  (void) current_row_offset;
2742  (void) current_part_idx;
2743  for (local_ordinal_type i_block_col=0;i_block_col<n_blocks_cols;++i_block_col) {
2744  for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2745  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2746  for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2747  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2748  for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2749  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2750 
2751  if (current_block_idx >= (local_ordinal_type) scalar_values.extent(1))
2752  continue;
2753  for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2754  myfile << scalar_values(i_block_col,current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2755  }
2756  }
2757  }
2758  }
2759  }
2760  }
2761  myfile.close();
2762 #endif
2763  }
2764 
2765  template<typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2766  KOKKOS_INLINE_FUNCTION
2767  void
2768  copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2769 /*
2770  // Kokkos::Experimental::local_deep_copy
2771  auto teamVectorRange =
2772  Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2773  member, view1.extent(0), view1.extent(1), view1.extent(2));
2774 
2775  Kokkos::parallel_for
2776  (teamVectorRange,
2777  [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2778  view1(i,j,k) = view2(i,j,k);
2779  });
2780 */
2781  Kokkos::Experimental::local_deep_copy(member, view1, view2);
2782  }
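// local_deep_copy above performs the same team-collective copy as the
// commented-out variant; a sketch with the parallel hierarchy written out
// explicitly, assuming rank-3 views of identical shape (illustrative only):
template<typename member_type, typename ViewType1, typename ViewType2>
KOKKOS_INLINE_FUNCTION
void copy3DViewExplicitSketch(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
  Kokkos::parallel_for
    (Kokkos::TeamThreadRange(member, view1.extent(0)),
     [&](const int i) {
      Kokkos::parallel_for
        (Kokkos::ThreadVectorRange(member, view1.extent(1)*view1.extent(2)),
         [&](const int jk) {
          const int j = static_cast<int>(jk / view1.extent(2));
          const int k = static_cast<int>(jk % view1.extent(2));
          view1(i,j,k) = view2(i,j,k);
        });
    });
}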
2783  template<typename MatrixType, int ScratchLevel>
2784  struct ExtractAndFactorizeTridiags {
2785  public:
2786  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2787  // a functor cannot have both device_type and execution_space; having both triggers a specialization error in Kokkos
2788  using execution_space = typename impl_type::execution_space;
2789  using memory_space = typename impl_type::memory_space;
2791  using local_ordinal_type = typename impl_type::local_ordinal_type;
2792  using size_type = typename impl_type::size_type;
2793  using impl_scalar_type = typename impl_type::impl_scalar_type;
2794  using magnitude_type = typename impl_type::magnitude_type;
2796  using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
2797  using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
2799  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2800  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2801  using size_type_1d_view = typename impl_type::size_type_1d_view;
2802  using size_type_2d_view = typename impl_type::size_type_2d_view;
2803  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2805  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2806  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2807  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2808  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2809  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2810  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2811  using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view;
2812  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
2813  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2814  using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2815  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2816  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2817  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2818  using local_crs_graph_type = typename impl_type::local_crs_graph_type;
2819  using colinds_view = typename local_crs_graph_type::entries_type;
2820 
2821  using internal_vector_type = typename impl_type::internal_vector_type;
2822  static constexpr int vector_length = impl_type::vector_length;
2823  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2824  static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2825  static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2826  // half_vector_length is used for block Jacobi factorization.
2827  // Shared memory requirement is twice as large (per vector lane) as for general tridi factorization, so
2828  // reducing vector length (if possible) keeps the shared requirement constant. This avoids the performance
2829  // cliff of switching from level 0 to level 1 scratch.
2830  static constexpr int half_vector_length = impl_type::half_vector_length;
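 // Worked example of that tradeoff: with vector_length = 8 and blocksize^2
 // scratch scalars per lane, the general path needs 8*blocksize^2 scalars,
 // while block Jacobi needs 2*blocksize^2 per lane (two scratch blocks, WW1
 // and WW2); at half_vector_length = 4 that is 2*4*blocksize^2 = 8*blocksize^2,
 // i.e. the same level-0 scratch footprint as the general path.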
2831 
2833  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2834  using member_type = typename team_policy_type::member_type;
2835 
2836  private:
2837  // part interface
2838  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2839  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2840  const local_ordinal_type max_partsz;
2841  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
2842  using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
2843  ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2844  ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2845  ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2846  // block tridiags
2847  const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2848  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2849  const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2850  const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2851  const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2852  const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2853  const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2854  const Unmanaged<size_type_1d_view> diag_offsets;
2855  // shared information
2856  const local_ordinal_type blocksize, blocksize_square;
2857  // diagonal safety
2858  const magnitude_type tiny;
2859  const local_ordinal_type vector_loop_size;
2860 
2861  bool hasBlockCrsMatrix;
2862 
2863  public:
2864  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2865  const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2866  const Teuchos::RCP<const row_matrix_type> &A_,
2867  const Teuchos::RCP<const crs_graph_type> &G_,
2868  const magnitude_type& tiny_) :
2869  // interface
2870  partptr(interf_.partptr),
2871  lclrow(interf_.lclrow),
2872  packptr(interf_.packptr),
2873  packindices_sub(interf_.packindices_sub),
2874  packptr_sub(interf_.packptr_sub),
2875  partptr_sub(interf_.partptr_sub),
2876  part2packrowidx0_sub(interf_.part2packrowidx0_sub),
2877  packindices_schur(interf_.packindices_schur),
2878  max_partsz(interf_.max_partsz),
2879  // block tridiags
2880  pack_td_ptr(btdm_.pack_td_ptr),
2881  flat_td_ptr(btdm_.flat_td_ptr),
2882  pack_td_ptr_schur(btdm_.pack_td_ptr_schur),
2883  A_colindsub(btdm_.A_colindsub),
2884  internal_vector_values((internal_vector_type*)btdm_.values.data(),
2885  btdm_.values.extent(0),
2886  btdm_.values.extent(1),
2887  btdm_.values.extent(2),
2888  vector_length/internal_vector_length),
2889  internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(),
2890  btdm_.values_schur.extent(0),
2891  btdm_.values_schur.extent(1),
2892  btdm_.values_schur.extent(2),
2893  vector_length/internal_vector_length),
2894  e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(),
2895  btdm_.e_values.extent(0),
2896  btdm_.e_values.extent(1),
2897  btdm_.e_values.extent(2),
2898  btdm_.e_values.extent(3),
2899  vector_length/internal_vector_length),
2900  scalar_values((btdm_scalar_type*)btdm_.values.data(),
2901  btdm_.values.extent(0),
2902  btdm_.values.extent(1),
2903  btdm_.values.extent(2),
2904  vector_length),
2905  scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(),
2906  btdm_.values_schur.extent(0),
2907  btdm_.values_schur.extent(1),
2908  btdm_.values_schur.extent(2),
2909  vector_length),
2910  e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(),
2911  btdm_.e_values.extent(0),
2912  btdm_.e_values.extent(1),
2913  btdm_.e_values.extent(2),
2914  btdm_.e_values.extent(3),
2915  vector_length),
2916  d_inv(btdm_.d_inv),
2917  diag_offsets(btdm_.diag_offsets),
2918  blocksize(btdm_.values.extent(1)),
2919  blocksize_square(blocksize*blocksize),
2920  // diagonal weight to avoid zero pivots
2921  tiny(tiny_),
2922  vector_loop_size(vector_length/internal_vector_length) {
2923  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
2924  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
2925 
2926  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
2927  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
2928 
2929  hasBlockCrsMatrix = ! A_bcrs.is_null ();
2930 
2931  A_block_rowptr = G_->getLocalGraphDevice().row_map;
2932  if (hasBlockCrsMatrix) {
2933  A_values = const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst();
2934  }
2935  else {
2936  A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
2937  A_values = A_crs->getLocalValuesDevice (Tpetra::Access::ReadOnly);
2938  }
2939  }
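 // The view pairs initialized above (internal_vector_values/scalar_values and
 // friends) alias a single allocation: a buffer of SIMD vectors of width
 // vector_length is reinterpreted as a scalar view with one extra trailing
 // dimension. Standalone sketch of the aliasing, with simd4 as an illustrative
 // stand-in for the vector type (not an Ifpack2 name):
 struct VectorScalarAliasSketch {
   struct simd4 { double v[4]; };
   static bool sameEntry(const simd4 *packed, const int b, const int lane) {
     const double *flat = reinterpret_cast<const double*>(packed);
     return flat[b*4 + lane] == packed[b].v[lane]; // one allocation, two indexings
   }
 };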
2940 
2941  private:
2942 
2943  KOKKOS_INLINE_FUNCTION
2944  void
2945  extract(local_ordinal_type partidx,
2946  local_ordinal_type local_subpartidx,
2947  local_ordinal_type npacks) const {
2948 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2949  printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
2950 #endif
2951  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2952  const size_type kps = pack_td_ptr(partidx, local_subpartidx);
2953  local_ordinal_type kfs[vector_length] = {};
2954  local_ordinal_type ri0[vector_length] = {};
2955  local_ordinal_type nrows[vector_length] = {};
2956 
2957  for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2958  kfs[vi] = flat_td_ptr(partidx,local_subpartidx);
2959  ri0[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,0);
2960  nrows[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,1) - ri0[vi];
2961 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2962  printf("kfs[%d] = %d;\n", vi, kfs[vi]);
2963  printf("ri0[%d] = %d;\n", vi, ri0[vi]);
2964  printf("nrows[%d] = %d;\n", vi, nrows[vi]);
2965 #endif
2966  }
2967  local_ordinal_type tr_min = 0;
2968  local_ordinal_type tr_max = nrows[0];
2969  if (local_subpartidx % 2 == 1) {
2970  tr_min -= 1;
2971  tr_max += 1;
2972  }
2973 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2974  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2975 #endif
2976  for (local_ordinal_type tr=tr_min,j=0;tr<tr_max;++tr) {
2977  for (local_ordinal_type e=0;e<3;++e) {
2978  if (hasBlockCrsMatrix) {
2979  const impl_scalar_type* block[vector_length] = {};
2980  for (local_ordinal_type vi=0;vi<npacks;++vi) {
2981  const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2982 
2983  block[vi] = &A_values(Aj*blocksize_square);
2984  }
2985  const size_type pi = kps + j;
2986 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2987  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
2988 #endif
2989  ++j;
2990  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2991  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2992  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2993  auto& v = internal_vector_values(pi, ii, jj, 0);
2994  for (local_ordinal_type vi=0;vi<npacks;++vi) {
2995  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2996  }
2997  }
2998  }
2999  }
3000  else {
3001  const size_type pi = kps + j;
3002 
3003  for (local_ordinal_type vi=0;vi<npacks;++vi) {
3004  const size_type Aj_c = A_colindsub(kfs[vi] + j);
3005 
3006  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
3007  auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr)*blocksize + ii);
3008 
3009  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3010  scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c*blocksize + jj);
3011  }
3012  }
3013  }
3014  ++j;
3015  }
3016  if (nrows[0] == 1) break;
3017  if (local_subpartidx % 2 == 0) {
3018  if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
3019  for (local_ordinal_type vi=1;vi<npacks;++vi) {
3020  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
3021  npacks = vi;
3022  break;
3023  }
3024  }
3025  }
3026  else {
3027  if (e == 0 && (tr == -1 || tr == nrows[0])) break;
3028  for (local_ordinal_type vi=1;vi<npacks;++vi) {
3029  if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3030  npacks = vi;
3031  break;
3032  }
3033  }
3034  }
3035  }
3036  }
3037  }
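 // The flat index used above linearizes a little block. For the row-major
 // (LayoutRight) layout of Tpetra block CRS values, getFlatIndex(ii, jj, bs)
 // reduces to ii*bs + jj, so block Aj starts at A_values(Aj*blocksize_square)
 // and entry (ii, jj) sits at that base plus the offset below (a column-major
 // layout would use ii + jj*bs instead). Illustrative sketch:
 static KOKKOS_INLINE_FUNCTION
 local_ordinal_type flatIndexRowMajorSketch(const local_ordinal_type ii,
                                            const local_ordinal_type jj,
                                            const local_ordinal_type bs) {
   return ii*bs + jj;
 }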
3038 
3039  KOKKOS_INLINE_FUNCTION
3040  void
3041  extract(const member_type &member,
3042  const local_ordinal_type &partidxbeg,
3043  local_ordinal_type local_subpartidx,
3044  const local_ordinal_type &npacks,
3045  const local_ordinal_type &vbeg) const {
3046 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3047  printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3048 #endif
3049  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3050  local_ordinal_type kfs_vals[internal_vector_length] = {};
3051  local_ordinal_type ri0_vals[internal_vector_length] = {};
3052  local_ordinal_type nrows_vals[internal_vector_length] = {};
3053 
3054  const size_type kps = pack_td_ptr(partidxbeg,local_subpartidx);
3055  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3056  kfs_vals[vi] = flat_td_ptr(partidxbeg+vi,local_subpartidx);
3057  ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,0);
3058  nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,1) - ri0_vals[vi];
3059 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3060  printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3061  printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3062  printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3063 #endif
3064  }
3065 
3066  local_ordinal_type j_vals[internal_vector_length] = {};
3067 
3068  local_ordinal_type tr_min = 0;
3069  local_ordinal_type tr_max = nrows_vals[0];
3070  if (local_subpartidx % 2 == 1) {
3071  tr_min -= 1;
3072  tr_max += 1;
3073  }
3074 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3075  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3076 #endif
3077  for (local_ordinal_type tr=tr_min;tr<tr_max;++tr) {
3078  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3079  const local_ordinal_type nrows = nrows_vals[vi];
3080  if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows+1)) {
3081  auto &j = j_vals[vi];
3082  const local_ordinal_type kfs = kfs_vals[vi];
3083  const local_ordinal_type ri0 = ri0_vals[vi];
3084  local_ordinal_type lbeg, lend;
3085  if (local_subpartidx % 2 == 0) {
3086  lbeg = (tr == tr_min ? 1 : 0);
3087  lend = (tr == nrows - 1 ? 2 : 3);
3088  }
3089  else {
3090  lbeg = 0;
3091  lend = 3;
3092  if (tr == tr_min) {
3093  lbeg = 1;
3094  lend = 2;
3095  }
3096  else if (tr == nrows) {
3097  lbeg = 0;
3098  lend = 1;
3099  }
3100  }
3101  if (hasBlockCrsMatrix) {
3102  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3103  const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3104  const impl_scalar_type* block = &A_values(Aj*blocksize_square);
3105  const size_type pi = kps + j;
3106 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3107  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3108 #endif
3109  Kokkos::parallel_for
3110  (Kokkos::TeamThreadRange(member,blocksize),
3111  [&](const local_ordinal_type &ii) {
3112  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3113  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
3114  }
3115  });
3116  }
3117  }
3118  else {
3119  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3120  const size_type Aj_c = A_colindsub(kfs + j);
3121  const size_type pi = kps + j;
3122  Kokkos::parallel_for
3123  (Kokkos::TeamThreadRange(member,blocksize),
3124  [&](const local_ordinal_type &ii) {
3125  auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr)*blocksize + ii);
3126  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3127  scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c*blocksize + jj);
3128  }
3129  });
3130  }
3131  }
3132  }
3133  }
3134  }
3135  }
3136 
3137  template<typename AAViewType,
3138  typename WWViewType>
3139  KOKKOS_INLINE_FUNCTION
3140  void
3141  factorize_subline(const member_type &member,
3142  const local_ordinal_type &i0,
3143  const local_ordinal_type &nrows,
3144  const local_ordinal_type &v,
3145  const AAViewType &AA,
3146  const WWViewType &WW) const {
3147 
3148  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3149  <typename execution_space::memory_space> default_mode_and_algo_type;
3150 
3151  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3152  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3153 
3154  // constant
3155  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3156 
3157 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3158  printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3159 #endif
3160 
3161  // subview pattern
3162  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3163  KB::LU<member_type,
3164  default_mode_type,KB::Algo::LU::Unblocked>
3165  ::invoke(member, A , tiny);
3166 
3167  if (nrows > 1) {
3168  auto B = A;
3169  auto C = A;
3170  local_ordinal_type i = i0;
3171  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
3172 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3173  printf("tr = %d, i = %d;\n", tr, i);
3174 #endif
3175  B.assign_data( &AA(i+1,0,0,v) );
3176  KB::Trsm<member_type,
3177  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3178  default_mode_type,default_algo_type>
3179  ::invoke(member, one, A, B);
3180  C.assign_data( &AA(i+2,0,0,v) );
3181  KB::Trsm<member_type,
3182  KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3183  default_mode_type,default_algo_type>
3184  ::invoke(member, one, A, C);
3185  A.assign_data( &AA(i+3,0,0,v) );
3186 
3187  member.team_barrier();
3188  KB::Gemm<member_type,
3189  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3190  default_mode_type,default_algo_type>
3191  ::invoke(member, -one, C, B, one, A);
3192  KB::LU<member_type,
3193  default_mode_type,KB::Algo::LU::Unblocked>
3194  ::invoke(member, A, tiny);
3195  }
3196  } else {
3197  // for block Jacobi, invert the diagonal block in place here
3198  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3199  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
3200  ::invoke(member, A, W);
3201  KB::SetIdentity<member_type,default_mode_type>
3202  ::invoke(member, A);
3203  member.team_barrier();
3204  KB::Trsm<member_type,
3205  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3206  default_mode_type,default_algo_type>
3207  ::invoke(member, one, W, A);
3208  KB::Trsm<member_type,
3209  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3210  default_mode_type,default_algo_type>
3211  ::invoke(member, one, W, A);
3212  }
3213  }
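 // factorize_subline implements the block LU recurrence
 //   D_r = L_r U_r,  B_r := L_r^{-1} B_r,  C_r := C_r U_r^{-1},
 //   D_{r+1} := D_{r+1} - C_r B_r,
 // whose output the solve phase consumes. Scalar (blocksize = 1) sketch using
 // the same {d, b, c} packing per row (illustrative only, not the kernel):
 static void factorizeTridiagScalarSketch(double *a, const int nrows) {
   for (int r = 0; r + 1 < nrows; ++r) {
     const double d = a[3*r];                 // 1x1 pivot; its LU is itself
     // b_r := L^{-1} b_r is a no-op since L has a unit diagonal
     a[3*r + 2] /= d;                         // c_r := c_r * U^{-1}
     a[3*(r + 1)] -= a[3*r + 2] * a[3*r + 1]; // d_{r+1} -= c_r * b_r
   }
 }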
3214 
3215  public:
3216 
3217  struct ExtractAndFactorizeSubLineTag {};
3218  struct ExtractAndFactorizeFusedJacobiTag {};
3219  struct ExtractBCDTag {};
3220  struct ComputeETag {};
3221  struct ComputeSchurTag {};
3222  struct FactorizeSchurTag {};
3223 
3224  KOKKOS_INLINE_FUNCTION
3225  void
3226  operator() (const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3227  // btdm is packed and sorted starting from the largest part
3228  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3229 
3230  const local_ordinal_type subpartidx = packptr_sub(packidx);
3231  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3232  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3233  const local_ordinal_type partidx = subpartidx%n_parts;
3234 
3235  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3236  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3237  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3238 
3239  internal_vector_scratch_type_3d_view
3240  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3241 
3242 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3243  printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3244  printf("vector_loop_size = %d\n", vector_loop_size);
3245 #endif
3246 
3247  if (vector_loop_size == 1) {
3248  extract(partidx, local_subpartidx, npacks);
3249  factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3250  } else {
3251  Kokkos::parallel_for
3252  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3253  [&](const local_ordinal_type &v) {
3254  const local_ordinal_type vbeg = v*internal_vector_length;
3255 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3256  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3257 #endif
3258  if (vbeg < npacks)
3259  extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3260  // this is not safe if the vector loop size differs from the vector size of
3261  // the team policy; we always ensure they match when constructing the team policy
3262  member.team_barrier();
3263  factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3264  });
3265  }
3266  }
3267 
3268  KOKKOS_INLINE_FUNCTION
3269  void
3270  operator() (const ExtractAndFactorizeFusedJacobiTag&, const member_type &member) const {
3271  using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3272  using default_mode_type = typename default_mode_and_algo_type::mode_type;
3273  using default_algo_type = typename default_mode_and_algo_type::algo_type;
3274  // When fused block Jacobi can be used, the mapping between local rows and parts is trivial (i <-> i)
3275  // We can simply pull the diagonal entry from A into d_inv
3276  btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3277  btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3278  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3279  const local_ordinal_type nrows = lclrow.extent(0);
3280  Kokkos::parallel_for
3281  (Kokkos::ThreadVectorRange(member, half_vector_length),
3282  [&](const local_ordinal_type &v) {
3283  local_ordinal_type row = member.league_rank() * half_vector_length + v;
3284  // diagEntry has index of diagonal within row
3285  auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3286  auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3287  if(row < nrows) {
3288  // View the diagonal block of A in row as 2D row-major
3289  const impl_scalar_type* A_diag = A_values.data() + diag_offsets(row);
3290  // Copy the diag into scratch slice W1
3291  // (copying elements directly is better than KokkosBatched copy)
3292  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3293  [&](int i)
3294  {
3295  W1.data()[i] = A_diag[i];
3296  });
3297  // and set W2 to identity in preparation to invert with 2 x Trsm
3298  KB::SetIdentity<member_type,default_mode_type>
3299  ::invoke(member, W2);
3300  }
3301  else {
3302  // if this vector lane has no block to invert, then set W1 to identity
3303  // so that LU still has a matrix to work on. LU uses team barriers so
3304  // having some lanes run it and some not will deadlock.
3305  KB::SetIdentity<member_type,default_mode_type>
3306  ::invoke(member, W1);
3307  }
3308  member.team_barrier();
3309  // LU factorize in-place
3310  KB::LU<member_type, default_mode_type,KB::Algo::LU::Unblocked>
3311  ::invoke(member, W1, tiny);
3312  member.team_barrier();
3313  KB::Trsm<member_type,
3314  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3315  default_mode_type,default_algo_type>
3316  ::invoke(member, one, W1, W2);
3317  KB::Trsm<member_type,
3318  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3319  default_mode_type,default_algo_type>
3320  ::invoke(member, one, W1, W2);
3321  member.team_barrier();
3322  if(row < nrows) {
3323  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3324  [&](int i)
3325  {
3326  auto d_inv_block = &d_inv(row, 0, 0);
3327  d_inv_block[i] = W2.data()[i];
3328  });
3329  }
3330  });
3331  }
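 // The inversion pattern above (LU in place, then a unit-lower and an upper
 // triangular solve against the identity) in standalone scalar form. Like the
 // kernel, it does not pivot and relies on the diagonal being safe, which the
 // "tiny" perturbation provides. Illustrative only:
 static void invertViaLuSketch(double *A, double *Ainv, const int n) {
   for (int k = 0; k < n; ++k)               // in-place LU, Doolittle style
     for (int i = k + 1; i < n; ++i) {
       A[i*n + k] /= A[k*n + k];
       for (int j = k + 1; j < n; ++j)
         A[i*n + j] -= A[i*n + k] * A[k*n + j];
     }
   for (int i = 0; i < n; ++i)               // Ainv := I
     for (int j = 0; j < n; ++j)
       Ainv[i*n + j] = (i == j) ? 1.0 : 0.0;
   for (int c = 0; c < n; ++c) {             // solve L U X = I column by column
     for (int i = 0; i < n; ++i)             // forward substitution (unit L)
       for (int k = 0; k < i; ++k)
         Ainv[i*n + c] -= A[i*n + k] * Ainv[k*n + c];
     for (int i = n - 1; i >= 0; --i) {      // back substitution
       for (int k = i + 1; k < n; ++k)
         Ainv[i*n + c] -= A[i*n + k] * Ainv[k*n + c];
       Ainv[i*n + c] /= A[i*n + i];
     }
   }
 }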
3332 
3333  KOKKOS_INLINE_FUNCTION
3334  void
3335  operator() (const ExtractBCDTag &, const member_type &member) const {
3336  // btdm is packed and sorted starting from the largest part
3337  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3338  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3339  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3340 
3341  const local_ordinal_type subpartidx = packptr_sub(packidx);
3342  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3343  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3344  const local_ordinal_type partidx = subpartidx%n_parts;
3345 
3346  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3347  //const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3348  //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3349 
3350  if (vector_loop_size == 1) {
3351  extract(partidx, local_subpartidx, npacks);
3352  }
3353  else {
3354  Kokkos::parallel_for
3355  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3356  [&](const local_ordinal_type &v) {
3357  const local_ordinal_type vbeg = v*internal_vector_length;
3358 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3359  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3360  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3361 #endif
3362  if (vbeg < npacks)
3363  extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3364  });
3365  }
3366 
3367  member.team_barrier();
3368 
3369  const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3370  const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1;
3371 
3372  const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3373  const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3374 
3375 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3376  printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3377 #endif
3378 
3379  // Need to copy D to e_internal_vector_values.
3380  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3381  Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3382 
3383  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3384  Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3385 
3386  }
3387 
3388  KOKKOS_INLINE_FUNCTION
3389  void
3390  operator() (const ComputeETag &, const member_type &member) const {
3391  // btdm is packed and sorted starting from the largest part
3392  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3393 
3394  const local_ordinal_type subpartidx = packptr_sub(packidx);
3395  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3396  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3397  const local_ordinal_type partidx = subpartidx%n_parts;
3398 
3399  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3400  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3401  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3402  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3403  const local_ordinal_type num_vectors = blocksize;
3404 
3405  (void) npacks;
3406 
3407  internal_vector_scratch_type_3d_view
3408  WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3409  if (local_subpartidx == 0) {
3410  Kokkos::parallel_for
3411  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3412  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3413  });
3414  }
3415  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
3416  Kokkos::parallel_for
3417  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3418  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3419  });
3420  }
3421  else {
3422  Kokkos::parallel_for
3423  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3424  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3425  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3426  });
3427  }
3428  }
3429 
3430  KOKKOS_INLINE_FUNCTION
3431  void
3432  operator() (const ComputeSchurTag &, const member_type &member) const {
3433  // btdm is packed and sorted starting from the largest part
3434  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3435  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3436  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3437 
3438  const local_ordinal_type subpartidx = packptr_sub(packidx);
3439  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3440  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3441  const local_ordinal_type partidx = subpartidx%n_parts;
3442 
3443  //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3444  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3445  //const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3446  //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3447 
3448  // Compute S = D - C E
3449 
3450  const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
3451  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
3452  const local_ordinal_type i0_offset = i0+2;
3453 
3454  for (local_ordinal_type i = 0; i < 4; ++i) { //pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
3455  copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3456  Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3457  }
3458 
3459  member.team_barrier();
3460 
3461  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3462 
3463  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1;
3464  const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2;
3465 
3466  const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3467  const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3468 
3469  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3470  <typename execution_space::memory_space> default_mode_and_algo_type;
3471 
3472  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3473  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3474 
3475  Kokkos::parallel_for
3476  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3477  for (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) {
3478  local_ordinal_type e_r, e_c, c_kps;
3479 
3480  if ( local_subpartidx_schur == 0 ) {
3481  if ( i == 0 ) {
3482  e_r = e_r1;
3483  e_c = 0;
3484  c_kps = c_kps1;
3485  }
3486  else if ( i == 3 ) {
3487  e_r = e_r2;
3488  e_c = 1;
3489  c_kps = c_kps2;
3490  }
3491  else if ( i == 4 ) {
3492  e_r = e_r2;
3493  e_c = 0;
3494  c_kps = c_kps2;
3495  }
3496  else {
3497  continue;
3498  }
3499  }
3500  else {
3501  if ( i == 0 ) {
3502  e_r = e_r1;
3503  e_c = 1;
3504  c_kps = c_kps1;
3505  }
3506  else if ( i == 1 ) {
3507  e_r = e_r1;
3508  e_c = 0;
3509  c_kps = c_kps1;
3510  }
3511  else if ( i == 4 ) {
3512  e_r = e_r2;
3513  e_c = 1;
3514  c_kps = c_kps2;
3515  }
3516  else if ( i == 5 ) {
3517  e_r = e_r2;
3518  e_c = 0;
3519  c_kps = c_kps2;
3520  }
3521  else {
3522  continue;
3523  }
3524  }
3525 
3526  auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v);
3527  auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3528  auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3529  KB::Gemm<member_type,
3530  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3531  default_mode_type,default_algo_type>
3532  ::invoke(member, -one, C, E, one, S);
3533  }
3534  });
3535  }
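 // The GEMM calls above assemble the Schur complement S = D - C E, where
 // E = A^{-1} B was produced by the ComputeETag pass and S was seeded with the
 // D blocks by the copy loop at the top of this operator. Scalar sketch of one
 // block update with alpha = -1, beta = 1 (illustrative only):
 static void schurUpdateSketch(double *S, const double *C, const double *E, const int n) {
   for (int i = 0; i < n; ++i)
     for (int j = 0; j < n; ++j) {
       double acc = 0;
       for (int k = 0; k < n; ++k)
         acc += C[i*n + k] * E[k*n + j];
       S[i*n + j] -= acc;                    // S = D - C E
     }
 }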
3536 
3537  KOKKOS_INLINE_FUNCTION
3538  void
3539  operator() (const FactorizeSchurTag &, const member_type &member) const {
3540  const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3541 
3542  const local_ordinal_type subpartidx = packptr_sub(packidx);
3543 
3544  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3545  const local_ordinal_type partidx = subpartidx%n_parts;
3546 
3547  const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0);
3548  const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);
3549 
3550  internal_vector_scratch_type_3d_view
3551  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3552 
3553 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3554  printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3555 #endif
3556 
3557  if (vector_loop_size == 1) {
3558  factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3559  } else {
3560  Kokkos::parallel_for
3561  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3562  [&](const local_ordinal_type &v) {
3563  factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3564  });
3565  }
3566  }
3567 
3568  void run() {
3569  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3570  const local_ordinal_type team_size =
3571  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3572  recommended_team_size(blocksize, vector_length, internal_vector_length);
3573  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3574  shmem_size(blocksize, blocksize, vector_loop_size);
3575 
3576  {
3577 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3578  printf("Start ExtractAndFactorizeSubLineTag\n");
3579 #endif
3580  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3581  Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeSubLineTag>
3582  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3583 
3584 
3585  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3586  writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3587 
3588  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3589  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3590  policy, *this);
3591  execution_space().fence();
3592 
3593  writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3594 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3595  printf("End ExtractAndFactorizeSubLineTag\n");
3596 #endif
3597  }
3598 
3599  if (packindices_schur.extent(1) > 0)
3600  {
3601  {
3602 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3603  printf("Start ExtractBCDTag\n");
3604 #endif
3605  Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3606  Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3607 
3608  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3609 
3610  {
3611  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3612  Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
3613  policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3614 
3615  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3616  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3617  policy, *this);
3618  execution_space().fence();
3619  }
3620 
3621 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3622  printf("End ExtractBCDTag\n");
3623 #endif
3624  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3625 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3626  printf("Start ComputeETag\n");
3627 #endif
3628  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3629  {
3630  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3631  Kokkos::TeamPolicy<execution_space,ComputeETag>
3632  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3633 
3634  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3635  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3636  policy, *this);
3637  execution_space().fence();
3638  }
3639  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3640 
3641 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3642  printf("End ComputeETag\n");
3643 #endif
3644  }
3645 
3646  {
3647 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3648  printf("Start ComputeSchurTag\n");
3649 #endif
3650  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3651  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3652  Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
3653  policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3654 
3655  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3656  policy, *this);
3657  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3658  execution_space().fence();
3659 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3660  printf("End ComputeSchurTag\n");
3661 #endif
3662  }
3663 
3664  {
3665 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3666  printf("Start FactorizeSchurTag\n");
3667 #endif
3668  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3669  Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
3670  policy(packindices_schur.extent(0), team_size, vector_loop_size);
3671  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3672  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3673  policy, *this);
3674  execution_space().fence();
3675  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3676 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3677  printf("End FactorizeSchurTag\n");
3678 #endif
3679  }
3680  }
3681 
3682  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3683  }
3684 
3685  void run_fused_jacobi() {
3686  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3687  const local_ordinal_type team_size =
3688  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3689  recommended_team_size(blocksize, half_vector_length, 1);
3690  const local_ordinal_type per_team_scratch =
3691  btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3692  {
3693  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3694  Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3695  policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3696 
3697  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3698  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3699  policy, *this);
3700  }
3701  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3702  }
3703  };
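// run() and run_fused_jacobi() above drive a single functor through several
// kernels by launching the same object under different policy tags; the tag
// type in the TeamPolicy selects the matching tagged operator(). A minimal
// standalone sketch of that dispatch pattern (names are illustrative, not
// Ifpack2's):
struct PhaseASketchTag {};
struct PhaseBSketchTag {};
struct TaggedDispatchSketch {
  KOKKOS_INLINE_FUNCTION
  void operator()(const PhaseASketchTag &, const Kokkos::TeamPolicy<>::member_type &) const { /* kernel A */ }
  KOKKOS_INLINE_FUNCTION
  void operator()(const PhaseBSketchTag &, const Kokkos::TeamPolicy<>::member_type &) const { /* kernel B */ }
  void run(const int nteams, const int team_size, const int vlen) const {
    Kokkos::parallel_for("PhaseA", Kokkos::TeamPolicy<PhaseASketchTag>(nteams, team_size, vlen), *this);
    Kokkos::parallel_for("PhaseB", Kokkos::TeamPolicy<PhaseBSketchTag>(nteams, team_size, vlen), *this);
  }
};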
3704 
3708  template<typename MatrixType>
3709  void
3710  performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3711  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3712  const BlockHelperDetails::PartInterface<MatrixType> &interf,
3713  BlockTridiags<MatrixType> &btdm,
3714  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3715  bool use_fused_jacobi) {
3716  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3717  using execution_space = typename impl_type::execution_space;
3718  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3719  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3720  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3721 
3722  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3723 
3724  int blocksize = btdm.values.extent(1);
3725  // Both the Kokkos policy vector length and the SIMD type vector length are hardcoded in KokkosBatched.
3726  // For large block sizes, we have to fall back to level 1 scratch.
3727  int scratch_required;
3728  if(!use_fused_jacobi) {
3729  // General path scratch requirement
3730  scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3731  }
3732  else {
3733  // Block Jacobi scratch requirement: measured in scalars, and uses twice as much (in bytes) per vector lane as the general path.
3734  scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3735  }
3736 
3737  int max_scratch = team_policy_type::scratch_size_max(0);
3738 
3739  if(scratch_required < max_scratch) {
3740  // Can use level 0 scratch
3741  ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3742  if(!use_fused_jacobi)
3743  function.run();
3744  else
3745  function.run_fused_jacobi();
3746  }
3747  else {
3748  // Not enough level 0 scratch, so fall back to level 1
3749  ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3750  if(!use_fused_jacobi)
3751  function.run();
3752  else
3753  function.run_fused_jacobi();
3754  }
3755  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3756  }
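// The branch above is the standard Kokkos scratch capacity check: level 0 maps
// to fast per-team memory (shared memory on CUDA) with a hard size limit, while
// level 1 falls back to a larger but slower pool. A standalone sketch of the
// test (illustrative helper, not part of Ifpack2):
template<typename execution_space>
int chooseScratchLevelSketch(const int required_bytes) {
  using policy_type = Kokkos::TeamPolicy<execution_space>;
  return required_bytes < policy_type::scratch_size_max(0) ? 0 : 1;
}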
3757 
3761  template<typename MatrixType>
3762  struct MultiVectorConverter {
3763  public:
3764  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3765  using execution_space = typename impl_type::execution_space;
3766  using memory_space = typename impl_type::memory_space;
3767 
3768  using local_ordinal_type = typename impl_type::local_ordinal_type;
3769  using impl_scalar_type = typename impl_type::impl_scalar_type;
3770  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3771  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3772  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3773  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3774  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3775  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3776  static constexpr int vector_length = impl_type::vector_length;
3777 
3778  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3779 
3780  private:
3781  // part interface
3782  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3783  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3784  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3785  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3786  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3787  const local_ordinal_type blocksize;
3788  const local_ordinal_type num_vectors;
3789 
3790  // packed multivector output (or input)
3791  vector_type_3d_view packed_multivector;
3792  const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3793 
3794  template<typename TagType>
3795  KOKKOS_INLINE_FUNCTION
3796  void copy_multivectors(const local_ordinal_type &j,
3797  const local_ordinal_type &vi,
3798  const local_ordinal_type &pri,
3799  const local_ordinal_type &ri0) const {
3800  for (local_ordinal_type col=0;col<num_vectors;++col)
3801  for (local_ordinal_type i=0;i<blocksize;++i)
3802  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3803  }
3804 
3805  public:
3806 
3807  MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3808  const vector_type_3d_view &pmv)
3809  : partptr(interf.partptr),
3810  packptr(interf.packptr),
3811  part2packrowidx0(interf.part2packrowidx0),
3812  part2rowidx0(interf.part2rowidx0),
3813  lclrow(interf.lclrow),
3814  blocksize(pmv.extent(1)),
3815  num_vectors(pmv.extent(2)),
3816  packed_multivector(pmv) {}
3817 
3818  // TODO: modify this routine similarly to the team-level functions
3819  KOKKOS_INLINE_FUNCTION
3820  void
3821  operator() (const local_ordinal_type &packidx) const {
3822  local_ordinal_type partidx = packptr(packidx);
3823  local_ordinal_type npacks = packptr(packidx+1) - partidx;
3824  const local_ordinal_type pri0 = part2packrowidx0(partidx);
3825 
3826  local_ordinal_type ri0[vector_length] = {};
3827  local_ordinal_type nrows[vector_length] = {};
3828  for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
3829  ri0[v] = part2rowidx0(partidx);
3830  nrows[v] = part2rowidx0(partidx+1) - ri0[v];
3831  }
3832  for (local_ordinal_type j=0;j<nrows[0];++j) {
3833  local_ordinal_type cnt = 1;
3834  for (;cnt<npacks && j!= nrows[cnt];++cnt);
3835  npacks = cnt;
3836  const local_ordinal_type pri = pri0 + j;
3837  for (local_ordinal_type col=0;col<num_vectors;++col)
3838  for (local_ordinal_type i=0;i<blocksize;++i)
3839  for (local_ordinal_type v=0;v<npacks;++v)
3840  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
3841  }
3842  }
3843 
3844  KOKKOS_INLINE_FUNCTION
3845  void
3846  operator() (const member_type &member) const {
3847  const local_ordinal_type packidx = member.league_rank();
3848  const local_ordinal_type partidx_begin = packptr(packidx);
3849  const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
3850  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3851  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3852  const local_ordinal_type partidx = partidx_begin + v;
3853  const local_ordinal_type ri0 = part2rowidx0(partidx);
3854  const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
3855 
3856  if (nrows == 1) {
3857  const local_ordinal_type pri = pri0;
3858  for (local_ordinal_type col=0;col<num_vectors;++col) {
3859  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3860  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
3861  });
3862  }
3863  } else {
3864  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3865  const local_ordinal_type pri = pri0 + j;
3866  for (local_ordinal_type col=0;col<num_vectors;++col)
3867  for (local_ordinal_type i=0;i<blocksize;++i)
3868  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3869  });
3870  }
3871  });
3872  }
3873 
3874  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3875  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3876  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3877 
3878  scalar_multivector = scalar_multivector_;
3879  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3880  const local_ordinal_type vl = vector_length;
3881  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3882  Kokkos::parallel_for
3883  ("MultiVectorConverter::TeamPolicy", policy, *this);
3884  } else {
3885  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3886  Kokkos::parallel_for
3887  ("MultiVectorConverter::RangePolicy", policy, *this);
3888  }
3889  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3890  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3891  }
3892  };
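
  // Usage sketch (illustrative; `interf`, `pmv`, and `X` are assumed to come
  // from the surrounding setup code, mirroring applyInverseJacobi below):
#if 0
  MultiVectorConverter<MatrixType> converter(interf, pmv);
  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
  converter.run(XX); // TeamPolicy on device back-ends, RangePolicy on host
#endif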
3893 
3894  // default mode and algorithm traits for the tridiagonal solves;
3895  // primary template, specialized per memory space below
3896  template<typename T> struct SolveTridiagsDefaultModeAndAlgo;
3897 
3898  template<>
3899  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3900  typedef KB::Mode::Serial mode_type;
3901  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3902 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3903  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3904 #else
3905  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3906 #endif
3907  static int recommended_team_size(const int /* blksize */,
3908  const int /* vector_length */,
3909  const int /* internal_vector_length */) {
3910  return 1;
3911  }
3912  };
3913 
3914 #if defined(KOKKOS_ENABLE_CUDA)
3915  static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
3916  const int vector_length,
3917  const int internal_vector_length) {
3918  const int vector_size = vector_length/internal_vector_length;
3919  int total_team_size(0);
3920  if (blksize <= 5) total_team_size = 32;
3921  else if (blksize <= 9) total_team_size = 32; // 64
3922  else if (blksize <= 12) total_team_size = 96;
3923  else if (blksize <= 16) total_team_size = 128;
3924  else if (blksize <= 20) total_team_size = 160;
3925  else total_team_size = 160;
3926  return total_team_size/vector_size;
3927  }
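
  // Example: blksize = 10, vector_length = 8, internal_vector_length = 4
  // give vector_size = 2 and total_team_size = 96, so the recommended
  // team size is 96/2 = 48 threads per team.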
3928 
3929  template<>
3930  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3931  typedef KB::Mode::Team mode_type;
3932  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3933  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3934  static int recommended_team_size(const int blksize,
3935  const int vector_length,
3936  const int internal_vector_length) {
3937  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3938  }
3939  };
3940  template<>
3941  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3942  typedef KB::Mode::Team mode_type;
3943  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3944  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3945  static int recommended_team_size(const int blksize,
3946  const int vector_length,
3947  const int internal_vector_length) {
3948  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3949  }
3950  };
3951 #endif
3952 
3953 #if defined(KOKKOS_ENABLE_HIP)
3954  static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
3955  const int vector_length,
3956  const int internal_vector_length) {
3957  const int vector_size = vector_length/internal_vector_length;
3958  int total_team_size(0);
3959  if (blksize <= 5) total_team_size = 32;
3960  else if (blksize <= 9) total_team_size = 32; // 64
3961  else if (blksize <= 12) total_team_size = 96;
3962  else if (blksize <= 16) total_team_size = 128;
3963  else if (blksize <= 20) total_team_size = 160;
3964  else total_team_size = 160;
3965  return total_team_size/vector_size;
3966  }
3967 
3968  template<>
3969  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3970  typedef KB::Mode::Team mode_type;
3971  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3972  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3973  static int recommended_team_size(const int blksize,
3974  const int vector_length,
3975  const int internal_vector_length) {
3976  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3977  }
3978  };
3979  template<>
3980  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3981  typedef KB::Mode::Team mode_type;
3982  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3983  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3984  static int recommended_team_size(const int blksize,
3985  const int vector_length,
3986  const int internal_vector_length) {
3987  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3988  }
3989  };
3990 #endif
3991 
3992 #if defined(KOKKOS_ENABLE_SYCL)
3993  static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
3994  const int vector_length,
3995  const int internal_vector_length) {
3996  const int vector_size = vector_length/internal_vector_length;
3997  int total_team_size(0);
3998  if (blksize <= 5) total_team_size = 32;
3999  else if (blksize <= 9) total_team_size = 32; // 64
4000  else if (blksize <= 12) total_team_size = 96;
4001  else if (blksize <= 16) total_team_size = 128;
4002  else if (blksize <= 20) total_team_size = 160;
4003  else total_team_size = 160;
4004  return total_team_size/vector_size;
4005  }
4006 
4007  template<>
4008  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
4009  typedef KB::Mode::Team mode_type;
4010  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4011  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4012  static int recommended_team_size(const int blksize,
4013  const int vector_length,
4014  const int internal_vector_length) {
4015  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4016  }
4017  };
4018  template<>
4019  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
4020  typedef KB::Mode::Team mode_type;
4021  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4022  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4023  static int recommended_team_size(const int blksize,
4024  const int vector_length,
4025  const int internal_vector_length) {
4026  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4027  }
4028  };
4029 #endif
4030 
4034  template<typename MatrixType>
4035  struct SolveTridiags {
4036  public:
4037  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4038  using execution_space = typename impl_type::execution_space;
4039 
4040  using local_ordinal_type = typename impl_type::local_ordinal_type;
4041  using size_type = typename impl_type::size_type;
4042  using impl_scalar_type = typename impl_type::impl_scalar_type;
4043  using magnitude_type = typename impl_type::magnitude_type;
4044  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
4045  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
4047  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4048  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
4049  using size_type_2d_view = typename impl_type::size_type_2d_view;
4051  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4052  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
4053  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
4054  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
4055 
4056  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4057 
4058  using internal_vector_type = typename impl_type::internal_vector_type;
4059  static constexpr int vector_length = impl_type::vector_length;
4060  static constexpr int internal_vector_length = impl_type::internal_vector_length;
4061 
4063  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4064  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
4065 
4067  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4068  using member_type = typename team_policy_type::member_type;
4069 
4070  private:
4071  // part interface
4072  local_ordinal_type n_subparts_per_part;
4073  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4074  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4075  const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4076  const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4077  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4078  const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4079  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4080  const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4081 
4082  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4083  const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4084 
4085  // block tridiags
4086  const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4087 
4088  // block tridiags values
4089  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4090  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4091  const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4092 
4093  internal_vector_type_4d_view X_internal_vector_values_schur;
4094 
4095  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4096  const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4097 
4098 
4099  const local_ordinal_type vector_loop_size;
4100 
4101  // copy to multivectors: damping factor and Y_scalar_multivector
4102  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
4103 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4104  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4105 #else
4106  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4107 #endif
4108  const impl_scalar_type df;
4109  const bool compute_diff;
4110 
4111  public:
4112  SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
4113  const BlockTridiags<MatrixType> &btdm,
4114  const vector_type_3d_view &pmv,
4115  const impl_scalar_type damping_factor,
4116  const bool is_norm_manager_active)
4117  :
4118  // interface
4119  n_subparts_per_part(interf.n_subparts_per_part),
4120  partptr(interf.partptr),
4121  packptr(interf.packptr),
4122  packindices_sub(interf.packindices_sub),
4123  packindices_schur(interf.packindices_schur),
4124  part2packrowidx0(interf.part2packrowidx0),
4125  part2packrowidx0_sub(interf.part2packrowidx0_sub),
4126  lclrow(interf.lclrow),
4127  packptr_sub(interf.packptr_sub),
4128  partptr_sub(interf.partptr_sub),
4129  pack_td_ptr_schur(btdm.pack_td_ptr_schur),
4130  // block tridiags and multivector
4131  pack_td_ptr(btdm.pack_td_ptr),
4132  D_internal_vector_values((internal_vector_type*)btdm.values.data(),
4133  btdm.values.extent(0),
4134  btdm.values.extent(1),
4135  btdm.values.extent(2),
4136  vector_length/internal_vector_length),
4137  X_internal_vector_values((internal_vector_type*)pmv.data(),
4138  pmv.extent(0),
4139  pmv.extent(1),
4140  pmv.extent(2),
4141  vector_length/internal_vector_length),
4142  X_internal_scalar_values((btdm_scalar_type*)pmv.data(),
4143  pmv.extent(0),
4144  pmv.extent(1),
4145  pmv.extent(2),
4146  vector_length),
4147  X_internal_vector_values_schur(do_not_initialize_tag("X_internal_vector_values_schur"),
4148  2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0),
4149  pmv.extent(1),
4150  pmv.extent(2),
4151  vector_length/internal_vector_length),
4152  D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(),
4153  btdm.values_schur.extent(0),
4154  btdm.values_schur.extent(1),
4155  btdm.values_schur.extent(2),
4156  vector_length/internal_vector_length),
4157  e_internal_vector_values((internal_vector_type*)btdm.e_values.data(),
4158  btdm.e_values.extent(0),
4159  btdm.e_values.extent(1),
4160  btdm.e_values.extent(2),
4161  btdm.e_values.extent(3),
4162  vector_length/internal_vector_length),
4163  vector_loop_size(vector_length/internal_vector_length),
4164  Y_scalar_multivector(),
4165  Z_scalar_vector(),
4166  df(damping_factor),
4167  compute_diff(is_norm_manager_active)
4168  {}
4169 
4170  public:
4171 
4173  KOKKOS_INLINE_FUNCTION
4174  void
4175  copyToFlatMultiVector(const member_type &member,
4176  const local_ordinal_type partidxbeg, // partidx for v = 0
4177  const local_ordinal_type npacks,
4178  const local_ordinal_type pri0,
4179  const local_ordinal_type v, // index with a loop of vector_loop_size
4180  const local_ordinal_type blocksize,
4181  const local_ordinal_type num_vectors) const {
4182  const local_ordinal_type vbeg = v*internal_vector_length;
4183  if (vbeg < npacks) {
4184  local_ordinal_type ri0_vals[internal_vector_length] = {};
4185  local_ordinal_type nrows_vals[internal_vector_length] = {};
4186  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4187  const local_ordinal_type partidx = partidxbeg+vv;
4188  ri0_vals[vi] = partptr(partidx);
4189  nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
4190  }
4191 
4192  impl_scalar_type z_partial_sum(0);
4193  if (nrows_vals[0] == 1) {
4194  const local_ordinal_type j=0, pri=pri0;
4195  {
4196  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4197  const local_ordinal_type ri0 = ri0_vals[vi];
4198  const local_ordinal_type nrows = nrows_vals[vi];
4199  if (j < nrows) {
4200  Kokkos::parallel_for
4201  (Kokkos::TeamThreadRange(member, blocksize),
4202  [&](const local_ordinal_type &i) {
4203  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4204  for (local_ordinal_type col=0;col<num_vectors;++col) {
4205  impl_scalar_type &y = Y_scalar_multivector(row,col);
4206  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4207  y += df*yd;
4208 
4209  {//if (compute_diff) {
4210  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4211  z_partial_sum += yd_abs*yd_abs;
4212  }
4213  }
4214  });
4215  }
4216  }
4217  }
4218  } else {
4219  Kokkos::parallel_for
4220  (Kokkos::TeamThreadRange(member, nrows_vals[0]),
4221  [&](const local_ordinal_type &j) {
4222  const local_ordinal_type pri = pri0 + j;
4223  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4224  const local_ordinal_type ri0 = ri0_vals[vi];
4225  const local_ordinal_type nrows = nrows_vals[vi];
4226  if (j < nrows) {
4227  for (local_ordinal_type col=0;col<num_vectors;++col) {
4228  for (local_ordinal_type i=0;i<blocksize;++i) {
4229  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4230  impl_scalar_type &y = Y_scalar_multivector(row,col);
4231  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4232  y += df*yd;
4233 
4234  {//if (compute_diff) {
4235  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4236  z_partial_sum += yd_abs*yd_abs;
4237  }
4238  }
4239  }
4240  }
4241  }
4242  });
4243  }
4244  //if (compute_diff)
4245  Z_scalar_vector(member.league_rank()) += z_partial_sum;
4246  }
4247  }
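
  // The update above is damped Jacobi: with yd = x_packed - y, the
  // statement y += df*yd gives y_new = (1 - df)*y + df*x_packed. Each team
  // accumulates |yd|^2 into Z_scalar_vector so the caller can form the
  // squared norm of the correction for the convergence test.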
4248 
4252  template<typename WWViewType>
4253  KOKKOS_INLINE_FUNCTION
4254  void
4255  solveSingleVector(const member_type &member,
4256  const local_ordinal_type &blocksize,
4257  const local_ordinal_type &i0,
4258  const local_ordinal_type &r0,
4259  const local_ordinal_type &nrows,
4260  const local_ordinal_type &v,
4261  const WWViewType &WW) const {
4262 
4263  typedef SolveTridiagsDefaultModeAndAlgo
4264  <typename execution_space::memory_space> default_mode_and_algo_type;
4265 
4266  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4267  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4268 
4269  // base pointers
4270  auto A = D_internal_vector_values.data();
4271  auto X = X_internal_vector_values.data();
4272 
4273  // constant
4274  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4275  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4276  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4277 
4278  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4279  const local_ordinal_type astep = D_internal_vector_values.stride_0();
4280  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
4281  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
4282  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
4283  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
4284 
4285  // move to starting point
4286  A += i0*astep + v;
4287  X += r0*xstep + v;
4288 
4289  //for (local_ordinal_type col=0;col<num_vectors;++col)
4290  if (nrows > 1) {
4291  // solve Lx = x
4292  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4293  (default_mode_type,default_algo_type,
4294  member,
4295  KB::Diag::Unit,
4296  blocksize,blocksize,
4297  one,
4298  A, as0, as1,
4299  X, xs0);
4300 
4301  for (local_ordinal_type tr=1;tr<nrows;++tr) {
4302  member.team_barrier();
4303  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4304  (default_mode_type,default_algo_type,
4305  member,
4306  blocksize, blocksize,
4307  -one,
4308  A+2*astep, as0, as1,
4309  X, xs0,
4310  one,
4311  X+1*xstep, xs0);
4312  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4313  (default_mode_type,default_algo_type,
4314  member,
4315  KB::Diag::Unit,
4316  blocksize,blocksize,
4317  one,
4318  A+3*astep, as0, as1,
4319  X+1*xstep, xs0);
4320 
4321  A += 3*astep;
4322  X += 1*xstep;
4323  }
4324 
4325  // solve Ux = x
4326  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4327  (default_mode_type,default_algo_type,
4328  member,
4329  KB::Diag::NonUnit,
4330  blocksize, blocksize,
4331  one,
4332  A, as0, as1,
4333  X, xs0);
4334 
4335  for (local_ordinal_type tr=nrows;tr>1;--tr) {
4336  A -= 3*astep;
4337  member.team_barrier();
4338  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4339  (default_mode_type,default_algo_type,
4340  member,
4341  blocksize, blocksize,
4342  -one,
4343  A+1*astep, as0, as1,
4344  X, xs0,
4345  one,
4346  X-1*xstep, xs0);
4347  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4348  (default_mode_type,default_algo_type,
4349  member,
4350  KB::Diag::NonUnit,
4351  blocksize, blocksize,
4352  one,
4353  A, as0, as1,
4354  X-1*xstep,xs0);
4355  X -= 1*xstep;
4356  }
4357  // for multiple rhs
4358  //X += xs1;
4359  } else {
4360  const local_ordinal_type ws0 = WW.stride_0();
4361  auto W = WW.data() + v;
4362  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
4363  (default_mode_type,
4364  member, blocksize, X, xs0, W, ws0);
4365  member.team_barrier();
4366  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4367  (default_mode_type,default_algo_type,
4368  member,
4369  blocksize, blocksize,
4370  one,
4371  A, as0, as1,
4372  W, xs0,
4373  zero,
4374  X, xs0);
4375  }
4376  }
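
  // The nrows > 1 branch above is blockwise forward elimination followed by
  // backward substitution on the factored tridiagonal: a unit-lower Trsv
  // plus a Gemv with the stored coupling block per row on the way down, then
  // a non-unit upper Trsv with the matching Gemv on the way back up. For
  // nrows == 1 the stored block already holds the inverse, so one Gemv
  // through the scratch vector W completes the solve.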
4377 
4378  template<typename WWViewType>
4379  KOKKOS_INLINE_FUNCTION
4380  void
4381  solveMultiVector(const member_type &member,
4382  const local_ordinal_type &/* blocksize */,
4383  const local_ordinal_type &i0,
4384  const local_ordinal_type &r0,
4385  const local_ordinal_type &nrows,
4386  const local_ordinal_type &v,
4387  const WWViewType &WW) const {
4388 
4389  typedef SolveTridiagsDefaultModeAndAlgo
4390  <typename execution_space::memory_space> default_mode_and_algo_type;
4391 
4392  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4393  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4394 
4395  // constant
4396  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4397  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4398 
4399  // subview pattern
4400  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4401  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4402  auto X2 = X1;
4403 
4404  local_ordinal_type i = i0, r = r0;
4405 
4406 
4407  if (nrows > 1) {
4408  // solve Lx = x
4409  KB::Trsm<member_type,
4410  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4411  default_mode_type,default_algo_type>
4412  ::invoke(member, one, A, X1);
4413  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
4414  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
4415  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
4416  member.team_barrier();
4417  KB::Gemm<member_type,
4418  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4419  default_mode_type,default_algo_type>
4420  ::invoke(member, -one, A, X1, one, X2);
4421  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
4422  KB::Trsm<member_type,
4423  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4424  default_mode_type,default_algo_type>
4425  ::invoke(member, one, A, X2);
4426  X1.assign_data( X2.data() );
4427  }
4428 
4429  // solve Ux = x
4430  KB::Trsm<member_type,
4431  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4432  default_mode_type,default_algo_type>
4433  ::invoke(member, one, A, X1);
4434  for (local_ordinal_type tr=nrows;tr>1;--tr) {
4435  i -= 3;
4436  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
4437  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
4438  member.team_barrier();
4439  KB::Gemm<member_type,
4440  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4441  default_mode_type,default_algo_type>
4442  ::invoke(member, -one, A, X1, one, X2);
4443 
4444  A.assign_data( &D_internal_vector_values(i,0,0,v) );
4445  KB::Trsm<member_type,
4446  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4447  default_mode_type,default_algo_type>
4448  ::invoke(member, one, A, X2);
4449  X1.assign_data( X2.data() );
4450  }
4451  } else {
4452  // matrix is already inverted
4453  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4454  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
4455  ::invoke(member, X1, W);
4456  member.team_barrier();
4457  KB::Gemm<member_type,
4458  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4459  default_mode_type,default_algo_type>
4460  ::invoke(member, one, A, W, zero, X1);
4461  }
4462  }
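
  // Multi-RHS variant of the same recursion: Level-3 Trsm/Gemm kernels
  // replace the Level-2 Trsv/Gemv of solveSingleVector, and the X1/X2
  // window views walk the packed rows via assign_data instead of raw
  // pointer stepping.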
4463 
4464  template<int B> struct SingleVectorTag {};
4465  template<int B> struct MultiVectorTag {};
4466 
4467  template<int B> struct SingleVectorSubLineTag {};
4468  template<int B> struct MultiVectorSubLineTag {};
4469  template<int B> struct SingleVectorApplyCTag {};
4470  template<int B> struct MultiVectorApplyCTag {};
4471  template<int B> struct SingleVectorSchurTag {};
4472  template<int B> struct MultiVectorSchurTag {};
4473  template<int B> struct SingleVectorApplyETag {};
4474  template<int B> struct MultiVectorApplyETag {};
4475  template<int B> struct SingleVectorCopyToFlatTag {};
4476  template<int B> struct SingleZeroingTag {};
4477 
4478  template<int B>
4479  KOKKOS_INLINE_FUNCTION
4480  void
4481  operator() (const SingleVectorTag<B> &, const member_type &member) const {
4482  const local_ordinal_type packidx = member.league_rank();
4483  const local_ordinal_type partidx = packptr(packidx);
4484  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4485  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4486  const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4487  const local_ordinal_type r0 = part2packrowidx0(partidx);
4488  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4489  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4490  const local_ordinal_type num_vectors = 1;
4491  internal_vector_scratch_type_3d_view
4492  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4493  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4494  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4495  });
4496  Kokkos::parallel_for
4497  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4498  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4499  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4500  });
4501  }
4502 
4503  template<int B>
4504  KOKKOS_INLINE_FUNCTION
4505  void
4506  operator() (const MultiVectorTag<B> &, const member_type &member) const {
4507  const local_ordinal_type packidx = member.league_rank();
4508  const local_ordinal_type partidx = packptr(packidx);
4509  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4510  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4511  const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4512  const local_ordinal_type r0 = part2packrowidx0(partidx);
4513  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4514  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4515  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4516 
4517  internal_vector_scratch_type_3d_view
4518  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4519  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4520  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4521  });
4522  Kokkos::parallel_for
4523  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4524  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4525  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4526  });
4527  }
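
  // Same dispatch pattern as SingleVectorTag, but the scratch view WW is
  // num_vectors wide and the Level-3 solveMultiVector path is taken.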
4528 
4529  template<int B>
4530  KOKKOS_INLINE_FUNCTION
4531  void
4532  operator() (const SingleVectorSubLineTag<B> &, const member_type &member) const {
4533  // btdm is packed and sorted from the largest part
4534  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4535 
4536  const local_ordinal_type subpartidx = packptr_sub(packidx);
4537  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4538  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4539  const local_ordinal_type partidx = subpartidx%n_parts;
4540 
4541  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4542  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4543  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4544  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4545  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4546 
4547  //(void) i0;
4548  //(void) nrows;
4549  (void) npacks;
4550 
4551  internal_vector_scratch_type_3d_view
4552  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4553 
4554  Kokkos::parallel_for
4555  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4556  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4557  });
4558  }
4559 
4560  template<int B>
4561  KOKKOS_INLINE_FUNCTION
4562  void
4563  operator() (const SingleVectorApplyCTag<B> &, const member_type &member) const {
4564  // btdm is packed and sorted from the largest part
4565  //const local_ordinal_type packidx = packindices_schur(member.league_rank());
4566  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4567 
4568  const local_ordinal_type subpartidx = packptr_sub(packidx);
4569  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4570  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4571  const local_ordinal_type partidx = subpartidx%n_parts;
4572  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4573 
4574  //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4575  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4576  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4577  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4578 
4579  internal_vector_scratch_type_3d_view
4580  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4581 
4582  // Compute v_2 = v_2 - C v_1
4583 
4584  const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
4585  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
4586  const local_ordinal_type i0_offset = i0+2; // same value for all local_subpartidx_schur
4587 
4588  (void) i0_schur;
4589  (void) i0_offset;
4590 
4591  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4592 
4593  const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0;
4594  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1;
4595 
4596  typedef SolveTridiagsDefaultModeAndAlgo
4597  <typename execution_space::memory_space> default_mode_and_algo_type;
4598 
4599  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4600  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4601 
4602  if (local_subpartidx == 0) {
4603  Kokkos::parallel_for
4604  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4605  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4606  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4607  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4608 
4609  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4610  (default_mode_type,default_algo_type,
4611  member,
4612  blocksize, blocksize,
4613  -one,
4614  C.data(), C.stride_0(), C.stride_1(),
4615  v_1.data(), v_1.stride_0(),
4616  one,
4617  v_2.data(), v_2.stride_0());
4618  });
4619  }
4620  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4621  Kokkos::parallel_for
4622  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4623  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4624  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4625  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4626 
4627  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4628  (default_mode_type,default_algo_type,
4629  member,
4630  blocksize, blocksize,
4631  -one,
4632  C.data(), C.stride_0(), C.stride_1(),
4633  v_1.data(), v_1.stride_0(),
4634  one,
4635  v_2.data(), v_2.stride_0());
4636  });
4637  }
4638  else {
4639  Kokkos::parallel_for
4640  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4641  {
4642  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4643  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4644  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4645 
4646  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4647  (default_mode_type,default_algo_type,
4648  member,
4649  blocksize, blocksize,
4650  -one,
4651  C.data(), C.stride_0(), C.stride_1(),
4652  v_1.data(), v_1.stride_0(),
4653  one,
4654  v_2.data(), v_2.stride_0());
4655  }
4656  {
4657  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4658  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4659  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4660 
4661  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4662  (default_mode_type,default_algo_type,
4663  member,
4664  blocksize, blocksize,
4665  -one,
4666  C.data(), C.stride_0(), C.stride_1(),
4667  v_1.data(), v_1.stride_0(),
4668  one,
4669  v_2.data(), v_2.stride_0());
4670  }
4671  });
4672  }
4673  }
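
  // Branch summary for the v_2 := v_2 - C v_1 correction above: the first
  // sub-line of a part touches only the interface row after it, the last
  // sub-line only the interface row before it, and interior sub-lines apply
  // both one-sided corrections.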
4674 
4675  template<int B>
4676  KOKKOS_INLINE_FUNCTION
4677  void
4678  operator() (const SingleVectorSchurTag<B> &, const member_type &member) const {
4679  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4680 
4681  const local_ordinal_type partidx = packptr_sub(packidx);
4682 
4683  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4684 
4685  const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0);
4686  const local_ordinal_type nrows = 2*(n_subparts_per_part-1);
4687 
4688  const local_ordinal_type r0_schur = nrows * member.league_rank();
4689 
4690  internal_vector_scratch_type_3d_view
4691  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4692 
4693  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4694  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4695  for (local_ordinal_type i = 0; i < 2; ++i) {
4696  copy3DView<local_ordinal_type>(member,
4697  Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4698  Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4699  }
4700  }
4701 
4702  Kokkos::parallel_for
4703  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4704  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4705  });
4706 
4707  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4708  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4709  for (local_ordinal_type i = 0; i < 2; ++i) {
4710  copy3DView<local_ordinal_type>(member,
4711  Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4712  Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4713  }
4714  }
4715  }
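
  // The copies above gather the 2*(n_subparts_per_part-1) interface rows of
  // each part into the packed Schur system, solve that system with the same
  // kernel used for the sub-lines, then scatter the interface values back.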
4716 
4717  template<int B>
4718  KOKKOS_INLINE_FUNCTION
4719  void
4720  operator() (const SingleVectorApplyETag<B> &, const member_type &member) const {
4721  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4722 
4723  const local_ordinal_type subpartidx = packptr_sub(packidx);
4724  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4725  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4726  const local_ordinal_type partidx = subpartidx%n_parts;
4727  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4728 
4729  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4730  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4731 
4732  internal_vector_scratch_type_3d_view
4733  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4734 
4735  // Compute v_2 = v_2 - C v_1
4736 
4737  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4738 
4739  typedef SolveTridiagsDefaultModeAndAlgo
4740  <typename execution_space::memory_space> default_mode_and_algo_type;
4741 
4742  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4743  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4744 
4745  if (local_subpartidx == 0) {
4746  Kokkos::parallel_for
4747  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4748 
4749  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4750 
4751  for (local_ordinal_type row = 0; row < nrows; ++row) {
4752  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4753  auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4754 
4755  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4756  (default_mode_type,default_algo_type,
4757  member,
4758  blocksize, blocksize,
4759  -one,
4760  E.data(), E.stride_0(), E.stride_1(),
4761  v_2.data(), v_2.stride_0(),
4762  one,
4763  v_1.data(), v_1.stride_0());
4764  }
4765  });
4766  }
4767  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4768  Kokkos::parallel_for
4769  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4770  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4771 
4772  for (local_ordinal_type row = 0; row < nrows; ++row) {
4773  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4774  auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4775 
4776  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4777  (default_mode_type,default_algo_type,
4778  member,
4779  blocksize, blocksize,
4780  -one,
4781  E.data(), E.stride_0(), E.stride_1(),
4782  v_2.data(), v_2.stride_0(),
4783  one,
4784  v_1.data(), v_1.stride_0());
4785  }
4786  });
4787  }
4788  else {
4789  Kokkos::parallel_for
4790  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4791  {
4792  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4793 
4794  for (local_ordinal_type row = 0; row < nrows; ++row) {
4795  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4796  auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4797 
4798  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4799  (default_mode_type,default_algo_type,
4800  member,
4801  blocksize, blocksize,
4802  -one,
4803  E.data(), E.stride_0(), E.stride_1(),
4804  v_2.data(), v_2.stride_0(),
4805  one,
4806  v_1.data(), v_1.stride_0());
4807  }
4808  }
4809  {
4810  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4811 
4812  for (local_ordinal_type row = 0; row < nrows; ++row) {
4813  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4814  auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4815 
4816  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4817  (default_mode_type,default_algo_type,
4818  member,
4819  blocksize, blocksize,
4820  -one,
4821  E.data(), E.stride_0(), E.stride_1(),
4822  v_2.data(), v_2.stride_0(),
4823  one,
4824  v_1.data(), v_1.stride_0());
4825  }
4826  }
4827  });
4828  }
4829  }
4830 
4831  template<int B>
4832  KOKKOS_INLINE_FUNCTION
4833  void
4834  operator() (const SingleVectorCopyToFlatTag<B> &, const member_type &member) const {
4835  const local_ordinal_type packidx = member.league_rank();
4836  const local_ordinal_type partidx = packptr(packidx);
4837  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4838  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4839  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4840  const local_ordinal_type num_vectors = 1;
4841 
4842  Kokkos::parallel_for
4843  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4844  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4845  });
4846  }
4847 
4848  template<int B>
4849  KOKKOS_INLINE_FUNCTION
4850  void
4851  operator() (const SingleZeroingTag<B> &, const member_type &member) const {
4852  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4853  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4854  });
4855  }
4856 
4857  void run(const impl_scalar_type_2d_view_tpetra &Y,
4858  const impl_scalar_type_1d_view &Z) {
4859  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4860  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4861 
4863  this->Y_scalar_multivector = Y;
4864  this->Z_scalar_vector = Z;
4865 
4866  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4867  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4868 
4869  const local_ordinal_type team_size =
4870  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4871  recommended_team_size(blocksize, vector_length, internal_vector_length);
4872  const int per_team_scratch = internal_vector_scratch_type_3d_view
4873  ::shmem_size(blocksize, num_vectors, vector_loop_size);
4874 
4875 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4876 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4877  if (num_vectors == 1) { \
4878  const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4879  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4880  Kokkos::parallel_for \
4881  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4882  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4883  } else { \
4884  const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4885  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4886  Kokkos::parallel_for \
4887  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4888  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4889  } break
4890 #else
4891 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4892  if (num_vectors == 1) { \
4893  if (packindices_schur.extent(1) <= 0) { \
4894  Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4895  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4896  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4897  Kokkos::parallel_for \
4898  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4899  policy, *this); \
4900  } \
4901  else { \
4902  { \
4903  \
4904  Kokkos::TeamPolicy<execution_space,SingleZeroingTag<B> > \
4905  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4906  Kokkos::parallel_for \
4907  ("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4908  policy, *this); \
4909  } \
4910  { \
4911  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4912  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4913  Kokkos::TeamPolicy<execution_space,SingleVectorSubLineTag<B> > \
4914  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4915  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4916  Kokkos::parallel_for \
4917  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4918  policy, *this); \
4919  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4920  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4921  } \
4922  { \
4923  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4924  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4925  Kokkos::TeamPolicy<execution_space,SingleVectorApplyCTag<B> > \
4926  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4927  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4928  Kokkos::parallel_for \
4929  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4930  policy, *this); \
4931  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4932  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4933  } \
4934  { \
4935  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4936  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4937  Kokkos::TeamPolicy<execution_space,SingleVectorSchurTag<B> > \
4938  policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4939  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4940  Kokkos::parallel_for \
4941  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4942  policy, *this); \
4943  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4944  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4945  } \
4946  { \
4947  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4948  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4949  Kokkos::TeamPolicy<execution_space,SingleVectorApplyETag<B> > \
4950  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4951  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4952  Kokkos::parallel_for \
4953  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4954  policy, *this); \
4955  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4956  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4957  } \
4958  { \
4959  \
4960  Kokkos::TeamPolicy<execution_space,SingleVectorCopyToFlatTag<B> > \
4961  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4962  Kokkos::parallel_for \
4963  ("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4964  policy, *this); \
4965  } \
4966  } \
4967  } else { \
4968  Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4969  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4970  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4971  Kokkos::parallel_for \
4972  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4973  policy, *this); \
4974  } break
4975 #endif
4976  switch (blocksize) {
4977  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
4978  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
4979  case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 6);
4980  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
4981  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4982  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4983  case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4984  case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4985  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4986  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4987  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4988  case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4989  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
4990  }
4991 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
4992 
4993  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4994  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4995  }
4996  };
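
  // Usage sketch (illustrative; `interf`, `btdm`, `pmv`, `YY`, and `W` are
  // assumed to come from the apply path below):
#if 0
  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
                                           damping_factor,
                                           /*is_norm_manager_active=*/true);
  solve_tridiags.run(YY, W); // YY: flat output; W: per-pack squared diffs
#endif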
4997 
5001  template<typename MatrixType>
5002  int
5003  applyInverseJacobi(
5004  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
5005  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
5006  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5007  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
5008  const bool overlap_communication_and_computation,
5009  // tpetra interface
5010  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5011  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5012  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
5013  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5014  // local object interface
5015  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5016  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5017  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5018  /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
5019  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5020  // preconditioner parameters
5021  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5022  /* */ bool is_y_zero,
5023  const int max_num_sweeps,
5024  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5025  const int check_tol_every) {
5026  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
5027 
5028  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5029  using node_memory_space = typename impl_type::node_memory_space;
5030  using local_ordinal_type = typename impl_type::local_ordinal_type;
5031  using size_type = typename impl_type::size_type;
5032  using impl_scalar_type = typename impl_type::impl_scalar_type;
5033  using magnitude_type = typename impl_type::magnitude_type;
5034  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
5035  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
5036  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
5037  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
5038 
5039  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5040 
5041  // the Tpetra importer and the async importer cannot both be active
5042  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5043  "The Tpetra importer and the async importer cannot both be active.");
5044  // the maximum number of sweeps must be a positive number
5045  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5046  "Maximum number of sweeps must be >= 1.");
5047 
5048  // const parameters
5049  const bool is_seq_method_requested = !tpetra_importer.is_null();
5050  const bool is_async_importer_active = !async_importer.is_null();
5051  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5052  const magnitude_type tolerance = tol*tol;
5053  const local_ordinal_type blocksize = btdm.values.extent(1);
5054  const local_ordinal_type num_vectors = Y.getNumVectors();
5055  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
5056 
5057  const impl_scalar_type zero(0.0);
5058 
5059  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
5060  "The seq method for applyInverseJacobi, " <<
5061  "which in any case is for developer use only, " <<
5062  "does not support norm-based termination.");
5063  const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
5064  Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
5065  TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
5066  std::invalid_argument,
5067  "The seq method for applyInverseJacobi, " <<
5068  "which in any case is for developer use only, " <<
5069  "only supports memory spaces accessible from host.");
5070 
5071  // resize the workspace if more space is needed
5072  const size_type work_span_required = num_blockrows*num_vectors*blocksize;
5073  if (work.span() < work_span_required)
5074  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
5075 
5076  // construct W
5077  const local_ordinal_type W_size = interf.packptr.extent(0)-1;
5078  if (local_ordinal_type(W.extent(0)) < W_size)
5079  W = impl_scalar_type_1d_view("W", W_size);
5080 
5081  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5082  {
5083  if (is_seq_method_requested) {
5084  if (Z.getNumVectors() != Y.getNumVectors())
5085  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
5086  } else {
5087  if (is_async_importer_active) {
5088  // create comm data buffer and keep it here
5089  async_importer->createDataBuffer(num_vectors);
5090  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5091  }
5092  }
5093  }
5094 
5095  // wrap the workspace with 3d view
5096  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
5097  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5098  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5099  const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
5100  if (is_y_zero) Kokkos::deep_copy(YY, zero);
5101 
5102  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
5103  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
5104  damping_factor, is_norm_manager_active);
5105 
5106  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
5107 
5108 
5109  auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
5110  auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
5111 
5112  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
5113 
5114  // it is OK here to use the graph of the A_crs matrix together with a block size of 1
5115  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
5116 
5117  BlockHelperDetails::ComputeResidualVector<MatrixType>
5118  compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5119  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5120  hasBlockCrsMatrix);
5121 
5122  // norm manager workspace resize
5123  if (is_norm_manager_active)
5124  norm_manager.setCheckFrequency(check_tol_every);
5125 
5126  // iterate
5127  int sweep = 0;
5128  for (;sweep<max_num_sweeps;++sweep) {
5129  {
5130  if (is_y_zero) {
5131  // pmv := x(lclrow)
5132  multivector_converter.run(XX);
5133  } else {
5134  if (is_seq_method_requested) {
5135  // SEQ METHOD IS TESTING ONLY
5136 
5137  // y := x - R y
5138  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5139  compute_residual_vector.run(YY, XX, ZZ);
5140 
5141  // pmv := y(lclrow).
5142  multivector_converter.run(YY);
5143  } else {
5144  // fused y := x - R y and pmv := y(lclrow);
5145  // real use case does not use overlap comp and comm
5146  if (overlap_communication_and_computation || !is_async_importer_active) {
5147  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5148  // OverlapTag, compute_owned = true
5149  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
5150  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5151  if (is_async_importer_active) async_importer->cancel();
5152  break;
5153  }
5154  if (is_async_importer_active) {
5155  async_importer->syncRecv();
5156  // OverlapTag, compute_owned = false
5157  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
5158  }
5159  } else {
5160  if (is_async_importer_active)
5161  async_importer->syncExchange(YY);
5162  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5163  // AsyncTag
5164  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5165  }
5166  }
5167  }
5168  }
5169 
5170  // pmv := inv(D) pmv.
5171  {
5172  solve_tridiags.run(YY, W);
5173  }
5174  {
5175  if (is_norm_manager_active) {
5176  // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
5177  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5178  if (sweep + 1 == max_num_sweeps) {
5179  norm_manager.ireduce(sweep, true);
5180  norm_manager.checkDone(sweep + 1, tolerance, true);
5181  } else {
5182  norm_manager.ireduce(sweep);
5183  }
5184  }
5185  }
5186  is_y_zero = false;
5187  }
5188 
5189  // sqrt the norms for the caller's use.
5190  if (is_norm_manager_active) norm_manager.finalize();
5191 
5192  return sweep;
5193  }
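 // Each sweep above is one damped block-Jacobi step. With the splitting
 // A = D + R (D the packed block tridiagonals, R = A - D) and damping
 // factor a, the residual kernel forms pmv := x - R*y and the tridiag
 // solve applies y := (1 - a)*y + a*inv(D)*pmv, which is algebraically
 //
 //   y_{k+1} = y_k + a * inv(D) * (x - A*y_k).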
5194 
5195  // Implementation of fused block Jacobi for a specific block size,
5196  // or (if B == 0) for a general block size.
5197  template<typename MatrixType, int B>
5198  int
5199  applyFusedBlockJacobi_Impl(
5200  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5201  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
5202  const bool overlap_communication_and_computation,
5203  // tpetra interface
5204  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5205  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5206  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5207  // local object interface
5208  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5209  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5210  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5211  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5212  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5213  // preconditioner parameters
5214  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5215  /* */ bool is_y_zero,
5216  const int max_num_sweeps,
5217  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5218  const int check_tol_every) {
5219  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5220  using node_memory_space = typename impl_type::node_memory_space;
5221  using local_ordinal_type = typename impl_type::local_ordinal_type;
5222  using size_type = typename impl_type::size_type;
5223  using impl_scalar_type = typename impl_type::impl_scalar_type;
5224  using magnitude_type = typename impl_type::magnitude_type;
5225  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
5226  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
5227  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5228  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
5229 
5230  // the tpetra importer and async importer can't both be active
5231  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5232  "Neither Tpetra importer nor Async importer is null.");
5233  // the max number of sweeps must be a positive number
5234  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5235  "Maximum number of sweeps must be >= 1.");
5236 
5237  // const parameters
5238  const bool is_async_importer_active = !async_importer.is_null();
5239  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5240  const magnitude_type tolerance = tol*tol;
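 // The comparison value is squared because the norm manager accumulates
 // squared norms during the sweeps; the square roots are taken only once,
 // in norm_manager.finalize() at the end. Testing n*n <= tol*tol avoids a
 // sqrt at every check. Sketch of the equivalent test, illustrative names:
 //
 //   bool done = (norm_squared <= tol * tol);  // same as sqrt(norm_squared) <= tol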
5241  const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5242  const local_ordinal_type num_vectors = Y.getNumVectors();
5243  const local_ordinal_type num_blockrows = interf.nparts;
5244 
5245  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5246  {
5247  if (is_async_importer_active) {
5248  // create comm data buffer and keep it here
5249  async_importer->createDataBuffer(num_vectors);
5250  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5251  }
5252  }
5253 
5254  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5255  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5256 
5257  const bool two_pass_residual =
5258  overlap_communication_and_computation && is_async_importer_active;
5259 
5260  // Calculate the required work size and reallocate it if not already big enough.
5261  // Check that our assumptions about the YY dimensions are correct.
5262  TEUCHOS_TEST_FOR_EXCEPT_MSG(
5263  size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5264  "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) <<
5265  " = " << YY.extent(0) * YY.extent(1) << ",\n" <<
5266  "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors <<
5267  " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n');
5268  size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5269  if (work.extent(0) < work_required) {
5270  work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required);
5271  }
5272 
5273  Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5274 
5275  // construct W
5276  if (W.extent(0) != size_t(num_blockrows))
5277  W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows);
5278 
5279  // Create the required functors upfront (this is inexpensive - all shallow copies)
5280  BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
5281  functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
5282  BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
5283  functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5284  BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
5285  functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);
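 // Which functor runs in a given sweep (as realized in the loop below):
 //   is_y_zero                                 -> functor_solve_only (y := a * Dinv * x)
 //   two_pass_residual (overlap + async comm)  -> functor_2pass (owned pass, then remote pass)
 //   otherwise                                 -> functor_1pass (residual + solve fused)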
5286 
5287  // set how often the norm manager checks the residual
5288  if (is_norm_manager_active)
5289  norm_manager.setCheckFrequency(check_tol_every);
5290 
5291  // For double-buffering.
5292  // y_buffers[current_y] has the current iterate of y.
5293  // y_buffers[1-current_y] has the next iterate of y.
5294  Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5295  int current_y = 0;
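 // The ping-pong above guarantees each sweep reads one buffer and writes the
 // other, so the kernels never read and write the same multivector. A minimal
 // sketch of the pattern, with illustrative names:
 //
 //   int cur = 0;
 //   for (int s = 0; s < nsweeps; ++s) {
 //     step(buf[cur], buf[1 - cur]);  // read buf[cur], write buf[1 - cur]
 //     cur = 1 - cur;                 // flip for the next sweep
 //   }
 //   // the final iterate is in buf[cur]; copy it back if it is not the
 //   // caller's storage (done below via Kokkos::deep_copy)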
5296 
5297  // iterate
5298  int sweep = 0;
5299  for (;sweep < max_num_sweeps; ++sweep) {
5300  if (is_y_zero) {
5301  // If y is initially zero, then we are just computing y := damping_factor * Dinv * x
5302  functor_solve_only.run(XX, y_buffers[1-current_y]);
5303  } else {
5304  // real use cases do not overlap computation and communication
5305  if (overlap_communication_and_computation || !is_async_importer_active) {
5306  if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5307  if(two_pass_residual) {
5308  // Pass 1 computes owned residual and stores into new y buffer,
5309  // but doesn't apply Dinv or produce a norm yet
5310  functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1-current_y]);
5311  }
5312  else {
5313  // This case happens when running with a single rank.
5314  // There are no remote columns, so residual and solve can happen in one step.
5315  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1-current_y]);
5316  }
5317  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5318  if (is_async_importer_active) async_importer->cancel();
5319  break;
5320  }
5321  if (is_async_importer_active) {
5322  async_importer->syncRecv();
5323  // Pass 2 finishes computing the residual, then applies Dinv and computes the norm.
5324  functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1-current_y]);
5325  }
5326  } else {
5327  if (is_async_importer_active)
5328  async_importer->syncExchange(y_buffers[current_y]);
5329  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5330  // Full residual, Dinv apply, and norm in one kernel
5331  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1-current_y]);
5332  }
5333  }
5334 
5335  // Compute global norm.
5336  if (is_norm_manager_active) {
5337  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5338  if (sweep + 1 == max_num_sweeps) {
5339  norm_manager.ireduce(sweep, true);
5340  norm_manager.checkDone(sweep + 1, tolerance, true);
5341  } else {
5342  norm_manager.ireduce(sweep);
5343  }
5344  }
5345  is_y_zero = false;
5346  // flip y buffers for the next iteration (or for termination, if we reached max_num_sweeps).
5347  current_y = 1 - current_y;
5348  }
5349  if(current_y == 1) {
5350  // We finished iterating with y in the double buffer, so copy it to the user's vector.
5351  Kokkos::deep_copy(YY, y_doublebuf);
5352  }
5353 
5354  // sqrt the norms for the caller's use.
5355  if (is_norm_manager_active) norm_manager.finalize();
5356  return sweep;
5357  }
5358 
5362  template<typename MatrixType>
5363  int
5364  applyFusedBlockJacobi(
5365  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5366  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
5367  const bool overlap_communication_and_computation,
5368  // tpetra interface
5369  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5370  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5371  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5372  // local object interface
5373  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5374  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5375  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5376  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5377  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5378  // preconditioner parameters
5379  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5380  /* */ bool is_y_zero,
5381  const int max_num_sweeps,
5382  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5383  const int check_tol_every) {
5384  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5385  int blocksize = btdm.d_inv.extent(1);
5386  int sweep = 0;
5387 #define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) { \
5388  sweep = applyFusedBlockJacobi_Impl<MatrixType, B>( \
5389  tpetra_importer, async_importer, overlap_communication_and_computation, \
5390  X, Y, W, interf, btdm, amd, work, \
5391  norm_manager, damping_factor, is_y_zero, \
5392  max_num_sweeps, tol, check_tol_every); \
5393  } break
5394  switch (blocksize) {
5395  case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 3);
5396  case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 5);
5397  case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 7);
5398  case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 9);
5399  case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
5400  case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
5401  case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
5402  case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
5403  case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
5404  default : BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 0);
5405  }
5406 #undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI
5407 
5408  return sweep;
5409  }
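 // The switch above turns a run-time blocksize into a compile-time template
 // parameter B for the common sizes, falling back to the generic B == 0
 // instantiation otherwise; the trailing '} break' in the macro makes each
 // 'case N:' a braced statement followed by 'break;', so the cases do not
 // fall through. A minimal sketch of the same dispatch pattern, with
 // illustrative names (not this file's API):
 //
 //   template <int B> int kernel_impl(int n) { return B > 0 ? B : n; }
 //   inline int kernel(int blocksize) {
 //     switch (blocksize) {
 //       case 3:  return kernel_impl<3>(blocksize);  // B known at compile time
 //       case 5:  return kernel_impl<5>(blocksize);
 //       default: return kernel_impl<0>(blocksize);  // 0 selects the generic path
 //     }
 //   }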
5410 
5411 
5412  template<typename MatrixType>
5413  struct ImplObject {
5414  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5415  using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5416  using block_tridiags_type = BlockTridiags<MatrixType>;
5417  using amd_type = BlockHelperDetails::AmD<MatrixType>;
5418  using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
5419  using async_import_type = AsyncableImport<MatrixType>;
5420 
5421  // distributed objects
5425  Teuchos::RCP<async_import_type> async_importer;
5426  bool overlap_communication_and_computation;
5427 
5428  // copy of Y (mutable to penetrate const)
5429  mutable typename impl_type::tpetra_multivector_type Z;
5430  mutable typename impl_type::impl_scalar_type_1d_view W;
5431 
5432  // local objects
5433  part_interface_type part_interface;
5434  block_tridiags_type block_tridiags; // D
5435  amd_type a_minus_d; // R = A - D
5436 
5437  // whether to use fused block Jacobi path
5438  bool use_fused_jacobi;
5439 
5440  // vector workspace is used for the general block tridiagonal case
5441  mutable typename impl_type::vector_type_1d_view work; // right-hand-side workspace (1D view of vector)
5442  // scalar workspace is used for the fused block Jacobi case
5443  mutable typename impl_type::impl_scalar_type_1d_view work_flat; // right-hand-side workspace (1D view of scalar)
5444  mutable norm_manager_type norm_manager;
5445  };
5446 
5447  } // namespace BlockTriDiContainerDetails
5448 
5449 } // namespace Ifpack2
5450 
5451 #endif