Ifpack2_BlockTriDiContainer_impl.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
11 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
12 
13 // #define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
14 // #define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
15 
17 
18 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
19 #include <Tpetra_Distributor.hpp>
20 #include <Tpetra_BlockMultiVector.hpp>
21 
22 #if KOKKOS_VERSION >= 40799
23 #include <KokkosKernels_ArithTraits.hpp>
24 #else
25 #include <Kokkos_ArithTraits.hpp>
26 #endif
27 #include <KokkosBatched_Util.hpp>
28 #include <KokkosBatched_Vector.hpp>
29 #include <KokkosBatched_Copy_Decl.hpp>
30 #include <KokkosBatched_Copy_Impl.hpp>
31 #include <KokkosBatched_AddRadial_Decl.hpp>
32 #include <KokkosBatched_AddRadial_Impl.hpp>
33 #include <KokkosBatched_SetIdentity_Decl.hpp>
34 #include <KokkosBatched_SetIdentity_Impl.hpp>
35 #include <KokkosBatched_Gemm_Decl.hpp>
36 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
37 #include <KokkosBatched_Gemm_Team_Impl.hpp>
38 #include <KokkosBatched_Gemv_Decl.hpp>
39 #include <KokkosBatched_Gemv_Team_Impl.hpp>
40 #include <KokkosBatched_Trsm_Decl.hpp>
41 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
42 #include <KokkosBatched_Trsm_Team_Impl.hpp>
43 #include <KokkosBatched_Trsv_Decl.hpp>
44 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
45 #include <KokkosBatched_Trsv_Team_Impl.hpp>
46 #include <KokkosBatched_LU_Decl.hpp>
47 #include <KokkosBatched_LU_Serial_Impl.hpp>
48 #include <KokkosBatched_LU_Team_Impl.hpp>
49 
50 #include <KokkosBlas1_nrm1.hpp>
51 #include <KokkosBlas1_nrm2.hpp>
52 
53 #include <memory>
54 
55 #include "Ifpack2_BlockHelper.hpp"
56 #include "Ifpack2_BlockComputeResidualVector.hpp"
57 #include "Ifpack2_BlockComputeResidualAndSolve.hpp"
58 
59 // TODO: expose this as a CMake variable (or use this flag only when necessary)
60 // #define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
61 // #undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
62 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
63 #include "cuda_profiler_api.h"
64 #endif
65 
66 // I am not 100% sure about MPI-3 support on CUDA
67 #if MPI_VERSION >= 3
68 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
69 #endif
70 
71 // ::: Experiments :::
72 // Define either pinned memory or CUDA memory for MPI.
73 // If both macros are disabled, the Tpetra memory space is used (the UVM space for CUDA).
74 // If defined, pinned memory is used instead of a device pointer.
75 // By default, pinned memory is enabled.
76 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
77 // #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
78 
79 // if defined, all views are allocated in CUDA space instead of CUDA UVM space
80 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
81 
82 // if defined, btdm_scalar_type is used (if impl_scalar_type is double, btdm_scalar_type is float)
83 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
84 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
85 #endif
86 
87 // if defined, the importer uses multiple execution space instances
88 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
89 
90 namespace Ifpack2 {
91 
92 namespace BlockTriDiContainerDetails {
93 
94 namespace KB = KokkosBatched;
95 
99 using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
100 
101 template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
102 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
103  MemoryTraitsType::is_random_access |
104  flag>;
105 
106 template <typename ViewType>
107 using Unmanaged = Kokkos::View<typename ViewType::data_type,
108  typename ViewType::array_layout,
109  typename ViewType::device_type,
110  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
111 template <typename ViewType>
112 using Atomic = Kokkos::View<typename ViewType::data_type,
113  typename ViewType::array_layout,
114  typename ViewType::device_type,
115  MemoryTraits<typename ViewType::memory_traits, Kokkos::Atomic>>;
116 template <typename ViewType>
117 using Const = Kokkos::View<typename ViewType::const_data_type,
118  typename ViewType::array_layout,
119  typename ViewType::device_type,
120  typename ViewType::memory_traits>;
121 template <typename ViewType>
122 using ConstUnmanaged = Const<Unmanaged<ViewType>>;
123 
124 template <typename ViewType>
125 using AtomicUnmanaged = Atomic<Unmanaged<ViewType>>;
126 
133 template <typename ViewType>
134 using Scratch = Kokkos::View<typename ViewType::data_type,
135  typename ViewType::array_layout,
136  typename ViewType::execution_space::scratch_memory_space,
137  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
138 
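// Illustrative usage of the view alias wrappers above (a sketch, not part of
// the original source; view_type and v are hypothetical names):
//
//   using view_type = Kokkos::View<double *>;
//   view_type v(do_not_initialize_tag("v"), 100);
//   Unmanaged<view_type> v_u = v;        // non-owning alias of v's data
//   ConstUnmanaged<view_type> v_cu = v;  // read-only, non-owning alias
//
// Passing unmanaged aliases into kernels avoids reference-counting overhead
// on the managed view.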
142 template <typename T>
143 struct BlockTridiagScalarType {
144  typedef T type;
145 };
146 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
147 template <>
148 struct BlockTridiagScalarType<double> {
149  typedef float type;
150 };
151 // template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
152 #endif
153 
154 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
155 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
156  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
157 
158 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
159  { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStop()); }
160 #else
161 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
163 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
164 #endif
165 
169 template <typename MatrixType>
170 Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
171 createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
172  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
173  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
174  using tpetra_map_type = typename impl_type::tpetra_map_type;
175  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
176  using tpetra_import_type = typename impl_type::tpetra_import_type;
177  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
178  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
179 
180  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
181  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
182 
183  bool hasBlockCrsMatrix = !A_bcrs.is_null();
184 
185  // It is OK here to use the graph of the A_crs matrix and a block size of 1
186  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
187 
188  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
189  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
190  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap(), blocksize)));
191  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
192  return Teuchos::rcp(new tpetra_import_type(src, tgt));
193 }
194 
195 // Partial replacement for forward-mode MultiVector::doImport.
196 // Permits overlapped communication and computation, but also supports a
197 // synchronized mode. I'm finding that overlapped comm/comp can give quite
198 // poor performance on some platforms, so we can't always use it straightforwardly.
199 
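// Intended call pattern (an illustrative sketch based on the public methods
// below, not code from the original source):
//
//   importer->createDataBuffer(num_vectors);
//   importer->asyncSendRecv(mv);   // post MPI_Irecv/MPI_Isend and pack
//   /* ... overlapped local computation ... */
//   importer->syncRecv();          // wait and unpack into remote_multivector
//
// or, without overlap, simply importer->syncExchange(mv);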
200 template <typename MatrixType>
201 struct AsyncableImport {
202  public:
203  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
204 
205  private:
209 #if !defined(HAVE_IFPACK2_MPI)
210  typedef int MPI_Request;
211  typedef int MPI_Comm;
212 #endif
213  using scalar_type = typename impl_type::scalar_type;
216 
217  static int isend(const MPI_Comm comm, const char *buf, int count, int dest, int tag, MPI_Request *ireq) {
218 #ifdef HAVE_IFPACK2_MPI
219  MPI_Request ureq;
220  int ret = MPI_Isend(const_cast<char *>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
221  if (ireq == NULL) MPI_Request_free(&ureq);
222  return ret;
223 #else
224  return 0;
225 #endif
226  }
227 
228  static int irecv(const MPI_Comm comm, char *buf, int count, int src, int tag, MPI_Request *ireq) {
229 #ifdef HAVE_IFPACK2_MPI
230  MPI_Request ureq;
231  int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
232  if (ireq == NULL) MPI_Request_free(&ureq);
233  return ret;
234 #else
235  return 0;
236 #endif
237  }
238 
239  static int waitany(int count, MPI_Request *reqs, int *index) {
240 #ifdef HAVE_IFPACK2_MPI
241  return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
242 #else
243  return 0;
244 #endif
245  }
246 
247  static int waitall(int count, MPI_Request *reqs) {
248 #ifdef HAVE_IFPACK2_MPI
249  return MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE);
250 #else
251  return 0;
252 #endif
253  }
254 
255  public:
256  using tpetra_map_type = typename impl_type::tpetra_map_type;
257  using tpetra_import_type = typename impl_type::tpetra_import_type;
258 
259  using local_ordinal_type = typename impl_type::local_ordinal_type;
260  using global_ordinal_type = typename impl_type::global_ordinal_type;
261  using size_type = typename impl_type::size_type;
262  using impl_scalar_type = typename impl_type::impl_scalar_type;
263 
264  using int_1d_view_host = Kokkos::View<int *, Kokkos::HostSpace>;
265  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type *, Kokkos::HostSpace>;
266 
267  using execution_space = typename impl_type::execution_space;
268  using memory_space = typename impl_type::memory_space;
269  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
270  using size_type_1d_view = typename impl_type::size_type_1d_view;
271  using size_type_1d_view_host = Kokkos::View<size_type *, Kokkos::HostSpace>;
272 
273 #if defined(KOKKOS_ENABLE_CUDA)
274  using impl_scalar_type_1d_view =
275  typename std::conditional<std::is_same<execution_space, Kokkos::Cuda>::value,
276 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
277  Kokkos::View<impl_scalar_type *, Kokkos::CudaHostPinnedSpace>,
278 #elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
279  Kokkos::View<impl_scalar_type *, Kokkos::CudaSpace>,
280 #else // no experimental macros are defined
281  typename impl_type::impl_scalar_type_1d_view,
282 #endif
283  typename impl_type::impl_scalar_type_1d_view>::type;
284 #else
285  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
286 #endif
287  using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type *, Kokkos::HostSpace>;
288  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
289  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
290 
291 #ifdef HAVE_IFPACK2_MPI
292  MPI_Comm comm;
293 #endif
294 
295  impl_scalar_type_2d_view_tpetra remote_multivector;
296  local_ordinal_type blocksize;
297 
298  template <typename T>
299  struct SendRecvPair {
300  T send, recv;
301  };
302 
303  // (s)end and (r)eceive data:
304  SendRecvPair<int_1d_view_host> pids; // mpi ranks
305  SendRecvPair<std::vector<MPI_Request>> reqs; // MPI_Request is a pointer type; cannot use a Kokkos view
306  SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
307  SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
308  SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
309  SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
310  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host; // data buffer
311 
312  local_ordinal_type_1d_view dm2cm; // permutation
313 
314 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
315  using exec_instance_1d_std_vector = std::vector<execution_space>;
316  exec_instance_1d_std_vector exec_instances;
317 #endif
318 
319  // for cuda
320  public:
321  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
322  const size_type_1d_view &offs) {
323  // wrap lens to kokkos view and deep copy to device
324  Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(const_cast<size_t *>(lens.getRawPtr()), lens.size());
325  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
326 
327  // exclusive scan
328  const Kokkos::RangePolicy<execution_space> policy(0, offs.extent(0));
329  const local_ordinal_type lens_size = lens_device.extent(0);
330  Kokkos::parallel_scan(
331  "AsyncableImport::RangePolicy::setOffsetValues",
332  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
333  if (final)
334  offs(i) = update;
335  update += (i < lens_size ? lens_device[i] : 0);
336  });
337  }
338 
339  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
340  const size_type_1d_view_host &offs) {
341  // lens is already host-accessible here, so unlike setOffsetValues no device copy is needed
344 
345  // exclusive scan
346  offs(0) = 0;
347  for (local_ordinal_type i = 1, iend = offs.extent(0); i < iend; ++i) {
348  offs(i) = offs(i - 1) + lens[i - 1];
349  }
350  }
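// Example (added commentary): for lens = {3, 1, 2} and offs of extent 4,
// both routines produce the exclusive prefix sum offs = {0, 3, 4, 6};
// offs(i) is the starting position of message i, and offs(3) = 6 is the total.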
351 
352  private:
353  void createMpiRequests(const tpetra_import_type &import) {
354  Tpetra::Distributor &distributor = import.getDistributor();
355 
356  // copy pids from distributor
357  const auto pids_from = distributor.getProcsFrom();
358  pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
359  memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int) * pids.recv.extent(0));
360 
361  const auto pids_to = distributor.getProcsTo();
362  pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
363  memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int) * pids.send.extent(0));
364 
365  // mpi requests
366  reqs.recv.resize(pids.recv.extent(0));
367  memset(reqs.recv.data(), 0, reqs.recv.size() * sizeof(MPI_Request));
368  reqs.send.resize(pids.send.extent(0));
369  memset(reqs.send.data(), 0, reqs.send.size() * sizeof(MPI_Request));
370 
371  // construct offsets
372 #if 0
373  const auto lengths_to = distributor.getLengthsTo();
374  offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
375 
376  const auto lengths_from = distributor.getLengthsFrom();
377  offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
378 
379  setOffsetValues(lengths_to, offset.send);
380  offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
381 
382  setOffsetValues(lengths_from, offset.recv);
383  offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
384 #else
385  const auto lengths_to = distributor.getLengthsTo();
386  offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
387 
388  const auto lengths_from = distributor.getLengthsFrom();
389  offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
390 
391  setOffsetValuesHost(lengths_to, offset_host.send);
392  // offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
393 
394  setOffsetValuesHost(lengths_from, offset_host.recv);
395  // offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
396 #endif
397  }
398 
399  void createSendRecvIDs(const tpetra_import_type &import) {
400  // For each remote PID, the list of LIDs to receive.
401  const auto remote_lids = import.getRemoteLIDs();
402  const local_ordinal_type_1d_view_host
403  remote_lids_view_host(const_cast<local_ordinal_type *>(remote_lids.getRawPtr()), remote_lids.size());
404  lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
405  Kokkos::deep_copy(lids.recv, remote_lids_view_host);
406 
407  // For each export PID, the list of LIDs to send.
408  auto epids = import.getExportPIDs();
409  auto elids = import.getExportLIDs();
410  TEUCHOS_ASSERT(epids.size() == elids.size());
411  lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
412  auto lids_send_host = Kokkos::create_mirror_view(lids.send);
413 
414  // naive search (not sure if pids or epids are sorted)
415  for (local_ordinal_type cnt = 0, i = 0, iend = pids.send.extent(0); i < iend; ++i) {
416  const auto pid_send_value = pids.send[i];
417  for (local_ordinal_type j = 0, jend = epids.size(); j < jend; ++j)
418  if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
419  TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i + 1]);
420  }
421  Kokkos::deep_copy(lids.send, lids_send_host);
422  }
423 
424  void createExecutionSpaceInstances() {
425 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
426  // The following line creates 8 streams:
427 #if KOKKOS_VERSION >= 40699
428  exec_instances =
429  Kokkos::Experimental::partition_space(execution_space(), std::vector<int>(8, 1));
430 #else
431  exec_instances =
432  Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
433 #endif
434 #endif
435  }
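// Note (added commentary): partition_space returns independent execution
// space instances (streams on CUDA/HIP). Work launched on different
// instances may run concurrently; the send/recv paths below cycle through
// them as exec_instances[i % 8] to overlap packing with communication.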
436 
437  public:
438  // for cuda, all tag types are public
439  struct ToBuffer {};
440  struct ToMultiVector {};
441 
442  AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
443  const Teuchos::RCP<const tpetra_map_type> &tgt_map,
444  const local_ordinal_type blocksize_,
445  const local_ordinal_type_1d_view dm2cm_) {
446  blocksize = blocksize_;
447  dm2cm = dm2cm_;
448 
449 #ifdef HAVE_IFPACK2_MPI
450  comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
451 #endif
452  const tpetra_import_type import(src_map, tgt_map);
453 
454  createMpiRequests(import);
455  createSendRecvIDs(import);
456  createExecutionSpaceInstances();
457  }
458 
459  void createDataBuffer(const local_ordinal_type &num_vectors) {
460  const size_type extent_0 = lids.recv.extent(0) * blocksize;
461  const size_type extent_1 = num_vectors;
462  if (remote_multivector.extent(0) == extent_0 &&
463  remote_multivector.extent(1) == extent_1) {
464  // skip
465  } else {
466  remote_multivector =
467  impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
468 
469  const auto send_buffer_size = offset_host.send[offset_host.send.extent(0) - 1] * blocksize * num_vectors;
470  const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0) - 1] * blocksize * num_vectors;
471 
472  buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
473  buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
474 
475  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
476  buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
477  buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
478  }
479  }
480  }
481 
482  void cancel() {
483 #ifdef HAVE_IFPACK2_MPI
484  waitall(reqs.recv.size(), reqs.recv.data());
485  waitall(reqs.send.size(), reqs.send.data());
486 #endif
487  }
488 
489  // ======================================================================
490  // Async version using execution space instances
491  // ======================================================================
492 
493 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
494  template <typename PackTag>
495  static void copy(const local_ordinal_type_1d_view &lids_,
496  const impl_scalar_type_1d_view &buffer_,
497  const local_ordinal_type ibeg_,
498  const local_ordinal_type iend_,
499  const impl_scalar_type_2d_view_tpetra &multivector_,
500  const local_ordinal_type blocksize_,
501  const execution_space &exec_instance_) {
502  const local_ordinal_type num_vectors = multivector_.extent(1);
503  const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
504  const local_ordinal_type idiff = iend_ - ibeg_;
505  const auto abase = buffer_.data() + mv_blocksize * ibeg_;
506 
507  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
508  local_ordinal_type vector_size(0);
509  if (blocksize_ <= 4)
510  vector_size = 4;
511  else if (blocksize_ <= 8)
512  vector_size = 8;
513  else if (blocksize_ <= 16)
514  vector_size = 16;
515  else
516  vector_size = 32;
517 
518  const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
519  const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
520  Kokkos::parallel_for( //"AsyncableImport::TeamPolicy::copyViaCudaStream",
521  Kokkos::Experimental::require(policy, work_item_property),
522  KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
523  const local_ordinal_type i = member.league_rank();
524  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
525  auto aptr = abase + blocksize_ * (i + idiff * j);
526  auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
527  if (std::is_same<PackTag, ToBuffer>::value)
528  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
529  aptr[k] = bptr[k];
530  });
531  else
532  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
533  bptr[k] = aptr[k];
534  });
535  });
536  });
537  }
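// Buffer layout (added commentary): within the span [ibeg_, iend_), entry k
// of block row lids_(i + ibeg_) for vector j is stored at
//   buffer_[mv_blocksize * ibeg_ + blocksize_ * (i + idiff * j) + k],
// i.e. the vector index varies slowest, then the row, then the block entry.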
538 
539  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
540  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
541 
542 #ifdef HAVE_IFPACK2_MPI
543  // constants and reallocate data buffers if necessary
544  const local_ordinal_type num_vectors = mv.extent(1);
545  const local_ordinal_type mv_blocksize = blocksize * num_vectors;
546 
547  // 0. post receive async
548  for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
549  if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
550  irecv(comm,
551  reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
552  (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
553  pids.recv[i],
554  42,
555  &reqs.recv[i]);
556  } else {
557  irecv(comm,
558  reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
559  (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
560  pids.recv[i],
561  42,
562  &reqs.recv[i]);
563  }
564  }
565 
567  execution_space().fence();
568 
569  // 1. async memcpy
570  for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
571  // 1.0. enqueue pack buffer
572  if (i < 8) exec_instances[i % 8].fence();
573  copy<ToBuffer>(lids.send, buffer.send,
574  offset_host.send(i), offset_host.send(i + 1),
575  mv, blocksize,
576  // execution_space());
577  exec_instances[i % 8]);
578  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
579  // if (i<8) exec_instances[i%8].fence();
580  const local_ordinal_type num_vectors = mv.extent(1);
581  const local_ordinal_type mv_blocksize = blocksize * num_vectors;
582 
583  Kokkos::deep_copy(exec_instances[i % 8],
584  Kokkos::subview(buffer_host.send,
585  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
586  offset_host.send(i) * mv_blocksize,
587  offset_host.send(i + 1) * mv_blocksize)),
588  Kokkos::subview(buffer.send,
589  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
590  offset_host.send(i) * mv_blocksize,
591  offset_host.send(i + 1) * mv_blocksize)));
592  }
593  }
595  // execution_space().fence();
596  for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
597  // 1.1. sync the stream and isend
598  if (i < 8) exec_instances[i % 8].fence();
599  if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
600  isend(comm,
601  reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
602  (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
603  pids.send[i],
604  42,
605  &reqs.send[i]);
606  } else {
607  isend(comm,
608  reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
609  (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
610  pids.send[i],
611  42,
612  &reqs.send[i]);
613  }
614  }
615 
616  // 2. poke MPI progress (an Iprobe nudges some MPI implementations into action; see asyncSendRecvVar0 below)
617  for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
618  int flag;
619  MPI_Status stat;
620  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
621  }
622 #endif // HAVE_IFPACK2_MPI
623  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
624  }
625 
626  void syncRecvVar1() {
627  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
628 #ifdef HAVE_IFPACK2_MPI
629  // 0. wait for receive async.
630  for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.recv.extent(0)); ++i) {
631  local_ordinal_type idx = i;
632 
633  // 0.0. wait any
634  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
635 
636  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
637  const local_ordinal_type num_vectors = remote_multivector.extent(1);
638  const local_ordinal_type mv_blocksize = blocksize * num_vectors;
639 
640  Kokkos::deep_copy(
641  Kokkos::subview(buffer.recv,
642  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
643  offset_host.recv(idx) * mv_blocksize,
644  offset_host.recv(idx + 1) * mv_blocksize)),
645  Kokkos::subview(buffer_host.recv,
646  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
647  offset_host.recv(idx) * mv_blocksize,
648  offset_host.recv(idx + 1) * mv_blocksize)));
649  }
650 
651  // 0.1. unpack data after data is moved into a device
652  copy<ToMultiVector>(lids.recv, buffer.recv,
653  offset_host.recv(idx), offset_host.recv(idx + 1),
654  remote_multivector, blocksize,
655  exec_instances[idx % 8]);
656  }
657 
658  // 1. fence all execution space instances so unpacking is complete
659  Kokkos::fence();
660 
661  // 2. cleanup all open comm
662  waitall(reqs.send.size(), reqs.send.data());
663 #endif // HAVE_IFPACK2_MPI
664  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
665  }
666 #endif // defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
667 
668  // ======================================================================
669  // Generic version without using execution space instances
670  // - the only difference between the device and host paths is the use of
671  // team versus range policies.
672  // ======================================================================
673  template <typename PackTag>
674  static void copy(const local_ordinal_type_1d_view &lids_,
675  const impl_scalar_type_1d_view &buffer_,
676  const local_ordinal_type &ibeg_,
677  const local_ordinal_type &iend_,
678  const impl_scalar_type_2d_view_tpetra &multivector_,
679  const local_ordinal_type blocksize_) {
680  const local_ordinal_type num_vectors = multivector_.extent(1);
681  const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
682  const local_ordinal_type idiff = iend_ - ibeg_;
683  const auto abase = buffer_.data() + mv_blocksize * ibeg_;
684  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
685  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
686  local_ordinal_type vector_size(0);
687  if (blocksize_ <= 4)
688  vector_size = 4;
689  else if (blocksize_ <= 8)
690  vector_size = 8;
691  else if (blocksize_ <= 16)
692  vector_size = 16;
693  else
694  vector_size = 32;
695  const team_policy_type policy(idiff, 1, vector_size);
696  Kokkos::parallel_for(
697  "AsyncableImport::TeamPolicy::copy",
698  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
699  const local_ordinal_type i = member.league_rank();
700  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
701  auto aptr = abase + blocksize_ * (i + idiff * j);
702  auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
703  if (std::is_same<PackTag, ToBuffer>::value)
704  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
705  aptr[k] = bptr[k];
706  });
707  else
708  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
709  bptr[k] = aptr[k];
710  });
711  });
712  });
713  } else {
714  const Kokkos::RangePolicy<execution_space> policy(0, idiff * num_vectors);
715  Kokkos::parallel_for(
716  "AsyncableImport::RangePolicy::copy",
717  policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
718  const local_ordinal_type i = ij % idiff;
719  const local_ordinal_type j = ij / idiff;
720  auto aptr = abase + blocksize_ * (i + idiff * j);
721  auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
722  auto from = std::is_same<PackTag, ToBuffer>::value ? bptr : aptr;
723  auto to = std::is_same<PackTag, ToBuffer>::value ? aptr : bptr;
724  memcpy(to, from, sizeof(impl_scalar_type) * blocksize_);
725  });
726  }
727  }
728 
732  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
733  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
734 
735 #ifdef HAVE_IFPACK2_MPI
736  // constants and reallocate data buffers if necessary
737  const local_ordinal_type num_vectors = mv.extent(1);
738  const local_ordinal_type mv_blocksize = blocksize * num_vectors;
739 
740  // receive async
741  for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
742  if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
743  irecv(comm,
744  reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
745  (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
746  pids.recv[i],
747  42,
748  &reqs.recv[i]);
749  } else {
750  irecv(comm,
751  reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
752  (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
753  pids.recv[i],
754  42,
755  &reqs.recv[i]);
756  }
757  }
758 
759  // send async
760  for (local_ordinal_type i = 0, iend = pids.send.extent(0); i < iend; ++i) {
761  copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i + 1),
762  mv, blocksize);
763  Kokkos::fence();
764  if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
765  isend(comm,
766  reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
767  (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
768  pids.send[i],
769  42,
770  &reqs.send[i]);
771  } else {
772  Kokkos::deep_copy(
773  Kokkos::subview(buffer_host.send,
774  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
775  offset_host.send(i) * mv_blocksize,
776  offset_host.send(i + 1) * mv_blocksize)),
777  Kokkos::subview(buffer.send,
778  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
779  offset_host.send(i) * mv_blocksize,
780  offset_host.send(i + 1) * mv_blocksize)));
781  isend(comm,
782  reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
783  (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
784  pids.send[i],
785  42,
786  &reqs.send[i]);
787  }
788  }
789 
790  // I find that issuing an Iprobe seems to nudge some MPI implementations
791  // into action, which helps with overlapped comm/comp performance.
792  for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
793  int flag;
794  MPI_Status stat;
795  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
796  }
797 #endif
798  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
799  }
800 
801  void syncRecvVar0() {
802  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
803 #ifdef HAVE_IFPACK2_MPI
804  // receive async.
805  for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
806  local_ordinal_type idx = i;
807  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
808  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
809  const local_ordinal_type num_vectors = remote_multivector.extent(1);
810  const local_ordinal_type mv_blocksize = blocksize * num_vectors;
811  Kokkos::deep_copy(
812  Kokkos::subview(buffer.recv,
813  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
814  offset_host.recv(idx) * mv_blocksize,
815  offset_host.recv(idx + 1) * mv_blocksize)),
816  Kokkos::subview(buffer_host.recv,
817  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
818  offset_host.recv(idx) * mv_blocksize,
819  offset_host.recv(idx + 1) * mv_blocksize)));
820  }
821  copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx + 1),
822  remote_multivector, blocksize);
823  }
824  // wait on the sends to match all Isends with a cleanup operation.
825  waitall(reqs.send.size(), reqs.send.data());
826 #endif
827  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
828  }
829 
833  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
834 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
835 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
836  asyncSendRecvVar1(mv);
837 #else
838  asyncSendRecvVar0(mv);
839 #endif
840 #else
841  asyncSendRecvVar0(mv);
842 #endif
843  }
844  void syncRecv() {
845 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
846 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
847  syncRecvVar1();
848 #else
849  syncRecvVar0();
850 #endif
851 #else
852  syncRecvVar0();
853 #endif
854  }
855 
856  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
857  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
858  asyncSendRecv(mv);
859  syncRecv();
860  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
861  }
862 
863  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
864 };
865 
866 template <typename ViewType1, typename ViewType2>
867 struct are_same_struct {
868  ViewType1 keys1;
869  ViewType2 keys2;
870 
871  are_same_struct(ViewType1 keys1_, ViewType2 keys2_)
872  : keys1(keys1_)
873  , keys2(keys2_) {}
874  KOKKOS_INLINE_FUNCTION
875  void operator()(int i, unsigned int &count) const {
876  if (keys1(i) != keys2(i)) count++;
877  }
878 };
879 
880 template <typename ViewType1, typename ViewType2>
881 bool are_same(ViewType1 keys1, ViewType2 keys2) {
882  unsigned int are_same_ = 0;
883 
884  Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
885  are_same_struct(keys1, keys2),
886  are_same_);
887  return are_same_ == 0;
888 }
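// Usage note (added commentary): are_same(a, b) counts mismatches over the
// extent of the first view in parallel and returns true only if every
// compared entry agrees. Below it tests whether the domain-map global IDs
// form the leading entries of the column-map global IDs.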
889 
893 template <typename MatrixType>
894 Teuchos::RCP<AsyncableImport<MatrixType>>
895 createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
896  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
897  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
898  using tpetra_map_type = typename impl_type::tpetra_map_type;
899  using local_ordinal_type = typename impl_type::local_ordinal_type;
900  using global_ordinal_type = typename impl_type::global_ordinal_type;
901  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
902  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
903  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
904  using global_indices_array_device_type = Kokkos::View<const global_ordinal_type *, typename tpetra_map_type::device_type>;
905 
906  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
907  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
908 
909  bool hasBlockCrsMatrix = !A_bcrs.is_null();
910 
911  // It is OK here to use the graph of the A_crs matrix and a block size of 1
912  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
913 
914  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
915  const auto domain_map = g.getDomainMap();
916  const auto column_map = g.getColMap();
917 
918  std::vector<global_ordinal_type> gids;
919 
920  Kokkos::Subview<global_indices_array_device_type, std::pair<int, int>> column_map_global_iD_last;
921 
922  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
923  {
924  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);
925 
926  global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
927  global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();
928 
929  if (are_same(domain_map_global_iD, column_map_global_iD)) {
930  // this should be the most likely path
931  separate_remotes = true;
932  need_owned_permutation = false;
933 
934  column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
935  std::pair<int, int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
936  } else {
937  // This loop is relatively expensive
938  for (size_t i = 0; i < column_map->getLocalNumElements(); ++i) {
939  const global_ordinal_type gid = column_map->getGlobalElement(i);
940  if (!domain_map->isNodeGlobalElement(gid)) {
941  found_first = true;
942  gids.push_back(gid);
943  } else if (found_first) {
944  separate_remotes = false;
945  break;
946  }
947  if (!found_first && !need_owned_permutation &&
948  domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
949  // The owned part of the domain and column maps are different
950  // orderings. We *could* do a super efficient impl of this case in the
951  // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
952  // really, if a caller cares about speed, they wouldn't make different
953  // local permutations like this. So we punt on the best impl and go for
954  // a pretty good one: the permutation is done in place in
955  // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
956  // is the presumably worse memory access pattern of the input vector.
957  need_owned_permutation = true;
958  }
959  }
960  }
961  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
962  }
963 
964  if (separate_remotes) {
965  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
966  const global_ordinal_type invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
967  const auto parsimonious_col_map = need_owned_permutation ? Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())) : Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
968  if (parsimonious_col_map->getGlobalNumElements() > 0) {
969  // make the importer only if needed.
970  local_ordinal_type_1d_view dm2cm;
971  if (need_owned_permutation) {
972  dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
973  const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
974  for (size_t i = 0; i < domain_map->getLocalNumElements(); ++i)
975  dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
976  Kokkos::deep_copy(dm2cm, dm2cm_host);
977  }
978  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
979  return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
980  }
981  }
982  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
983  return Teuchos::null;
984 }
985 
986 template <typename local_ordinal_type>
987 local_ordinal_type costTRSM(const local_ordinal_type block_size) {
988  return block_size * block_size;
989 }
990 
991 template <typename local_ordinal_type>
992 local_ordinal_type costGEMV(const local_ordinal_type block_size) {
993  return 2 * block_size * block_size;
994 }
995 
996 template <typename local_ordinal_type>
997 local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
998  return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length - 1) * costGEMV(block_size);
999 }
1000 
1001 template <typename local_ordinal_type>
1002 local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
1003  const local_ordinal_type num_teams,
1004  const local_ordinal_type line_length,
1005  const local_ordinal_type block_size,
1006  const local_ordinal_type n_subparts_per_part) {
1007  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part - 1) * 2) / n_subparts_per_part);
1008  if (subline_length < 1) {
1009  return INT_MAX;
1010  }
1011 
1012  const local_ordinal_type p_n_lines = ceil(double(num_parts) / num_teams);
1013  const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part) * num_parts / num_teams);
1014  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part - 1) * num_parts / num_teams);
1015 
1016  const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
1017  const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part - 1) * 2, block_size);
1018  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length, block_size);
1019  const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
1020 
1021  if (n_subparts_per_part == 1) {
1022  return p_costApplyAinv;
1023  }
1024  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
1025 }
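// Worked example (added commentary): with block_size = 5, costTRSM = 25 and
// costGEMV = 50 in this flop-count model, so a subline of length 10 costs
//   costTriDiagSolve(10, 5) = 2 * 10 * 25 + 2 * 9 * 50 = 1400.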
1026 
1027 template <typename local_ordinal_type>
1028 local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
1029  const local_ordinal_type num_teams,
1030  const local_ordinal_type line_length,
1031  const local_ordinal_type block_size) {
1032  // BMK: replaced theoretical model with empirical model
1033  // This is a linear regression based on data from a grid search.
1034  // The independent terms in the regression are:
1035  // - "parallelism surplus" - smaller when problem has enough lines to saturate GPU, larger otherwise
1036  // - log2 of the line length
1037  // - block size
1038  double parallelismSurplus = Kokkos::sqrt((double)num_teams / num_parts);
1039  double logLineLength = Kokkos::log2((double)line_length);
1040  (void)logLineLength;  // silences unused-variable warnings on branches that do not use it
1041  // Directly predict with linear model
1042 #if defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU)
1043  // MI300-specific data
1044  double modeled = -9.2312 + 4.6946 * parallelismSurplus + 0.4095 * block_size + 0.966 * logLineLength;
1045  // Do not split lines if there is plenty of parallelism
1046  if (parallelismSurplus < 0.3)
1047  modeled = 1;
1048 #elif defined(KOKKOS_ARCH_HOPPER) || defined(KOKKOS_ARCH_BLACKWELL)
1049  // Based on H100 data
1050  double modeled = -9.6053 + 4.7477 * parallelismSurplus + 0.2338 * block_size + 1.0794 * logLineLength;
1051  // On H100, performance degrades rapidly if small lines are split too many times
1052  double maxSplit = (double)line_length / 8;
1053  if (modeled > maxSplit)
1054  modeled = maxSplit;
1055 #elif defined(KOKKOS_ENABLE_CUDA)
1056  // Based on V100 data, line splitting is profitable in fewer cases
1057  // (only when there are few, long lines)
1058  double modeled = 1;
1059  if (parallelismSurplus > 1 && line_length > 64)
1060  modeled = 4;
1061 #elif defined(KOKKOS_ENABLE_HIP)
1062  // Based on MI250X data
1063  double modeled = -8.6214 + 7.3468 * parallelismSurplus + 0.3596 * block_size + 0.6673 * logLineLength;
1064 #else
1065  // GPUs other than CUDA or HIP: default to simple model that works for V100
1066  double modeled = 1;
1067  if (parallelismSurplus > 1 && line_length > 64)
1068  modeled = 4;
1069 #endif
1070 
1071  // Round to nearest integer
1072  local_ordinal_type n_subparts_per_part = 0.5 + modeled;
1073  // Do not split lines if there is plenty of parallelism available
1074  if (parallelismSurplus < 0.3)
1075  n_subparts_per_part = 1;
1076  // Clamp the result to valid range
1077  // Criteria for valid n_subparts_per_part (where connection_length is 2 for wide separators)
1078  // line_length >= n_subparts_per_part + (n_subparts_per_part - 1) * connection_length
1079  // Equivalently:
1080  // line_length >= n_subparts_per_part + n_subparts_per_part * 2 - 2
1081  // line_length >= 3 * n_subparts_per_part - 2
1082  local_ordinal_type min_subparts_per_part = 1;
1083  local_ordinal_type max_subparts_per_part = (line_length + 2) / 3;
1084  // Limit memory usage from too many sublines
1085  if (max_subparts_per_part > 16)
1086  max_subparts_per_part = 16;
1087  if (n_subparts_per_part < min_subparts_per_part)
1088  n_subparts_per_part = min_subparts_per_part;
1089  if (n_subparts_per_part > max_subparts_per_part)
1090  n_subparts_per_part = max_subparts_per_part;
1091  return n_subparts_per_part;
1092 }
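// Worked example (added commentary, using the MI250X coefficients above):
// num_parts = 100 and num_teams = 400 give parallelismSurplus = sqrt(4) = 2;
// with block_size = 5 and line_length = 64 (logLineLength = 6),
//   modeled = -8.6214 + 7.3468 * 2 + 0.3596 * 5 + 0.6673 * 6 ~= 11.9,
// which rounds to 12 and lies within the clamp range [1, min((64 + 2) / 3, 16)] = [1, 16].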
1093 
1094 template <typename ArgActiveExecutionMemorySpace>
1095 struct SolveTridiagsDefaultModeAndAlgo;
1096 
1100 template <typename MatrixType>
1101 BlockHelperDetails::PartInterface<MatrixType>
1102 createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1103  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
1104  const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type>> &partitions,
1105  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1106  IFPACK2_BLOCKHELPER_TIMER("createPartInterface", createPartInterface);
1107  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1108  using local_ordinal_type = typename impl_type::local_ordinal_type;
1109  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1110  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
1111  using size_type = typename impl_type::size_type;
1112 
1113  auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
1114 
1115  TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
1116  const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
1117  constexpr int vector_length = impl_type::vector_length;
1118  constexpr int internal_vector_length = impl_type::internal_vector_length;
1119 
1120  const auto comm = A->getRowMap()->getComm();
1121 
1122  BlockHelperDetails::PartInterface<MatrixType> interf;
1123 
1124  const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
1125  const bool jacobi = partitions.size() == 0 || partitions.size() == A_n_lclrows;
1126  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1127 
1128  typedef std::pair<local_ordinal_type, local_ordinal_type> size_idx_pair_type;
1129  std::vector<size_idx_pair_type> partsz(nparts);
1130 
1131  if (!jacobi) {
1132  for (local_ordinal_type i = 0; i < nparts; ++i)
1133  partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1134  std::sort(partsz.begin(), partsz.end(),
1135  [](const size_idx_pair_type &x, const size_idx_pair_type &y) {
1136  return x.first > y.first;
1137  });
1138  }
1139 
1140  local_ordinal_type n_subparts_per_part;
1141  if (jacobi) {
1142  n_subparts_per_part = 1;
1143  } else {
1144  if (n_subparts_per_part_in == -1) {
1145  // If the number of subparts is set to -1, the user lets the algorithm
1146  // decide the value automatically
1147  using execution_space = typename impl_type::execution_space;
1148 
1149  // Line splitting only benefits GPUs
1150  if constexpr (impl_type::node_type::is_gpu) {
1151  const int line_length = partsz[0].first;
1152 
1153  const local_ordinal_type team_size =
1154  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1155  recommended_team_size(blocksize, vector_length, internal_vector_length);
1156 
1157  const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
1158  n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1159 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1160  printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1161 #endif
1162  } else {
1163  n_subparts_per_part = 1;
1164 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1165  printf("Automatically chosen n_subparts_per_part = 1 for CPU backend\n");
1166 #endif
1167  }
1168  } else {
1169  n_subparts_per_part = n_subparts_per_part_in;
1170  }
1171  }
1172 
1173  // Total number of sub lines:
1174  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1175  // Total number of sub lines + the Schur complement blocks.
1176  // For a given line, 2 sub lines imply one Schur complement, 3 sub lines imply two Schur complements, etc.
1177  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part - 1);
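// Example (added commentary): nparts = 3 lines with n_subparts_per_part = 2
// give n_sub_parts = 6 sub lines plus 3 Schur blocks, so
// n_sub_parts_and_schur = 6 + 3 * (2 - 1) = 9.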
1178 
1179 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1180  local_ordinal_type nrows = 0;
1181  if (jacobi)
1182  nrows = nparts;
1183  else
1184  for (local_ordinal_type i = 0; i < nparts; ++i) nrows += partitions[i].size();
1185 
1186  TEUCHOS_TEST_FOR_EXCEPT_MSG(nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1187  << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
1188 #endif
1189 
1190  // permutation vector
1191  std::vector<local_ordinal_type> p;
1192  if (jacobi) {
1193  interf.max_partsz = 1;
1194  interf.max_subpartsz = 0;
1195  interf.n_subparts_per_part = 1;
1196  interf.nparts = nparts;
1197  } else {
1198  // reorder parts to maximize simd packing efficiency
1199  p.resize(nparts);
1200 
1201  for (local_ordinal_type i = 0; i < nparts; ++i)
1202  p[i] = partsz[i].second;
1203 
1204  interf.max_partsz = partsz[0].first;
1205 
1206  constexpr local_ordinal_type connection_length = 2;
1207  const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1208  const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1209 
1210  interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1211  interf.n_subparts_per_part = n_subparts_per_part;
1212  interf.nparts = nparts;
1213  }
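// Example (added commentary): max_partsz = 10 with n_subparts_per_part = 3
// and connection_length = 2 gives sub_line_length = (10 - 4) / 3 = 2 and
// last_sub_line_length = 10 - 2 * (2 + 2) = 2, hence max_subpartsz = 2.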
1214 
1215  // allocate parts
1216  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1217  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1218  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1219  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1220  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1221 
1222  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1223  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1224  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1225 
1226  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);
1227 
1228  // mirror to host and compute on host execution space
1229  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1230  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1231 
1232  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1233  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1234  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1235  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1236 
1237  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1238  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1239  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1240 
1241  // Determine parts.
1242  interf.row_contiguous = true;
1243  partptr(0) = 0;
1244  part2rowidx0(0) = 0;
1245  part2packrowidx0(0) = 0;
1246  local_ordinal_type pack_nrows = 0;
1247  local_ordinal_type pack_nrows_sub = 0;
1248  if (jacobi) {
1249  IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
1250  // Jacobi (all lines have length 1) means that A_n_lclrows == nparts,
1251  // so the mapping between parts and rows is trivial.
1252  // Note: we can leave interf.row_contiguous = true, since for all i: lclrow(i) == i
1253  for (local_ordinal_type i = 0; i <= nparts; ++i) {
1254  part2rowidx0(i) = i;
1255  partptr(i) = i;
1256  }
1257  for (local_ordinal_type i = 0; i < nparts; ++i) {
1258  rowidx2part(i) = i;
1259  lclrow(i) = i;
1260  }
1261  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1262  // assume no overlap.
1263  if (ip % vector_length == 0) pack_nrows = 1;
1264  part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1265  }
1266  part2rowidx0_sub(0) = 0;
1267  partptr_sub(0, 0) = 0;
1268 
1269  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1270  constexpr local_ordinal_type ipnrows = 1;
1271  const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1272 
1273  TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1274  "In the part " << ip << ", the line length " << full_line_length << " differs from the expected length " << ipnrows << ".");
1275 
1276  constexpr local_ordinal_type connection_length = 2;
1277 
1278  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1279  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
1280  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1281 
1282  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1283  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1284 
1285  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1286 
1287  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1288  const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1289  const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1290  if (local_sub_ip != n_subparts_per_part - 1) {
1291  if (local_sub_ip != 0) {
1292  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1293  } else if (ip != 0) {
1294  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1295  }
1296  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1297  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1298  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1299 
1300  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1301  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1302 
1303 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1304  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1305  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", schur_ip, partptr_sub(schur_ip, 0), connection_length);
1306 #endif
1307  } else {
1308  if (local_sub_ip != 0) {
1309  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1310  } else if (ip != 0) {
1311  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1312  }
1313  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1314 
1315  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1316 
1317 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1318  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1319 #endif
1320  }
1321  }
1322  }
1323 
1324 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1325  std::cout << "partptr_sub = " << std::endl;
1326  for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1327  for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1328  std::cout << partptr_sub(i, j) << " ";
1329  }
1330  std::cout << std::endl;
1331  }
1332  std::cout << "partptr_sub end" << std::endl;
1333 #endif
1334 
1335  {
1336  local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1337 
1338  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1339  for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1340  part2packrowidx0_sub(ip, 0) = 0;
1341  }
1342  for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1343  if (ipack != 0) {
1344  local_ordinal_type ip_min = ipack * vector_length;
1345  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1346  for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1347  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1348  }
1349  }
1350 
1351  for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1352  local_ordinal_type ip_min = ipack * vector_length;
1353  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1354 
1355  const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1356 
1357  constexpr local_ordinal_type connection_length = 2;
1358 
1359  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1360  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1361 
1362  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1363  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1364  if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1365 
1366  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1367 
1368  for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1369  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1370  }
1371  }
1372  }
1373 
1374  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1375  }
1376  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1377  } else {
1378  IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
1379  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1380  const auto *part = &partitions[p[ip]];
1381  const local_ordinal_type ipnrows = part->size();
1382  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip - 1]].size())));
1383  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1384  BlockHelperDetails::get_msg_prefix(comm)
1385  << "partition " << p[ip]
1386  << " is empty, which is not allowed.");
1387  // Assume no overlap.
1388  part2rowidx0(ip + 1) = part2rowidx0(ip) + ipnrows;
1389  // Since parts are ordered in decreasing size, the size of the first
1390  // part in a pack is the size for all parts in the pack.
1391  if (ip % vector_length == 0) pack_nrows = ipnrows;
1392  part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1393  const local_ordinal_type offset = partptr(ip);
1394  for (local_ordinal_type i = 0; i < ipnrows; ++i) {
1395  const auto lcl_row = (*part)[i];
1396  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1397  BlockHelperDetails::get_msg_prefix(comm)
1398  << "partitions[" << p[ip] << "]["
1399  << i << "] = " << lcl_row
1400  << " but input matrix implies limits of [0, " << A_n_lclrows - 1
1401  << "].");
1402  lclrow(offset + i) = lcl_row;
1403  rowidx2part(offset + i) = ip;
1404  if (interf.row_contiguous && offset + i > 0 && lclrow((offset + i) - 1) + 1 != lcl_row)
1405  interf.row_contiguous = false;
1406  }
1407  partptr(ip + 1) = offset + ipnrows;
1408 
1409 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1410  printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1411  printf("partptr(%d+1) = %d\n", ip, partptr(ip + 1));
1412 #endif
1413  }
1414 
1415  part2rowidx0_sub(0) = 0;
1416  partptr_sub(0, 0) = 0;
1417  // const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1418 
1419  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1420  const auto *part = &partitions[p[ip]];
1421  const local_ordinal_type ipnrows = part->size();
1422  const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1423 
1424  TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1425  "In part " << ip << ": line length " << full_line_length << " does not match the expected number of rows " << ipnrows << ".");
1426 
1427  constexpr local_ordinal_type connection_length = 2;
1428 
1429  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1430  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
1431  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1432 
1433  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1434  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
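  // Worked example (hypothetical numbers): full_line_length = 20 with
  // n_subparts_per_part = 3 gives sub_line_length = (20 - 2*2) / 3 = 5 and
  // last_sub_line_length = 20 - 2*(2 + 5) = 6, i.e. the line is split as
  // [5 | 2 | 5 | 2 | 6], each 2 being a Schur connection between sub-lines.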
1435 
1436  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1437 
1438  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1439  const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1440  const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1441  if (local_sub_ip != n_subparts_per_part - 1) {
1442  if (local_sub_ip != 0) {
1443  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1444  } else if (ip != 0) {
1445  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1446  }
1447  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1448  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1449  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1450 
1451  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1452  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1453 
1454 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1455  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1456  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", schur_ip, partptr_sub(schur_ip, 0), connection_length);
1457 #endif
1458  } else {
1459  if (local_sub_ip != 0) {
1460  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1461  } else if (ip != 0) {
1462  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1463  }
1464  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1465 
1466  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1467 
1468 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1469  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1470 #endif
1471  }
1472  }
1473  }
1474 
1475  {
1476  local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1477 
1478  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1479  for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1480  part2packrowidx0_sub(ip, 0) = 0;
1481  }
1482  for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1483  if (ipack != 0) {
1484  local_ordinal_type ip_min = ipack * vector_length;
1485  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1486  for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1487  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1488  }
1489  }
1490 
1491  for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1492  local_ordinal_type ip_min = ipack * vector_length;
1493  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1494 
1495  const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1496 
1497  constexpr local_ordinal_type connection_length = 2;
1498 
1499  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1500  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1501 
1502  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1503  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1504  if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1505 
1506  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1507 
1508  for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1509  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1510  }
1511  }
1512  }
1513 
1514  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1515  }
1516  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1517  }
1518 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1519  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1520 #endif
1521  if (lclrow(0) != 0) interf.row_contiguous = false;
1522 
1523  Kokkos::deep_copy(interf.partptr, partptr);
1524  Kokkos::deep_copy(interf.lclrow, lclrow);
1525 
1526  Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1527 
1528  // Assume no overlap. Thus:
1529  interf.part2rowidx0 = interf.partptr;
1530  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1531 
1532  interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1533  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1534 
1535  { // Fill packptr.
1536  IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
1537  // npacks is counted below: a new pack starts wherever part2packrowidx0 increases.
1538  local_ordinal_type npacks = 0;
1539  for (local_ordinal_type ip = 1; ip <= nparts; ++ip) // n_sub_parts_and_schur
1540  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1541  ++npacks;
1542 
1543  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1544  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1545  packptr(0) = 0;
1546  for (local_ordinal_type ip = 1, k = 1; ip <= nparts; ++ip)
1547  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1548  packptr(k++) = ip;
1549 
1550  Kokkos::deep_copy(interf.packptr, packptr);
1551 
1552  local_ordinal_type npacks_per_subpart = ceil(float(nparts) / vector_length);
1553  npacks = npacks_per_subpart * (part2packrowidx0_sub.extent(1) - 1);
1554 
1555  interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart * n_subparts_per_part);
1556  interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart, n_subparts_per_part - 1);
1557 
1558  const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1559  const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1560 
1561  // Fill packindices_sub and packindices_schur
1562  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part - 1; ++local_sub_ip) {
1563  for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1564  packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1565  packindices_schur(local_pack_ip, local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1566  }
1567  }
1568 
1569  for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1570  packindices_sub((n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip;
1571  }
1572 
1573 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1574  std::cout << "packindices_sub = " << std::endl;
1575  for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1576  std::cout << packindices_sub(i) << " ";
1577  }
1578  std::cout << std::endl;
1579  std::cout << "packindices_sub end" << std::endl;
1580 
1581  std::cout << "packindices_schur = " << std::endl;
1582  for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1583  for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1584  std::cout << packindices_schur(i, j) << " ";
1585  }
1586  std::cout << std::endl;
1587  }
1588 
1589  std::cout << "packindices_schur end" << std::endl;
1590 #endif
1591 
1592  Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1593  Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1594 
1595  interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1596  const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1597  packptr_sub(0) = 0;
1598  for (local_ordinal_type k = 0; k < npacks + 1; ++k)
1599  packptr_sub(k) = packptr(k % npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
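  // e.g. (illustrative): nparts = 8 and vector_length = 4 give npacks_per_subpart = 2
  // and packptr = [0, 4, 8]; the formula above tiles those pack boundaries once per
  // sub-part and Schur system, yielding packptr_sub = [0, 4, 8, 12, 16, ...].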
1600 
1601  Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1602  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1603  }
1604  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1605 
1606  return interf;
1607 }
1608 
1612 template <typename MatrixType>
1613 struct BlockTridiags {
1614  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1615  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1616  using size_type_1d_view = typename impl_type::size_type_1d_view;
1617  using size_type_2d_view = typename impl_type::size_type_2d_view;
1618  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1619  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1620  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1621  using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
1622 
1623  // flat_td_ptr(i) is the index into flat-array values of the start of the
1624  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1625  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1626  // vector_length is the position in the pack.
1627  size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1628  // List of local column indices into A from which to grab
1629  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1630  local_ordinal_type_1d_view A_colindsub;
1631  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1632  // tridiag's pack, and i % vector_length gives the position in the pack.
1633  vector_type_3d_view values;
1634  // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1635  // Schur's pack, and i % vector_length gives the position in the pack.
1636  vector_type_3d_view values_schur;
1637  // inv(A_00)*A_01 block values.
1638  vector_type_4d_view e_values;
1639  // If doing Schur line splitting: space for permuted version of X,
1640  // to be used during the Schur complement block solves (SolveTridiags, SingleVectorSchurTag).
1641  // Otherwise, this is not allocated.
1642  internal_vector_type_3d_view X_internal_vector_values_schur;
1643 
1644  // The following are for fused block Jacobi only.
1645  // For block row i, diag_offsets(i) ... diag_offsets(i) + bs^2
1646  // is the range of scalars for the diagonal block (bs = blocksize).
1647  size_type_1d_view diag_offsets;
1648  // For fused residual+solve block Jacobi case,
1649  // this contains the diagonal block inverses in flat, local row indexing:
1650  // d_inv(row, :, :) gives the row-major block for row.
1651  btdm_scalar_type_3d_view d_inv;
1652 
1653  bool is_diagonal_only;
1654 
1655  BlockTridiags() = default;
1656  BlockTridiags(const BlockTridiags &b) = default;
1657 
1658  // Index into row-major block of a tridiag.
1659  template <typename idx_type>
1660  static KOKKOS_FORCEINLINE_FUNCTION
1661  idx_type
1662  IndexToRow(const idx_type &ind) { return (ind + 1) / 3; }
1663  // Given a row of a row-major tridiag, return the index of the first block
1664  // in that row.
1665  template <typename idx_type>
1666  static KOKKOS_FORCEINLINE_FUNCTION
1667  idx_type
1668  RowToIndex(const idx_type &row) { return row > 0 ? 3 * row - 1 : 0; }
1669  // Number of blocks in a tridiag having a given number of rows.
1670  template <typename idx_type>
1671  static KOKKOS_FORCEINLINE_FUNCTION
1672  idx_type
1673  NumBlocks(const idx_type &nrows) { return nrows > 0 ? 3 * nrows - 2 : 0; }
1674  // Number of blocks associated to a Schur complement having a given number of rows.
1675  template <typename idx_type>
1676  static KOKKOS_FORCEINLINE_FUNCTION
1677  idx_type
1678  NumBlocksSchur(const idx_type &nrows) { return nrows > 0 ? 3 * nrows + 2 : 0; }
1679 };
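// Illustration of the row-major layout the helpers above assume (not from the
// source): a tridiag with nrows = 3 stores its blocks as
//   d0 u0 | l1 d1 u1 | l2 d2,   so NumBlocks(3) = 3*3 - 2 = 7,
// RowToIndex(1) = 3*1 - 1 = 2 (the l1 block), and IndexToRow(4) = (4+1)/3 = 1 (u1).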
1680 
1684 template <typename MatrixType>
1685 BlockTridiags<MatrixType>
1686 createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1687  IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
1688  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1689  using execution_space = typename impl_type::execution_space;
1690  using local_ordinal_type = typename impl_type::local_ordinal_type;
1691  using size_type = typename impl_type::size_type;
1692  using size_type_2d_view = typename impl_type::size_type_2d_view;
1693 
1694  constexpr int vector_length = impl_type::vector_length;
1695 
1695 
1696  BlockTridiags<MatrixType> btdm;
1697 
1698  const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1699 
1700  { // construct the flat index pointers into the tridiag values array.
1701  btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1702  const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part);
1703  Kokkos::parallel_scan(
1704  "createBlockTridiags::RangePolicy::flat_td_ptr",
1705  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1706  const local_ordinal_type partidx = i / (2 * interf.n_subparts_per_part);
1707  const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1708 
1709  if (final) {
1710  btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1711  }
1712  if (local_subpartidx != (2 * interf.n_subparts_per_part - 1)) {
1713  const local_ordinal_type nrows = interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 1) - interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 0);
1714  if (local_subpartidx % 2 == 0)
1715  update += btdm.NumBlocks(nrows);
1716  else
1717  update += btdm.NumBlocksSchur(nrows);
1718  }
1719  });
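  // Scan example (illustrative): one part split as [5 | 2 | 6] with
  // n_subparts_per_part = 2 yields flat_td_ptr = [0, 13, 21, 37], since
  // NumBlocks(5) = 13, NumBlocksSchur(2) = 8, and NumBlocks(6) = 16.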
1720 
1721  const auto nblocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts - 1, 2 * interf.n_subparts_per_part - 1));
1722  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1723  }
1724 
1725  // And the packed index pointers.
1726  if (vector_length == 1) {
1727  btdm.pack_td_ptr = btdm.flat_td_ptr;
1728  } else {
1729  // const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1730 
1731  local_ordinal_type npacks_per_subpart = 0;
1732  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1733  Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1734  for (local_ordinal_type ip = 1; ip <= interf.nparts; ++ip) // n_sub_parts_and_schur
1735  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1736  ++npacks_per_subpart;
1737 
1738  btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1739  const Kokkos::RangePolicy<execution_space> policy(0, npacks_per_subpart);
1740 
1741  Kokkos::parallel_for(
1742  "createBlockTridiags::RangePolicy::pack_td_ptr",
1743  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1744  for (local_ordinal_type j = 0; j < 2 * interf.n_subparts_per_part; ++j) {
1745  const local_ordinal_type pack_id = (j == 2 * interf.n_subparts_per_part - 1) ? i + (j - 1) * npacks_per_subpart : i + j * npacks_per_subpart;
1746  const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id + 1) - interf.packptr_sub(pack_id);
1747 
1748  const local_ordinal_type parti = interf.packptr_sub(pack_id);
1749  const local_ordinal_type partidx = parti % interf.nparts;
1750 
1751  for (local_ordinal_type pti = 0; pti < nparts_in_pack; ++pti) {
1752  btdm.pack_td_ptr(partidx + pti, j) = btdm.flat_td_ptr(i, j);
1753  }
1754  }
1755  });
1756  }
1757 
1758  btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1759 
1760  const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1761  constexpr local_ordinal_type connection_length = 2;
1762 
1763  host_pack_td_ptr_schur(0, 0) = 0;
1764  for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1765  if (i % vector_length == 0) {
1766  if (i != 0)
1767  host_pack_td_ptr_schur(i, 0) = host_pack_td_ptr_schur(i - 1, host_pack_td_ptr_schur.extent(1) - 1);
1768  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part - 1; ++j) {
1769  host_pack_td_ptr_schur(i, j + 1) = host_pack_td_ptr_schur(i, j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part - 2 ? 1 : 0);
1770  }
1771  } else {
1772  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1773  host_pack_td_ptr_schur(i, j) = host_pack_td_ptr_schur(i - 1, j);
1774  }
1775  }
1776  }
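  // e.g. (illustrative): with n_subparts_per_part = 3 each pack leads two Schur
  // tridiags of connection_length = 2 rows (NumBlocks(2) = 4 blocks) plus one
  // coupling block each, so a pack's pointers advance as [0, 5, 10].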
1777 
1778  Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1779 
1780 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1781  const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1782  std::cout << "flat_td_ptr = " << std::endl;
1783  for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1784  for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1785  std::cout << host_flat_td_ptr(i, j) << " ";
1786  }
1787  std::cout << std::endl;
1788  }
1789  std::cout << "flat_td_ptr end" << std::endl;
1790 
1791  const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1792 
1793  std::cout << "pack_td_ptr = " << std::endl;
1794  for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1795  for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1796  std::cout << host_pack_td_ptr(i, j) << " ";
1797  }
1798  std::cout << std::endl;
1799  }
1800  std::cout << "pack_td_ptr end" << std::endl;
1801 
1802  std::cout << "pack_td_ptr_schur = " << std::endl;
1803  for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1804  for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1805  std::cout << host_pack_td_ptr_schur(i, j) << " ";
1806  }
1807  std::cout << std::endl;
1808  }
1809  std::cout << "pack_td_ptr_schur end" << std::endl;
1810 #endif
1811 
1812  // values and A_colindsub are created in the symbolic phase
1813  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1814 
1815  return btdm;
1816 }
1817 
1818 // Set the tridiags to identity (I) out to the full pack block size. That way, if a
1819 // tridiag within a pack is shorter than the longest one, the extra blocks are
1820 // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1821 // in the packed multivector are 0, and the tridiag LU reflects the extra I
1822 // blocks, then the solve proceeds as though the extra blocks weren't
1823 // present. Since this extra work is part of the SIMD calls, it's not actually
1824 // extra work. Instead, it means we don't have to insert checks, masks, or
1825 // quiet NaNs. This function has to be called just once, in the symbolic phase,
1826 // since the numeric phase fills in only the used entries, leaving these I
1827 // blocks intact.
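// For instance (illustrative): if a pack holds lines of lengths {5, 3} with
// vector_length = 2, the two padded rows of the shorter lane just solve I*x = 0,
// so the padding stays zero and never contaminates the real lane's solution.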
1828 template <typename MatrixType>
1829 void setTridiagsToIdentity(const BlockTridiags<MatrixType> &btdm,
1830  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
1831  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1832  using execution_space = typename impl_type::execution_space;
1833  using local_ordinal_type = typename impl_type::local_ordinal_type;
1834  using size_type_2d_view = typename impl_type::size_type_2d_view;
1835 
1836  const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1837  const local_ordinal_type blocksize = btdm.values.extent(1);
1838 
1839  {
1840  const int vector_length = impl_type::vector_length;
1841  const int internal_vector_length = impl_type::internal_vector_length;
1842 
1843  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1844  using internal_vector_type = typename impl_type::internal_vector_type;
1845  using internal_vector_type_4d_view =
1846  typename impl_type::internal_vector_type_4d_view;
1847 
1848  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1849  const internal_vector_type_4d_view values(reinterpret_cast<internal_vector_type *>(btdm.values.data()),
1850  btdm.values.extent(0),
1851  btdm.values.extent(1),
1852  btdm.values.extent(2),
1853  vector_length / internal_vector_length);
1854  const local_ordinal_type vector_loop_size = values.extent(3);
1855 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1856  local_ordinal_type total_team_size(0);
1857  if (blocksize <= 5)
1858  total_team_size = 32;
1859  else if (blocksize <= 9)
1860  total_team_size = 64;
1861  else if (blocksize <= 12)
1862  total_team_size = 96;
1863  else if (blocksize <= 16)
1864  total_team_size = 128;
1865  else if (blocksize <= 20)
1866  total_team_size = 160;
1867  else
1868  total_team_size = 160;
1869  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1870  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1871 #elif defined(KOKKOS_ENABLE_HIP)
1872  // FIXME: HIP
1873  // These settings might be completely wrong
1874  // will have to do some experiments to decide
1875  // what makes sense on AMD GPUs
1876  local_ordinal_type total_team_size(0);
1877  if (blocksize <= 5)
1878  total_team_size = 32;
1879  else if (blocksize <= 9)
1880  total_team_size = 64;
1881  else if (blocksize <= 12)
1882  total_team_size = 96;
1883  else if (blocksize <= 16)
1884  total_team_size = 128;
1885  else if (blocksize <= 20)
1886  total_team_size = 160;
1887  else
1888  total_team_size = 160;
1889  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1890  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1891 #elif defined(KOKKOS_ENABLE_SYCL)
1892  // SYCL: FIXME
1893  local_ordinal_type total_team_size(0);
1894  if (blocksize <= 5)
1895  total_team_size = 32;
1896  else if (blocksize <= 9)
1897  total_team_size = 64;
1898  else if (blocksize <= 12)
1899  total_team_size = 96;
1900  else if (blocksize <= 16)
1901  total_team_size = 128;
1902  else if (blocksize <= 20)
1903  total_team_size = 160;
1904  else
1905  total_team_size = 160;
1906  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1907  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1908 #else
1909  // Host architecture: team size is always one
1910  const team_policy_type policy(packptr.extent(0) - 1, 1, 1);
1911 #endif
1912  Kokkos::parallel_for(
1913  "setTridiagsToIdentity::TeamPolicy",
1914  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1915  const local_ordinal_type k = member.league_rank();
1916  const local_ordinal_type ibeg = pack_td_ptr(packptr(k), 0);
1917  const local_ordinal_type iend = pack_td_ptr(packptr(k), pack_td_ptr.extent(1) - 1);
1918 
1919  const local_ordinal_type diff = iend - ibeg;
1920  const local_ordinal_type icount = diff / 3 + (diff % 3 > 0);
1921  const btdm_scalar_type one(1);
1922  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
1923  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, icount), [&](const local_ordinal_type &ii) {
1924  const local_ordinal_type i = ibeg + ii * 3;
1925  for (local_ordinal_type j = 0; j < blocksize; ++j) {
1926  values(i, j, j, v) = one;
1927  }
1928  });
1929  });
1930  });
1931  }
1932 }
1933 
1937 template <typename MatrixType>
1938 void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1939  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1940  const BlockHelperDetails::PartInterface<MatrixType> &interf,
1941  BlockTridiags<MatrixType> &btdm,
1942  BlockHelperDetails::AmD<MatrixType> &amd,
1943  const bool overlap_communication_and_computation,
1944  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
1945  bool useSeqMethod,
1946  bool use_fused_jacobi) {
1947  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);
1948 
1949  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1950 
1951  using execution_space = typename impl_type::execution_space;
1952  using host_execution_space = typename impl_type::host_execution_space;
1953 
1954  using local_ordinal_type = typename impl_type::local_ordinal_type;
1955  using global_ordinal_type = typename impl_type::global_ordinal_type;
1956  using size_type = typename impl_type::size_type;
1957  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1958  using size_type_1d_view = typename impl_type::size_type_1d_view;
1959  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1960  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1961  using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
1962  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
1963  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1964  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1965 
1966  constexpr int vector_length = impl_type::vector_length;
1967  constexpr int internal_vector_length = impl_type::internal_vector_length;
1968 
1969  const auto comm = A->getRowMap()->getComm();
1970 
1971  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1972  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1973 
1974  bool hasBlockCrsMatrix = !A_bcrs.is_null();
1975  TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1976  const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows();
1977 
1978  // mirroring to host
1979  const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr);
1980  const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow);
1981  const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part);
1982  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1983  const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr);
1984 
1985  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1986 
1987  Kokkos::View<local_ordinal_type *, host_execution_space> col2row("col2row", A->getLocalNumCols());
1988 
1989  // find column to row map on host
1990 
1991  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1992  {
1993  const auto rowmap = g->getRowMap();
1994  const auto colmap = g->getColMap();
1995  const auto dommap = g->getDomainMap();
1996  TEUCHOS_ASSERT(!(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1997  rowmap->lazyPushToHost();
1998  colmap->lazyPushToHost();
1999  dommap->lazyPushToHost();
2000 
2001 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
2002  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2003  Kokkos::parallel_for(
2004  "performSymbolicPhase::RangePolicy::col2row",
2005  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2006  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
2007  TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
2008  if (dommap->isNodeGlobalElement(gid)) {
2009  const local_ordinal_type lc = colmap->getLocalElement(gid);
2010 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2011  TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
2012  BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
2013  << " gives an invalid local column.");
2014 #endif
2015  col2row(lc) = lr;
2016  }
2017  });
2018 #endif
2019  }
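  // At this point col2row maps each owned local column to its local row; columns
  // whose global ID is not in the domain map (ghost columns) keep the invalid marker.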
2020 
2021  // construct the D and R graphs in A = D + R.
2022  {
2023  const auto local_graph = g->getLocalGraphHost();
2024  const auto local_graph_rowptr = local_graph.row_map;
2025  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
2026  const auto local_graph_colidx = local_graph.entries;
2027 
2028  // Assume no overlap.
2029 
2030  Kokkos::View<local_ordinal_type *, host_execution_space> lclrow2idx("lclrow2idx", nrows);
2031  {
2032  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2033  Kokkos::parallel_for(
2034  "performSymbolicPhase::RangePolicy::lclrow2idx",
2035  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
2036  lclrow2idx[lclrow(i)] = i;
2037  });
2038  }
2039 
2040  // count (block) nnzs in D and R.
2041  typedef BlockHelperDetails::SumReducer<size_type, 3, host_execution_space> sum_reducer_type;
2042  typename sum_reducer_type::value_type sum_reducer_value;
2043  {
2044  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2045  Kokkos::parallel_reduce
2046  // profiling interface does not work
2047  ( //"performSymbolicPhase::RangePolicy::count_nnz",
2048  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
2049  // LID -> index.
2050  const local_ordinal_type ri0 = lclrow2idx[lr];
2051  const local_ordinal_type pi0 = rowidx2part(ri0);
2052  for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) {
2053  const local_ordinal_type lc = local_graph_colidx(j);
2054  const local_ordinal_type lc2r = col2row[lc];
2055  bool incr_R = false;
2056  do { // breakable
2057  if (lc2r == (local_ordinal_type)-1) {
2058  incr_R = true;
2059  break;
2060  }
2061  const local_ordinal_type ri = lclrow2idx[lc2r];
2062  const local_ordinal_type pi = rowidx2part(ri);
2063  if (pi != pi0) {
2064  incr_R = true;
2065  break;
2066  }
2067  // Test for being in the tridiag. This is done in index space. In
2068  // LID space, tridiag LIDs in a row are not necessarily related by
2069  // {-1, 0, 1}.
2070  if (ri0 + 1 >= ri && ri0 <= ri + 1)
2071  ++update.v[0]; // D_nnz
2072  else
2073  incr_R = true;
2074  } while (0);
2075  if (incr_R) {
2076  if (lc < nrows)
2077  ++update.v[1]; // R_nnz_owned
2078  else
2079  ++update.v[2]; // R_nnz_remote
2080  }
2081  }
2082  },
2083  sum_reducer_type(sum_reducer_value));
2084  }
2085  size_type D_nnz = sum_reducer_value.v[0];
2086  size_type R_nnz_owned = sum_reducer_value.v[1];
2087  size_type R_nnz_remote = sum_reducer_value.v[2];
2088 
2089  if (!overlap_communication_and_computation) {
2090  R_nnz_owned += R_nnz_remote;
2091  R_nnz_remote = 0;
2092  }
2093 
2094  // construct the D_00 graph.
2095  {
2096  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2097 
2098  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
2099  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2100 
2101 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2102  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2103 #endif
2104 
2105  const local_ordinal_type nparts = partptr.extent(0) - 1;
2106 
2107  {
2108  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2109  Kokkos::parallel_for(
2110  "performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2111  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
2112  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2113  local_ordinal_type offset = 0;
2114  for (local_ordinal_type ri0 = partptr(pi0); ri0 < partptr(pi0 + 1); ++ri0) {
2115  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2116  offset = 1;
2117  const local_ordinal_type lr0 = lclrow(ri0);
2118  const size_type j0 = local_graph_rowptr(lr0);
2119  for (size_type j = j0; j < local_graph_rowptr(lr0 + 1); ++j) {
2120  const local_ordinal_type lc = local_graph_colidx(j);
2121  const local_ordinal_type lc2r = col2row[lc];
2122  if (lc2r == (local_ordinal_type)-1) continue;
2123  const local_ordinal_type ri = lclrow2idx[lc2r];
2124  const local_ordinal_type pi = rowidx2part(ri);
2125  if (pi != pi0) continue;
2126  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
2127  const local_ordinal_type row_entry = j - j0;
2128  D_A_colindsub(flat_td_ptr(pi0, 0) + ((td_row_os + ri) - ri0)) = row_entry;
2129  }
2130  }
2131  });
2132  }
2133 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2134  for (size_t i = 0; i < D_A_colindsub.extent(0); ++i)
2135  TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2136 #endif
2137  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2138 
2139  // Allocate values.
2140  {
2141  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0) - 1, btdm.pack_td_ptr.extent(1) - 1);
2142  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2143  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2144 
2145  if (interf.n_subparts_per_part > 1) {
2146  const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0) - 1, btdm.pack_td_ptr_schur.extent(1) - 1);
2147  const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2148  btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2149  }
2150 
2151  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2152  }
2153  }
2154 
2155  // Construct the R graph.
2156  {
2157  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2158  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2159 
2160  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2161  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2162 
2163  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2164  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2165 
2166  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2167  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2168 
2169  {
2170  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2171  Kokkos::parallel_for(
2172  "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2173  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2174  const local_ordinal_type ri0 = lclrow2idx[lr];
2175  const local_ordinal_type pi0 = rowidx2part(ri0);
2176  const size_type j0 = local_graph_rowptr(lr);
2177  for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2178  const local_ordinal_type lc = local_graph_colidx(j);
2179  const local_ordinal_type lc2r = col2row[lc];
2180  if (lc2r != (local_ordinal_type)-1) {
2181  const local_ordinal_type ri = lclrow2idx[lc2r];
2182  const local_ordinal_type pi = rowidx2part(ri);
2183  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2184  continue;
2185  }
2186  }
2187  // exclusive scan will be performed later
2188  if (!overlap_communication_and_computation || lc < nrows) {
2189  ++R_rowptr(lr);
2190  } else {
2191  ++R_rowptr_remote(lr);
2192  }
2193  }
2194  });
2195  }
2196 
2197  // exclusive scan
2198  typedef BlockHelperDetails::ArrayValueType<size_type, 2> update_type;
2199  {
2200  Kokkos::RangePolicy<host_execution_space> policy(0, nrows + 1);
2201  Kokkos::parallel_scan(
2202  "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2203  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, update_type &update, const bool &final) {
2204  update_type val;
2205  val.v[0] = R_rowptr(lr);
2206  if (overlap_communication_and_computation)
2207  val.v[1] = R_rowptr_remote(lr);
2208 
2209  if (final) {
2210  R_rowptr(lr) = update.v[0];
2211  if (overlap_communication_and_computation)
2212  R_rowptr_remote(lr) = update.v[1];
2213 
2214  if (lr < nrows) {
2215  const local_ordinal_type ri0 = lclrow2idx[lr];
2216  const local_ordinal_type pi0 = rowidx2part(ri0);
2217 
2218  size_type cnt_rowptr = R_rowptr(lr);
2219  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // if !overlap_communication_and_computation, this value is never used
2220 
2221  const size_type j0 = local_graph_rowptr(lr);
2222  for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2223  const local_ordinal_type lc = local_graph_colidx(j);
2224  const local_ordinal_type lc2r = col2row[lc];
2225  if (lc2r != (local_ordinal_type)-1) {
2226  const local_ordinal_type ri = lclrow2idx[lc2r];
2227  const local_ordinal_type pi = rowidx2part(ri);
2228  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2229  continue;
2230  }
2231  const local_ordinal_type row_entry = j - j0;
2232  if (!overlap_communication_and_computation || lc < nrows)
2233  R_A_colindsub(cnt_rowptr++) = row_entry;
2234  else
2235  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2236  }
2237  }
2238  }
2239  update += val;
2240  });
2241  }
2242  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
2243  Kokkos::deep_copy(amd.rowptr, R_rowptr);
2244  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2245  if (overlap_communication_and_computation) {
2246  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
2247  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2248  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2249  }
2250 
2251  // Allocate or view values.
2252  if (hasBlockCrsMatrix)
2253  amd.tpetra_values = (const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst());
2254  else {
2255  amd.tpetra_values = (const_cast<crs_matrix_type *>(A_crs.get()))->getLocalValuesDevice(Tpetra::Access::ReadWrite);
2256  }
2257  }
2258 
2259  if (interf.n_subparts_per_part > 1) {
2260  // If doing Schur complement line splitting, allocate E and space for permuted X
2261  btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2262  btdm.X_internal_vector_values_schur = internal_vector_type_3d_view(
2263  do_not_initialize_tag("X_internal_vector_values_schur"),
2264  2 * (interf.n_subparts_per_part - 1) * interf.part2packrowidx0_sub.extent(0),
2265  blocksize,
2266  vector_length / internal_vector_length);
2267  }
2268  }
2269  // Precompute offsets of each A and x entry to speed up residual.
2270  // Applies if all of these are true:
2271  // - hasBlockCrsMatrix
2272  // - execution_space is a GPU
2273  // - !useSeqMethod (since this uses a different scheme for indexing A,x)
2274  //
2275  // Reading A, x take up to 4 and 6 levels of indirection respectively,
2276  // but precomputing the offsets reduces it to 2 for both (get index, then value)
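  // (Illustrative sketch of the saving: instead of chasing rowptr -> A_colindsub ->
  // block-row base -> scalar on every residual evaluation, the kernel loads one
  // precomputed flat offset and then the scalar.)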
2277  if (BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix) {
2278  bool is_async_importer_active = !async_importer.is_null();
2279  local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2280  bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2281  BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2282  }
2283 
2284  // If using fused block Jacobi path, allocate diagonal inverses here (d_inv) and find diagonal offsets.
2285  if (use_fused_jacobi) {
2286  btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize);
2287  auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2288  auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2289  btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2290  }
2291  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2292 }
2293 
2297 template <typename ArgActiveExecutionMemorySpace>
2298 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
2299 
2300 template <>
2301 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2302  typedef KB::Mode::Serial mode_type;
2303 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2304  typedef KB::Algo::Level3::CompactMKL algo_type;
2305 #else
2306  typedef KB::Algo::Level3::Blocked algo_type;
2307 #endif
2308  static int recommended_team_size(const int /* blksize */,
2309  const int /* vector_length */,
2310  const int /* internal_vector_length */) {
2311  return 1;
2312  }
2313 };
2314 
2315 #if defined(KOKKOS_ENABLE_CUDA)
2316 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
2317  const int vector_length,
2318  const int internal_vector_length) {
2319  const int vector_size = vector_length / internal_vector_length;
2320  int total_team_size(0);
2321  if (blksize <= 5)
2322  total_team_size = 32;
2323  else if (blksize <= 9)
2324  total_team_size = 32; // 64
2325  else if (blksize <= 12)
2326  total_team_size = 96;
2327  else if (blksize <= 16)
2328  total_team_size = 128;
2329  else if (blksize <= 20)
2330  total_team_size = 160;
2331  else
2332  total_team_size = 160;
2333  return 2 * total_team_size / vector_size;
2334 }
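// Example (illustrative): blksize = 10, vector_length = 8, internal_vector_length = 4
// give vector_size = 2 and total_team_size = 96, so the recommended team size is
// 2 * 96 / 2 = 96 threads per team.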
2335 template <>
2336 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2337  typedef KB::Mode::Team mode_type;
2338  typedef KB::Algo::Level3::Unblocked algo_type;
2339  static int recommended_team_size(const int blksize,
2340  const int vector_length,
2341  const int internal_vector_length) {
2342  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2343  }
2344 };
2345 template <>
2346 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2347  typedef KB::Mode::Team mode_type;
2348  typedef KB::Algo::Level3::Unblocked algo_type;
2349  static int recommended_team_size(const int blksize,
2350  const int vector_length,
2351  const int internal_vector_length) {
2352  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2353  }
2354 };
2355 #endif
2356 
2357 #if defined(KOKKOS_ENABLE_HIP)
2358 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
2359  const int vector_length,
2360  const int internal_vector_length) {
2361  const int vector_size = vector_length / internal_vector_length;
2362  int total_team_size(0);
2363  if (blksize <= 5)
2364  total_team_size = 32;
2365  else if (blksize <= 9)
2366  total_team_size = 32; // 64
2367  else if (blksize <= 12)
2368  total_team_size = 96;
2369  else if (blksize <= 16)
2370  total_team_size = 128;
2371  else if (blksize <= 20)
2372  total_team_size = 160;
2373  else
2374  total_team_size = 160;
2375  return 2 * total_team_size / vector_size;
2376 }
2377 template <>
2378 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2379  typedef KB::Mode::Team mode_type;
2380  typedef KB::Algo::Level3::Unblocked algo_type;
2381  static int recommended_team_size(const int blksize,
2382  const int vector_length,
2383  const int internal_vector_length) {
2384  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2385  }
2386 };
2387 template <>
2388 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2389  typedef KB::Mode::Team mode_type;
2390  typedef KB::Algo::Level3::Unblocked algo_type;
2391  static int recommended_team_size(const int blksize,
2392  const int vector_length,
2393  const int internal_vector_length) {
2394  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2395  }
2396 };
2397 #endif
2398 
2399 #if defined(KOKKOS_ENABLE_SYCL)
2400 static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
2401  const int vector_length,
2402  const int internal_vector_length) {
2403  const int vector_size = vector_length / internal_vector_length;
2404  int total_team_size(0);
2405  if (blksize <= 5)
2406  total_team_size = 32;
2407  else if (blksize <= 9)
2408  total_team_size = 32; // 64
2409  else if (blksize <= 12)
2410  total_team_size = 96;
2411  else if (blksize <= 16)
2412  total_team_size = 128;
2413  else if (blksize <= 20)
2414  total_team_size = 160;
2415  else
2416  total_team_size = 160;
2417  return 2 * total_team_size / vector_size;
2418 }
2419 template <>
2420 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2421  typedef KB::Mode::Team mode_type;
2422  typedef KB::Algo::Level3::Unblocked algo_type;
2423  static int recommended_team_size(const int blksize,
2424  const int vector_length,
2425  const int internal_vector_length) {
2426  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2427  }
2428 };
2429 template <>
2430 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2431  typedef KB::Mode::Team mode_type;
2432  typedef KB::Algo::Level3::Unblocked algo_type;
2433  static int recommended_team_size(const int blksize,
2434  const int vector_length,
2435  const int internal_vector_length) {
2436  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2437  }
2438 };
2439 #endif
2440 
2441 template <typename impl_type, typename WWViewType>
2442 KOKKOS_INLINE_FUNCTION void
2443 solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2444  const typename impl_type::local_ordinal_type & /* blocksize */,
2445  const typename impl_type::local_ordinal_type &i0,
2446  const typename impl_type::local_ordinal_type &r0,
2447  const typename impl_type::local_ordinal_type &nrows,
2448  const typename impl_type::local_ordinal_type &v,
2449  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2450  const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2451  const WWViewType &WW,
2452  const bool skip_first_pass = false) {
2453  using execution_space = typename impl_type::execution_space;
2454  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2455  using member_type = typename team_policy_type::member_type;
2456  using local_ordinal_type = typename impl_type::local_ordinal_type;
2457 
2458  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2459 
2460  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2461  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2462 
2463  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2464 
2465  // constant
2466 #if KOKKOS_VERSION >= 40799
2467  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2468 #else
2469  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2470 #endif
2471 #if KOKKOS_VERSION >= 40799
2472  const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2473 #else
2474  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2475 #endif
2476 
2477  // subview pattern
2478  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2479  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2480  auto X2 = X1;
2481 
2482  local_ordinal_type i = i0, r = r0;
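  // The nrows > 1 path below is block forward/back substitution over the packed
  // tridiag: the L-solve walks down the line computing
  //   x_r <- L_rr^{-1} (x_r - A_{r,r-1} x_{r-1})
  // (one Gemm update plus one unit-lower Trsm per row), then the U-solve walks
  // back up with x_r <- U_rr^{-1} (x_r - A_{r,r+1} x_{r+1}).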
2483 
2484  if (nrows > 1) {
2485  // solve Lx = x
2486  if (skip_first_pass) {
2487  i += (nrows - 2) * 3;
2488  r += (nrows - 2);
2489  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2490  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2491  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2492  KB::Trsm<member_type,
2493  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2494  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2495  X1.assign_data(X2.data());
2496  i += 3;
2497  } else {
2498  KB::Trsm<member_type,
2499  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2500  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2501  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
2502  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2503  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2504  member.team_barrier();
2505  KB::Gemm<member_type,
2506  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2507  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2508  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2509  KB::Trsm<member_type,
2510  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2511  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2512  X1.assign_data(X2.data());
2513  }
2514  }
2515 
2516  // solve Ux = x
2517  KB::Trsm<member_type,
2518  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2519  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2520  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2521  i -= 3;
2522  A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
2523  X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
2524  member.team_barrier();
2525  KB::Gemm<member_type,
2526  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2527  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2528 
2529  A.assign_data(&D_internal_vector_values(i, 0, 0, v));
2530  KB::Trsm<member_type,
2531  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2532  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2533  X1.assign_data(X2.data());
2534  }
2535  } else {
2536  // matrix is already inverted
2537  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2538  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
2539  member.team_barrier();
2540  KB::Gemm<member_type,
2541  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2542  default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
2543  }
2544 }
2545 
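// solveSingleVectorNew performs the same two-sweep solve as solveMultiVector
// above for a single right-hand side, but walks raw pointers using the views'
// strides through the KokkosBatched *_INTERNAL_INVOKE macros instead of
// taking subviews.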
2546 template <typename impl_type, typename WWViewType, typename XViewType>
2547 KOKKOS_INLINE_FUNCTION void
2548 solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2549  const typename impl_type::local_ordinal_type &blocksize,
2550  const typename impl_type::local_ordinal_type &i0,
2551  const typename impl_type::local_ordinal_type &r0,
2552  const typename impl_type::local_ordinal_type &nrows,
2553  const typename impl_type::local_ordinal_type &v,
2554  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2555  const XViewType &X_internal_vector_values, // Unmanaged<typename impl_type::internal_vector_type_4d_view>
2556  const WWViewType &WW) {
2557  using execution_space = typename impl_type::execution_space;
2558  // using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2559  // using member_type = typename team_policy_type::member_type;
2560  using local_ordinal_type = typename impl_type::local_ordinal_type;
2561 
2562  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2563 
2564  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2565  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2566 
2567  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2568 
2569  // base pointers
2570  auto A = D_internal_vector_values.data();
2571  auto X = X_internal_vector_values.data();
2572 
2573  // constant
2574 #if KOKKOS_VERSION >= 40799
2575  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2576 #else
2577  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2578 #endif
2579 #if KOKKOS_VERSION >= 40799
2580  const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2581 #else
2582  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2583 #endif
2584  // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2585 
2586  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2587  const local_ordinal_type astep = D_internal_vector_values.stride(0);
2588  const local_ordinal_type as0 = D_internal_vector_values.stride(1); // blocksize*vector_length;
2589  const local_ordinal_type as1 = D_internal_vector_values.stride(2); // vector_length;
2590  const local_ordinal_type xstep = X_internal_vector_values.stride(0);
2591  const local_ordinal_type xs0 = X_internal_vector_values.stride(1); // vector_length;
2592 
2593  // move to starting point
2594  A += i0 * astep + v;
2595  X += r0 * xstep + v;
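 // With the strides above, A + k * astep addresses the k-th block after i0
 // (with (as0, as1) indexing inside a block), and X + k * xstep addresses
 // the k-th block row of the right-hand side, both for vector lane v.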
2596 
2597  // for (local_ordinal_type col=0;col<num_vectors;++col)
2598  if (nrows > 1) {
2599  // solve Lx = x
2600  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2601  member,
2602  KB::Diag::Unit,
2603  blocksize, blocksize,
2604  one,
2605  A, as0, as1,
2606  X, xs0);
2607 
2608  for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
2609  member.team_barrier();
2610  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2611  member,
2612  blocksize, blocksize,
2613  -one,
2614  A + 2 * astep, as0, as1,
2615  X, xs0,
2616  one,
2617  X + 1 * xstep, xs0);
2618  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2619  member,
2620  KB::Diag::Unit,
2621  blocksize, blocksize,
2622  one,
2623  A + 3 * astep, as0, as1,
2624  X + 1 * xstep, xs0);
2625 
2626  A += 3 * astep;
2627  X += 1 * xstep;
2628  }
2629 
2630  // solve Ux = x
2631  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2632  member,
2633  KB::Diag::NonUnit,
2634  blocksize, blocksize,
2635  one,
2636  A, as0, as1,
2637  X, xs0);
2638 
2639  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2640  A -= 3 * astep;
2641  member.team_barrier();
2642  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2643  member,
2644  blocksize, blocksize,
2645  -one,
2646  A + 1 * astep, as0, as1,
2647  X, xs0,
2648  one,
2649  X - 1 * xstep, xs0);
2650  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2651  member,
2652  KB::Diag::NonUnit,
2653  blocksize, blocksize,
2654  one,
2655  A, as0, as1,
2656  X - 1 * xstep, xs0);
2657  X -= 1 * xstep;
2658  }
2659  // for multiple rhs
2660  // X += xs1;
2661  } else {
2662  const local_ordinal_type ws0 = WW.stride(0);
2663  auto W = WW.data() + v;
2664  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
2665  member, blocksize, X, xs0, W, ws0);
2666  member.team_barrier();
2667  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2668  member,
2669  blocksize, blocksize,
2670  one,
2671  A, as0, as1,
2672  W, ws0,
2673  zero,
2674  X, xs0);
2675  }
2676 }
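// A minimal scalar sketch of the recurrence both solvers implement
// (illustration only, assuming the per-row (diag, super, sub) block layout
// noted above; each diag block stores its LU factors L[r] U[r]):
//   x[0] <- L[0]^{-1} x[0]
//   for r = 1, ..., nrows-1:  x[r] <- L[r]^{-1} (x[r] - sub[r-1] * x[r-1])
//   x[nrows-1] <- U[nrows-1]^{-1} x[nrows-1]
//   for r = nrows-2, ..., 0:  x[r] <- U[r]^{-1} (x[r] - super[r] * x[r+1])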
2677 
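// Debug helpers: the three writers below dump the packed block-tridiagonal
// values and packed multivectors in MatrixMarket format. Each compiles to a
// no-op unless IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM is defined.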
2678 template <typename local_ordinal_type, typename ViewType>
2679 void writeBTDValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2680 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2681  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2682  std::ofstream myfile;
2683  myfile.open(fileName);
2684 
2685  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2686  local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2687  const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2688  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2689 
2690  const local_ordinal_type block_size = scalar_values.extent(1);
2691 
2692  const local_ordinal_type n_rows_per_part = (n_blocks_per_part + 2) / 3 * block_size;
2693  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2694 
2695  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2696 
2697  myfile << "%%MatrixMarket matrix coordinate real general" << std::endl;
2698  myfile << "%%nnz = " << nnz;
2699  myfile << " block size = " << block_size;
2700  myfile << " number of blocks = " << n_blocks;
2701  myfile << " number of parts = " << n_parts;
2702  myfile << " number of blocks per part = " << n_blocks_per_part;
2703  myfile << " number of rows = " << n_rows;
2704  myfile << " number of cols = " << n_rows;
2705  myfile << " number of packs = " << n_packs << std::endl;
2706 
2707  myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2708 
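 // Map a block's index within its part to its (row, col) block offsets:
 // i_block_in_part % 3 == 0 is a diagonal block, == 1 a superdiagonal block
 // (one block to the right), and == 2 a subdiagonal block (one block down).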
2709  local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2710  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2711  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2712  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2713  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2714  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2715  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2716  continue;
2717  if (i_block_in_part % 3 == 0) {
2718  current_row_offset = i_block_in_part / 3 * block_size;
2719  current_col_offset = i_block_in_part / 3 * block_size;
2720  } else if (i_block_in_part % 3 == 1) {
2721  current_row_offset = (i_block_in_part - 1) / 3 * block_size;
2722  current_col_offset = ((i_block_in_part - 1) / 3 + 1) * block_size;
2723  } else if (i_block_in_part % 3 == 2) {
2724  current_row_offset = ((i_block_in_part - 2) / 3 + 1) * block_size;
2725  current_col_offset = (i_block_in_part - 2) / 3 * block_size;
2726  }
2727  current_row_offset += current_part_idx * n_rows_per_part;
2728  current_col_offset += current_part_idx * n_rows_per_part;
2729  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2730  for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2731  current_row = current_row_offset + i_in_block + 1;
2732  current_col = current_col_offset + j_in_block + 1;
2733  myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2734  }
2735  }
2736  }
2737  }
2738  }
2739 
2740  myfile.close();
2741 #endif
2742 }
2743 
2744 template <typename local_ordinal_type, typename ViewType>
2745 void write4DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2746 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2747  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2748  std::ofstream myfile;
2749  myfile.open(fileName);
2750 
2751  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2752  const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2753  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2754 
2755  const local_ordinal_type block_size = scalar_values.extent(1);
2756  const local_ordinal_type n_cols = scalar_values.extent(2);
2757 
2758  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2759  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2760 
2761  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2762 
2763  myfile << "%%MatrixMarket matrix array real general" << std::endl;
2764  myfile << "%%block size = " << block_size;
2765  myfile << " number of blocks = " << n_blocks;
2766  myfile << " number of parts = " << n_parts;
2767  myfile << " number of blocks per part = " << n_blocks_per_part;
2768  myfile << " number of rows = " << n_rows;
2769  myfile << " number of cols = " << n_cols;
2770  myfile << " number of packs = " << n_packs << std::endl;
2771 
2772  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2773 
2774  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2775  (void)current_row_offset;
2776  (void)current_part_idx;
2777  for (local_ordinal_type j_in_block = 0; j_in_block < n_cols; ++j_in_block) {
2778  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2779  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2780  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2781  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2782  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2783 
2784  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2785  continue;
2786  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2787  myfile << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2788  }
2789  }
2790  }
2791  }
2792  }
2793  myfile.close();
2794 #endif
2795 }
2796 
2797 template <typename local_ordinal_type, typename ViewType>
2798 void write5DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2799 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2800  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2801  std::ofstream myfile;
2802  myfile.open(fileName);
2803 
2804  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2805  const local_ordinal_type n_blocks = scalar_values.extent(1) * n_parts_per_pack;
2806  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2807 
2808  const local_ordinal_type block_size = scalar_values.extent(2);
2809  const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2810  const local_ordinal_type n_cols = n_blocks_cols * block_size;
2811 
2812  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2813  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2814 
2815  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2816 
2817  myfile << "%%MatrixMarket matrix array real general" << std::endl;
2818  myfile << "%%block size = " << block_size;
2819  myfile << " number of blocks = " << n_blocks;
2820  myfile << " number of parts = " << n_parts;
2821  myfile << " number of blocks per part = " << n_blocks_per_part;
2822  myfile << " number of rows = " << n_rows;
2823  myfile << " number of cols = " << n_cols;
2824  myfile << " number of packs = " << n_packs << std::endl;
2825 
2826  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2827 
2828  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2829  (void)current_row_offset;
2830  (void)current_part_idx;
2831  for (local_ordinal_type i_block_col = 0; i_block_col < n_blocks_cols; ++i_block_col) {
2832  for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2833  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2834  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2835  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2836  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2837  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2838 
2839  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(1))
2840  continue;
2841  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2842  myfile << scalar_values(i_block_col, current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2843  }
2844  }
2845  }
2846  }
2847  }
2848  }
2849  myfile.close();
2850 #endif
2851 }
2852 
2853 template <typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2854 KOKKOS_INLINE_FUNCTION void
2855 copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2856  /*
2857  // Kokkos::Experimental::local_deep_copy
2858  auto teamVectorRange =
2859  Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2860  member, view1.extent(0), view1.extent(1), view1.extent(2));
2861 
2862  Kokkos::parallel_for
2863  (teamVectorRange,
2864  [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2865  view1(i,j,k) = view2(i,j,k);
2866  });
2867  */
2868  Kokkos::Experimental::local_deep_copy(member, view1, view2);
2869 }
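// ExtractAndFactorizeTridiags implements the numeric phase: it extracts the
// block tridiagonals (and, for the fused path, the block Jacobi diagonals)
// from the input matrix and factorizes them, using team scratch at the given
// ScratchLevel for the blocksize x blocksize workspaces.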
2870 template <typename MatrixType, int ScratchLevel>
2871 struct ExtractAndFactorizeTridiags {
2872  public:
2873  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2874  // a functor cannot define both device_type and execution_space; doing so triggers a specialization error in Kokkos
2875  using execution_space = typename impl_type::execution_space;
2876  using memory_space = typename impl_type::memory_space;
2878  using local_ordinal_type = typename impl_type::local_ordinal_type;
2879  using size_type = typename impl_type::size_type;
2880  using impl_scalar_type = typename impl_type::impl_scalar_type;
2881  using magnitude_type = typename impl_type::magnitude_type;
2883  using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
2884  using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
2886  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2887  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2888  using size_type_1d_view = typename impl_type::size_type_1d_view;
2889  using size_type_2d_view = typename impl_type::size_type_2d_view;
2890  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2892  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2893  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2894  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2895  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2896  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2897  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2898  using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view;
2899  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
2900  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2901  using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2902  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2903  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2904  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2905  using local_crs_graph_type = typename impl_type::local_crs_graph_type;
2906  using colinds_view = typename local_crs_graph_type::entries_type;
2907 
2908  using internal_vector_type = typename impl_type::internal_vector_type;
2909  static constexpr int vector_length = impl_type::vector_length;
2910  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2911  static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2912  static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2913  // half_vector_length is used for block Jacobi factorization.
2914  // Shared memory requirement is twice as large (per vector lane) as for general tridi factorization, so
2915  // reducing vector length (if possible) keeps the shared requirement constant. This avoids the performance
2916  // cliff of switching from level 0 to level 1 scratch.
2917  static constexpr int half_vector_length = impl_type::half_vector_length;
2918 
2920  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2921  using member_type = typename team_policy_type::member_type;
2922 
2923  private:
2924  // part interface
2925  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2926  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2927  const local_ordinal_type max_partsz;
2928  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
2929  using size_type_1d_view_tpetra = Kokkos::View<size_t *, typename impl_type::node_device_type>;
2930  ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2931  ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2932  ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2933  // block tridiags
2934  const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2935  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2936  const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2937  const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2938  const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2939  const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2940  const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2941  const Unmanaged<size_type_1d_view> diag_offsets;
2942  // shared information
2943  const local_ordinal_type blocksize, blocksize_square;
2944  // diagonal safety
2945  const magnitude_type tiny;
2946  const local_ordinal_type vector_loop_size;
2947 
2948  bool hasBlockCrsMatrix;
2949 
2950  public:
2951  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2952  const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2953  const Teuchos::RCP<const row_matrix_type> &A_,
2954  const Teuchos::RCP<const crs_graph_type> &G_,
2955  const magnitude_type &tiny_)
2956  : // interface
2957  partptr(interf_.partptr)
2958  , lclrow(interf_.lclrow)
2959  , packptr(interf_.packptr)
2960  , packindices_sub(interf_.packindices_sub)
2961  , packptr_sub(interf_.packptr_sub)
2962  , partptr_sub(interf_.partptr_sub)
2963  , part2packrowidx0_sub(interf_.part2packrowidx0_sub)
2964  , packindices_schur(interf_.packindices_schur)
2965  , max_partsz(interf_.max_partsz)
2966  ,
2967  // block tridiags
2968  pack_td_ptr(btdm_.pack_td_ptr)
2969  , flat_td_ptr(btdm_.flat_td_ptr)
2970  , pack_td_ptr_schur(btdm_.pack_td_ptr_schur)
2971  , A_colindsub(btdm_.A_colindsub)
2972  , internal_vector_values((internal_vector_type *)btdm_.values.data(),
2973  btdm_.values.extent(0),
2974  btdm_.values.extent(1),
2975  btdm_.values.extent(2),
2976  vector_length / internal_vector_length)
2977  , internal_vector_values_schur((internal_vector_type *)btdm_.values_schur.data(),
2978  btdm_.values_schur.extent(0),
2979  btdm_.values_schur.extent(1),
2980  btdm_.values_schur.extent(2),
2981  vector_length / internal_vector_length)
2982  , e_internal_vector_values((internal_vector_type *)btdm_.e_values.data(),
2983  btdm_.e_values.extent(0),
2984  btdm_.e_values.extent(1),
2985  btdm_.e_values.extent(2),
2986  btdm_.e_values.extent(3),
2987  vector_length / internal_vector_length)
2988  , scalar_values((btdm_scalar_type *)btdm_.values.data(),
2989  btdm_.values.extent(0),
2990  btdm_.values.extent(1),
2991  btdm_.values.extent(2),
2992  vector_length)
2993  , scalar_values_schur((btdm_scalar_type *)btdm_.values_schur.data(),
2994  btdm_.values_schur.extent(0),
2995  btdm_.values_schur.extent(1),
2996  btdm_.values_schur.extent(2),
2997  vector_length)
2998  , e_scalar_values((btdm_scalar_type *)btdm_.e_values.data(),
2999  btdm_.e_values.extent(0),
3000  btdm_.e_values.extent(1),
3001  btdm_.e_values.extent(2),
3002  btdm_.e_values.extent(3),
3003  vector_length)
3004  , d_inv(btdm_.d_inv)
3005  , diag_offsets(btdm_.diag_offsets)
3006  , blocksize(btdm_.values.extent(1))
3007  , blocksize_square(blocksize * blocksize)
3008  ,
3009  // diagonal weight to avoid zero pivots
3010  tiny(tiny_)
3011  , vector_loop_size(vector_length / internal_vector_length) {
3012  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
3013  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
3014 
3015  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
3016  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
3017 
3018  hasBlockCrsMatrix = !A_bcrs.is_null();
3019 
3020  A_block_rowptr = G_->getLocalGraphDevice().row_map;
3021  if (hasBlockCrsMatrix) {
3022  A_values = const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst();
3023  } else {
3024  A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
3025  A_values = A_crs->getLocalValuesDevice(Tpetra::Access::ReadOnly);
3026  }
3027  }
3028 
3029  private:
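 // extract() gathers the tridiagonal blocks of A into the packed value views,
 // reading block-CRS values directly when available and otherwise gathering
 // from a point CRS matrix. This overload packs all vector lanes of a pack
 // serially; the team-parallel overload below handles one internal-vector
 // slice of lanes per call.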
3030  KOKKOS_INLINE_FUNCTION
3031  void
3032  extract(local_ordinal_type partidx,
3033  local_ordinal_type local_subpartidx,
3034  local_ordinal_type npacks) const {
3035 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3036  printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
3037 #endif
3038  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3039  const size_type kps = pack_td_ptr(partidx, local_subpartidx);
3040  local_ordinal_type kfs[vector_length] = {};
3041  local_ordinal_type ri0[vector_length] = {};
3042  local_ordinal_type nrows[vector_length] = {};
3043 
3044  for (local_ordinal_type vi = 0; vi < npacks; ++vi, ++partidx) {
3045  kfs[vi] = flat_td_ptr(partidx, local_subpartidx);
3046  ri0[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 0);
3047  nrows[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 1) - ri0[vi];
3048 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3049  printf("kfs[%d] = %d;\n", vi, kfs[vi]);
3050  printf("ri0[%d] = %d;\n", vi, ri0[vi]);
3051  printf("nrows[%d] = %d;\n", vi, nrows[vi]);
3052 #endif
3053  }
3054  local_ordinal_type tr_min = 0;
3055  local_ordinal_type tr_max = nrows[0];
3056  if (local_subpartidx % 2 == 1) {
3057  tr_min -= 1;
3058  tr_max += 1;
3059  }
3060 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3061  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3062 #endif
3063  for (local_ordinal_type tr = tr_min, j = 0; tr < tr_max; ++tr) {
3064  for (local_ordinal_type e = 0; e < 3; ++e) {
3065  if (hasBlockCrsMatrix) {
3066  const impl_scalar_type *block[vector_length] = {};
3067  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3068  const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
3069 
3070  block[vi] = &A_values(Aj * blocksize_square);
3071  }
3072  const size_type pi = kps + j;
3073 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3074  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
3075 #endif
3076  ++j;
3077  for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3078  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3079  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
3080  auto &v = internal_vector_values(pi, ii, jj, 0);
3081  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3082  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
3083  }
3084  }
3085  }
3086  } else {
3087  const size_type pi = kps + j;
3088 
3089  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3090  const size_type Aj_c = A_colindsub(kfs[vi] + j);
3091 
3092  for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3093  auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr) * blocksize + ii);
3094 
3095  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3096  scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c * blocksize + jj);
3097  }
3098  }
3099  }
3100  ++j;
3101  }
3102  if (nrows[0] == 1) break;
3103  if (local_subpartidx % 2 == 0) {
3104  if (e == 1 && (tr == 0 || tr + 1 == nrows[0])) break;
3105  for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3106  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr + 1 == nrows[vi])) {
3107  npacks = vi;
3108  break;
3109  }
3110  }
3111  } else {
3112  if (e == 0 && (tr == -1 || tr == nrows[0])) break;
3113  for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3114  if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3115  npacks = vi;
3116  break;
3117  }
3118  }
3119  }
3120  }
3121  }
3122  }
3123 
3124  KOKKOS_INLINE_FUNCTION
3125  void
3126  extract(const member_type &member,
3127  const local_ordinal_type &partidxbeg,
3128  local_ordinal_type local_subpartidx,
3129  const local_ordinal_type &npacks,
3130  const local_ordinal_type &vbeg) const {
3131 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3132  printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3133 #endif
3134  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3135  local_ordinal_type kfs_vals[internal_vector_length] = {};
3136  local_ordinal_type ri0_vals[internal_vector_length] = {};
3137  local_ordinal_type nrows_vals[internal_vector_length] = {};
3138 
3139  const size_type kps = pack_td_ptr(partidxbeg, local_subpartidx);
3140  for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3141  kfs_vals[vi] = flat_td_ptr(partidxbeg + vi, local_subpartidx);
3142  ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 0);
3143  nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 1) - ri0_vals[vi];
3144 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3145  printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3146  printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3147  printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3148 #endif
3149  }
3150 
3151  local_ordinal_type j_vals[internal_vector_length] = {};
3152 
3153  local_ordinal_type tr_min = 0;
3154  local_ordinal_type tr_max = nrows_vals[0];
3155  if (local_subpartidx % 2 == 1) {
3156  tr_min -= 1;
3157  tr_max += 1;
3158  }
3159 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3160  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3161 #endif
3162  for (local_ordinal_type tr = tr_min; tr < tr_max; ++tr) {
3163  for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3164  const local_ordinal_type nrows = nrows_vals[vi];
3165  if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows + 1)) {
3166  auto &j = j_vals[vi];
3167  const local_ordinal_type kfs = kfs_vals[vi];
3168  const local_ordinal_type ri0 = ri0_vals[vi];
3169  local_ordinal_type lbeg, lend;
3170  if (local_subpartidx % 2 == 0) {
3171  lbeg = (tr == tr_min ? 1 : 0);
3172  lend = (tr == nrows - 1 ? 2 : 3);
3173  } else {
3174  lbeg = 0;
3175  lend = 3;
3176  if (tr == tr_min) {
3177  lbeg = 1;
3178  lend = 2;
3179  } else if (tr == nrows) {
3180  lbeg = 0;
3181  lend = 1;
3182  }
3183  }
3184  if (hasBlockCrsMatrix) {
3185  for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3186  const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3187  const impl_scalar_type *block = &A_values(Aj * blocksize_square);
3188  const size_type pi = kps + j;
3189 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3190  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3191 #endif
3192  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3193  [&](const local_ordinal_type &ii) {
3194  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3195  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii, jj, blocksize)]);
3196  }
3197  });
3198  }
3199  } else {
3200  for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3201  const size_type Aj_c = A_colindsub(kfs + j);
3202  const size_type pi = kps + j;
3203  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3204  [&](const local_ordinal_type &ii) {
3205  auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr) * blocksize + ii);
3206  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3207  scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c * blocksize + jj);
3208  }
3209  });
3210  }
3211  }
3212  }
3213  }
3214  }
3215  }
3216 
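 // factorize_subline() computes a block LU (Thomas-style) factorization of
 // one tridiagonal subline in place: LU(d_0), then for each subsequent row
 // u <- L^{-1} u, l <- l U^{-1}, d <- LU(d - l u). `tiny` perturbs the
 // diagonal during LU to guard against exactly zero pivots. A single-row
 // subline is instead inverted outright, for use by block Jacobi.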
3217  template <typename AAViewType,
3218  typename WWViewType>
3219  KOKKOS_INLINE_FUNCTION void
3220  factorize_subline(const member_type &member,
3221  const local_ordinal_type &i0,
3222  const local_ordinal_type &nrows,
3223  const local_ordinal_type &v,
3224  const AAViewType &AA,
3225  const WWViewType &WW) const {
3226  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3227 
3228  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3229  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3230 
3231  // constant
3232 #if KOKKOS_VERSION >= 40799
3233  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3234 #else
3235  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3236 #endif
3237 
3238 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3239  printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3240 #endif
3241 
3242  // subview pattern
3243  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3244  KB::LU<member_type,
3245  default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3246 
3247  if (nrows > 1) {
3248  auto B = A;
3249  auto C = A;
3250  local_ordinal_type i = i0;
3251  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
3252 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3253  printf("tr = %d, i = %d;\n", tr, i);
3254 #endif
3255  B.assign_data(&AA(i + 1, 0, 0, v));
3256  KB::Trsm<member_type,
3257  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3258  default_mode_type, default_algo_type>::invoke(member, one, A, B);
3259  C.assign_data(&AA(i + 2, 0, 0, v));
3260  KB::Trsm<member_type,
3261  KB::Side::Right, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3262  default_mode_type, default_algo_type>::invoke(member, one, A, C);
3263  A.assign_data(&AA(i + 3, 0, 0, v));
3264 
3265  member.team_barrier();
3266  KB::Gemm<member_type,
3267  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3268  default_mode_type, default_algo_type>::invoke(member, -one, C, B, one, A);
3269  KB::LU<member_type,
3270  default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3271  }
3272  } else {
3273  // for block Jacobi, invert the diagonal block in place here
3274  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3275  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, A, W);
3276  KB::SetIdentity<member_type, default_mode_type>::invoke(member, A);
3277  member.team_barrier();
3278  KB::Trsm<member_type,
3279  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3280  default_mode_type, default_algo_type>::invoke(member, one, W, A);
3281  KB::Trsm<member_type,
3282  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3283  default_mode_type, default_algo_type>::invoke(member, one, W, A);
3284  }
3285  }
3286 
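 // Kernel tags, in the order run() launches them: factorize the interior
 // sublines; extract the B/C/D coupling blocks of the Schur system; compute
 // E by solving with the subline factors; form S = D - C E; and factorize
 // the Schur tridiagonal. ExtractAndFactorizeFusedJacobiTag is the separate
 // single-kernel path used by run_fused_jacobi() for block Jacobi.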
3287  public:
3288  struct ExtractAndFactorizeSubLineTag {};
3289  struct ExtractAndFactorizeFusedJacobiTag {};
3290  struct ExtractBCDTag {};
3291  struct ComputeETag {};
3292  struct ComputeSchurTag {};
3293  struct FactorizeSchurTag {};
3294 
3295  KOKKOS_INLINE_FUNCTION
3296  void
3297  operator()(const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3298  // btdm is packed and sorted starting from the largest part
3299  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3300 
3301  const local_ordinal_type subpartidx = packptr_sub(packidx);
3302  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3303  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3304  const local_ordinal_type partidx = subpartidx % n_parts;
3305 
3306  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3307  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3308  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3309 
3310  internal_vector_scratch_type_3d_view
3311  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3312 
3313 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3314  printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3315  printf("vector_loop_size = %d\n", vector_loop_size);
3316 #endif
3317 
3318  if (vector_loop_size == 1) {
3319  extract(partidx, local_subpartidx, npacks);
3320  factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3321  } else {
3322  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3323  [&](const local_ordinal_type &v) {
3324  const local_ordinal_type vbeg = v * internal_vector_length;
3325 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3326  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3327 #endif
3328  if (vbeg < npacks)
3329  extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3330  // this is not safe if the vector loop size differs from the vector length of
3331  // the team policy; we always ensure they match when constructing the team policy
3332  member.team_barrier();
3333  factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3334  });
3335  }
3336  }
3337 
3338  KOKKOS_INLINE_FUNCTION
3339  void
3340  operator()(const ExtractAndFactorizeFusedJacobiTag &, const member_type &member) const {
3341  using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3342  using default_mode_type = typename default_mode_and_algo_type::mode_type;
3343  using default_algo_type = typename default_mode_and_algo_type::algo_type;
3344  // When fused block Jacobi can be used, the mapping between local rows and parts is trivial (i <-> i)
3345  // We can simply pull the diagonal entry from A into d_inv
3346  btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3347  btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3348 #if KOKKOS_VERSION >= 40799
3349  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3350 #else
3351  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3352 #endif
3353  const local_ordinal_type nrows = lclrow.extent(0);
3354  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, half_vector_length),
3355  [&](const local_ordinal_type &v) {
3356  local_ordinal_type row = member.league_rank() * half_vector_length + v;
3357  // diag_offsets(row) gives the offset of this row's diagonal block within A_values
3358  auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3359  auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3360  if (row < nrows) {
3361  // View the diagonal block of A in row as 2D row-major
3362  const impl_scalar_type *A_diag = A_values.data() + diag_offsets(row);
3363  // Copy the diag into scratch slice W1
3364  // (copying elements directly is better than KokkosBatched copy)
3365  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3366  [&](int i) {
3367  W1.data()[i] = A_diag[i];
3368  });
3369  // and set W2 to identity in preparation to invert with 2 x Trsm
3370  KB::SetIdentity<member_type, default_mode_type>::invoke(member, W2);
3371  } else {
3372  // if this vector lane has no block to invert, then set W1 to identity
3373  // so that LU still has a matrix to work on. LU uses team barriers so
3374  // having some lanes run it and some not will deadlock.
3375  KB::SetIdentity<member_type, default_mode_type>::invoke(member, W1);
3376  }
3377  member.team_barrier();
3378  // LU factorize in-place
3379  KB::LU<member_type, default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, W1, tiny);
3380  member.team_barrier();
3381  KB::Trsm<member_type,
3382  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3383  default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3384  KB::Trsm<member_type,
3385  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3386  default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3387  member.team_barrier();
3388  if (row < nrows) {
3389  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3390  [&](int i) {
3391  auto d_inv_block = &d_inv(row, 0, 0);
3392  d_inv_block[i] = W2.data()[i];
3393  });
3394  }
3395  });
3396  }
3397 
3398  KOKKOS_INLINE_FUNCTION
3399  void
3400  operator()(const ExtractBCDTag &, const member_type &member) const {
3401  // btdm is packed and sorted starting from the largest part
3402  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3403  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3404  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3405 
3406  const local_ordinal_type subpartidx = packptr_sub(packidx);
3407  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3408  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3409  const local_ordinal_type partidx = subpartidx % n_parts;
3410 
3411  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3412  // const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3413  // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3414 
3415  if (vector_loop_size == 1) {
3416  extract(partidx, local_subpartidx, npacks);
3417  } else {
3418  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3419  [&](const local_ordinal_type &v) {
3420  const local_ordinal_type vbeg = v * internal_vector_length;
3421 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3422  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3423  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3424 #endif
3425  if (vbeg < npacks)
3426  extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3427  });
3428  }
3429 
3430  member.team_barrier();
3431 
3432  const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3433  const size_type kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 1;
3434 
3435  const local_ordinal_type r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3436  const local_ordinal_type r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3437 
3438 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3439  printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3440 #endif
3441 
3442  // Need to copy D to e_internal_vector_values.
3443  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3444  Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3445 
3446  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3447  Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3448  }
3449 
3450  KOKKOS_INLINE_FUNCTION
3451  void
3452  operator()(const ComputeETag &, const member_type &member) const {
3453  // btdm is packed and sorted starting from the largest part
3454  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3455 
3456  const local_ordinal_type subpartidx = packptr_sub(packidx);
3457  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3458  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3459  const local_ordinal_type partidx = subpartidx % n_parts;
3460 
3461  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3462  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3463  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
3464  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3465  const local_ordinal_type num_vectors = blocksize;
3466 
3467  (void)npacks;
3468 
3469  internal_vector_scratch_type_3d_view
3470  WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3471  if (local_subpartidx == 0) {
3472  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3473  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3474  });
3475  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
3476  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3477  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3478  });
3479  } else {
3480  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3481  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3482  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3483  });
3484  }
3485  }
3486 
3487  KOKKOS_INLINE_FUNCTION
3488  void
3489  operator()(const ComputeSchurTag &, const member_type &member) const {
3490  // btdm is packed and sorted starting from the largest part
3491  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3492  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3493  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3494 
3495  const local_ordinal_type subpartidx = packptr_sub(packidx);
3496  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3497  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3498  const local_ordinal_type partidx = subpartidx % n_parts;
3499 
3500  // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3501  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3502  // const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3503  // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3504 
3505  // Compute S = D - C E
3506 
3507  const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
3508  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
3509  const local_ordinal_type i0_offset = i0 + 2;
3510 
3511  for (local_ordinal_type i = 0; i < 4; ++i) { // pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
3512  copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3513  Kokkos::subview(internal_vector_values, i0_offset + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3514  }
3515 
3516  member.team_barrier();
3517 
3518 #if KOKKOS_VERSION >= 40799
3519  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3520 #else
3521  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3522 #endif
3523 
3524  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx) + 1;
3525  const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 2;
3526 
3527  const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3528  const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3529 
3530  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3531 
3532  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3533  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3534 
3535  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3536  for (size_type i = 0; i < pack_td_ptr_schur(partidx, local_subpartidx_schur + 1) - pack_td_ptr_schur(partidx, local_subpartidx_schur); ++i) {
3537  local_ordinal_type e_r, e_c, c_kps;
3538 
3539  if (local_subpartidx_schur == 0) {
3540  if (i == 0) {
3541  e_r = e_r1;
3542  e_c = 0;
3543  c_kps = c_kps1;
3544  } else if (i == 3) {
3545  e_r = e_r2;
3546  e_c = 1;
3547  c_kps = c_kps2;
3548  } else if (i == 4) {
3549  e_r = e_r2;
3550  e_c = 0;
3551  c_kps = c_kps2;
3552  } else {
3553  continue;
3554  }
3555  } else {
3556  if (i == 0) {
3557  e_r = e_r1;
3558  e_c = 1;
3559  c_kps = c_kps1;
3560  } else if (i == 1) {
3561  e_r = e_r1;
3562  e_c = 0;
3563  c_kps = c_kps1;
3564  } else if (i == 4) {
3565  e_r = e_r2;
3566  e_c = 1;
3567  c_kps = c_kps2;
3568  } else if (i == 5) {
3569  e_r = e_r2;
3570  e_c = 0;
3571  c_kps = c_kps2;
3572  } else {
3573  continue;
3574  }
3575  }
3576 
3577  auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx, local_subpartidx_schur) + i, Kokkos::ALL(), Kokkos::ALL(), v);
3578  auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3579  auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3580  KB::Gemm<member_type,
3581  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3582  default_mode_type, default_algo_type>::invoke(member, -one, C, E, one, S);
3583  }
3584  });
3585  }
3586 
3587  KOKKOS_INLINE_FUNCTION
3588  void
3589  operator()(const FactorizeSchurTag &, const member_type &member) const {
3590  const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3591 
3592  const local_ordinal_type subpartidx = packptr_sub(packidx);
3593 
3594  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3595  const local_ordinal_type partidx = subpartidx % n_parts;
3596 
3597  const local_ordinal_type i0 = pack_td_ptr_schur(partidx, 0);
3598  const local_ordinal_type nrows = 2 * (pack_td_ptr_schur.extent(1) - 1);
3599 
3600  internal_vector_scratch_type_3d_view
3601  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3602 
3603 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3604  printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3605 #endif
3606 
3607  if (vector_loop_size == 1) {
3608  factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3609  } else {
3610  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3611  [&](const local_ordinal_type &v) {
3612  factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3613  });
3614  }
3615  }
3616 
3617  void run() {
3618  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3619  const local_ordinal_type team_size =
3620  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3621  recommended_team_size(blocksize, vector_length, internal_vector_length);
3622  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3623  shmem_size(blocksize, blocksize, vector_loop_size);
3624 
3625  {
3626 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3627  printf("Start ExtractAndFactorizeSubLineTag\n");
3628 #endif
3629  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3630  Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeSubLineTag>
3631  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3632 
3633  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3634  writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3635 
3636  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3637  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3638  policy, *this);
3639  execution_space().fence();
3640 
3641  writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3642 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3643  printf("End ExtractAndFactorizeSubLineTag\n");
3644 #endif
3645  }
3646 
3647  if (packindices_schur.extent(1) > 0) {
3648  {
3649 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3650  printf("Start ExtractBCDTag\n");
3651 #endif
3652 #if KOKKOS_VERSION >= 40799
3653  Kokkos::deep_copy(e_scalar_values, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3654 #else
3655  Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3656 #endif
3657 #if KOKKOS_VERSION >= 40799
3658  Kokkos::deep_copy(scalar_values_schur, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3659 #else
3660  Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3661 #endif
3662 
3663  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3664 
3665  {
3666  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3667  Kokkos::TeamPolicy<execution_space, ExtractBCDTag>
3668  policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3669 
3670  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3671  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3672  policy, *this);
3673  execution_space().fence();
3674  }
3675 
3676 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3677  printf("End ExtractBCDTag\n");
3678 #endif
3679  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3680 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3681  printf("Start ComputeETag\n");
3682 #endif
3683  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3684  {
3685  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3686  Kokkos::TeamPolicy<execution_space, ComputeETag>
3687  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3688 
3689  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3690  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3691  policy, *this);
3692  execution_space().fence();
3693  }
3694  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3695 
3696 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3697  printf("End ComputeETag\n");
3698 #endif
3699  }
3700 
3701  {
3702 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3703  printf("Start ComputeSchurTag\n");
3704 #endif
3705  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3706  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3707  Kokkos::TeamPolicy<execution_space, ComputeSchurTag>
3708  policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3709 
3710  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3711  policy, *this);
3712  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3713  execution_space().fence();
3714 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3715  printf("End ComputeSchurTag\n");
3716 #endif
3717  }
3718 
3719  {
3720 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3721  printf("Start FactorizeSchurTag\n");
3722 #endif
3723  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3724  Kokkos::TeamPolicy<execution_space, FactorizeSchurTag>
3725  policy(packindices_schur.extent(0), team_size, vector_loop_size);
3726  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3727  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3728  policy, *this);
3729  execution_space().fence();
3730  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3731 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3732  printf("End FactorizeSchurTag\n");
3733 #endif
3734  }
3735  }
3736 
3737  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3738  }
3739 
3740  void run_fused_jacobi() {
3741  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3742  const local_ordinal_type team_size =
3743  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3744  recommended_team_size(blocksize, half_vector_length, 1);
3745  const local_ordinal_type per_team_scratch =
3746  btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3747  {
3748  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3749  Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3750  policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3751 
3752  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3753  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3754  policy, *this);
3755  }
3756  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3757  }
3758 };
3759 
3763 template <typename MatrixType>
3764 void performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3765  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3766  const BlockHelperDetails::PartInterface<MatrixType> &interf,
3767  BlockTridiags<MatrixType> &btdm,
3768  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3769  bool use_fused_jacobi) {
3770  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3771  using execution_space = typename impl_type::execution_space;
3772  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3773  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3774  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3775 
3776  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3777 
3778  int blocksize = btdm.values.extent(1);
3779  // Both Kokkos policy vector length and SIMD type vector length are hardcoded in KokkosBatched.
3780  // For large block sizes, have to fall back to level 1 scratch.
3781  int scratch_required;
3782  if (!use_fused_jacobi) {
3783  // General path scratch requirement
3784  scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3785  } else {
3786  // Block Jacobi scratch requirement: measured in scalars, and uses twice as much (in bytes) per vector lane as the general path.
3787  scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3788  }
3789 
3790  int max_scratch = team_policy_type::scratch_size_max(0);
3791 
3792  if (scratch_required < max_scratch) {
3793  // Can use level 0 scratch
3794  ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3795  if (!use_fused_jacobi)
3796  function.run();
3797  else
3798  function.run_fused_jacobi();
3799  } else {
3800  // Not enough level 0 scratch, so fall back to level 1
3801  ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3802  if (!use_fused_jacobi)
3803  function.run();
3804  else
3805  function.run_fused_jacobi();
3806  }
3807  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3808 }
3809 
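As a self-contained illustration of the scratch-level fallback implemented by performNumericPhase above, the sketch below (a hypothetical helper, not an Ifpack2 API) compares the workspace bytes reported by shmem_size against the level-0 capacity of the team policy and falls back to level 1 when the workspace does not fit:

#include <Kokkos_Core.hpp>

// Hypothetical helper, not part of Ifpack2: pick the scratch level for a
// blocksize x blocksize x lanes team workspace.
template <typename ExecSpace>
int pick_scratch_level(const int blocksize, const int lanes) {
  using policy_type  = Kokkos::TeamPolicy<ExecSpace>;
  using scratch_view = Kokkos::View<double ***,
                                    typename ExecSpace::scratch_memory_space,
                                    Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
  const size_t required = scratch_view::shmem_size(blocksize, blocksize, lanes);
  // Level 0 (e.g. CUDA shared memory) is fast but small; level 1 is larger.
  return required < static_cast<size_t>(policy_type::scratch_size_max(0)) ? 0 : 1;
}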
3813 template <typename MatrixType>
3814 struct MultiVectorConverter {
3815  public:
3816  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3817  using execution_space = typename impl_type::execution_space;
3818  using memory_space = typename impl_type::memory_space;
3819 
3820  using local_ordinal_type = typename impl_type::local_ordinal_type;
3821  using impl_scalar_type = typename impl_type::impl_scalar_type;
3822  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3823  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3824  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3825  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3826  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3827  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3828  static constexpr int vector_length = impl_type::vector_length;
3829 
3830  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3831 
3832  private:
3833  // part interface
3834  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3835  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3836  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3837  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3838  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3839  const local_ordinal_type blocksize;
3840  const local_ordinal_type num_vectors;
3841 
3842  // packed multivector output (or input)
3843  vector_type_3d_view packed_multivector;
3844  const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3845 
3846  template <typename TagType>
3847  KOKKOS_INLINE_FUNCTION void copy_multivectors(const local_ordinal_type &j,
3848  const local_ordinal_type &vi,
3849  const local_ordinal_type &pri,
3850  const local_ordinal_type &ri0) const {
3851  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3852  for (local_ordinal_type i = 0; i < blocksize; ++i)
3853  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3854  }
3855 
3856  public:
3857  MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3858  const vector_type_3d_view &pmv)
3859  : partptr(interf.partptr)
3860  , packptr(interf.packptr)
3861  , part2packrowidx0(interf.part2packrowidx0)
3862  , part2rowidx0(interf.part2rowidx0)
3863  , lclrow(interf.lclrow)
3864  , blocksize(pmv.extent(1))
3865  , num_vectors(pmv.extent(2))
3866  , packed_multivector(pmv) {}
3867 
3868  // TODO:: modify this routine similar to the team level functions
3869  KOKKOS_INLINE_FUNCTION
3870  void
3871  operator()(const local_ordinal_type &packidx) const {
3872  local_ordinal_type partidx = packptr(packidx);
3873  local_ordinal_type npacks = packptr(packidx + 1) - partidx;
3874  const local_ordinal_type pri0 = part2packrowidx0(partidx);
3875 
3876  local_ordinal_type ri0[vector_length] = {};
3877  local_ordinal_type nrows[vector_length] = {};
3878  for (local_ordinal_type v = 0; v < npacks; ++v, ++partidx) {
3879  ri0[v] = part2rowidx0(partidx);
3880  nrows[v] = part2rowidx0(partidx + 1) - ri0[v];
3881  }
3882  for (local_ordinal_type j = 0; j < nrows[0]; ++j) {
3883  local_ordinal_type cnt = 1;
3884  for (; cnt < npacks && j != nrows[cnt]; ++cnt)
3885  ;
3886  npacks = cnt;
3887  const local_ordinal_type pri = pri0 + j;
3888  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3889  for (local_ordinal_type i = 0; i < blocksize; ++i)
3890  for (local_ordinal_type v = 0; v < npacks; ++v)
3891  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0[v] + j) + i, col));
3892  }
3893  }
3894 
3895  KOKKOS_INLINE_FUNCTION
3896  void
3897  operator()(const member_type &member) const {
3898  const local_ordinal_type packidx = member.league_rank();
3899  const local_ordinal_type partidx_begin = packptr(packidx);
3900  const local_ordinal_type npacks = packptr(packidx + 1) - partidx_begin;
3901  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3902  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3903  const local_ordinal_type partidx = partidx_begin + v;
3904  const local_ordinal_type ri0 = part2rowidx0(partidx);
3905  const local_ordinal_type nrows = part2rowidx0(partidx + 1) - ri0;
3906 
3907  if (nrows == 1) {
3908  const local_ordinal_type pri = pri0;
3909  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
3910  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3911  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0) + i, col));
3912  });
3913  }
3914  } else {
3915  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3916  const local_ordinal_type pri = pri0 + j;
3917  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3918  for (local_ordinal_type i = 0; i < blocksize; ++i)
3919  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3920  });
3921  }
3922  });
3923  }
3924 
3925  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3926  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3927  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3928 
3929  scalar_multivector = scalar_multivector_;
3930  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3931  const local_ordinal_type vl = vector_length;
3932  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3933  Kokkos::parallel_for("MultiVectorConverter::TeamPolicy", policy, *this);
3934  } else {
3935  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3936  Kokkos::parallel_for("MultiVectorConverter::RangePolicy", policy, *this);
3937  }
3938  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3939  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3940  }
3941 };
3942 
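To make the packed (AoSoA) layout concrete, here is a toy-sized, self-contained sketch (not Ifpack2 code) of the index mapping MultiVectorConverter applies: scalar row i of block row j of part v lands in SIMD lane v of packed block row pri0 + j.

#include <cstdio>
#include <vector>

int main() {
  const int vector_length = 2;             // SIMD width of one pack
  const int blocksize     = 3;             // scalar rows per block row
  const int nrows         = 2;             // block rows in each of the two parts
  const int lclrow[]      = {0, 2, 1, 3};  // part 0 owns mesh rows {0,2}; part 1 owns {1,3}
  const int ri0[]         = {0, 2};        // first row index of each part

  std::vector<double> flat(4 * blocksize);  // flat, Tpetra-style storage
  for (std::size_t k = 0; k < flat.size(); ++k) flat[k] = static_cast<double>(k);

  double packed[nrows][blocksize][vector_length];  // packed(pri, i)[v]
  for (int v = 0; v < vector_length; ++v)
    for (int j = 0; j < nrows; ++j)
      for (int i = 0; i < blocksize; ++i)
        packed[j][i][v] = flat[blocksize * lclrow[ri0[v] + j] + i];

  // First scalar row of part 1 (mesh row 1) sits in lane 1 of packed row 0.
  std::printf("packed(0,0)[1] = %g\n", packed[0][0][1]);  // prints 3
  return 0;
}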
3946 
3947 template <>
3948 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3949  typedef KB::Mode::Serial mode_type;
3950  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3951 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3952  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3953 #else
3954  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3955 #endif
3956  static int recommended_team_size(const int /* blksize */,
3957  const int /* vector_length */,
3958  const int /* internal_vector_length */) {
3959  return 1;
3960  }
3961 };
3962 
3963 #if defined(KOKKOS_ENABLE_CUDA)
3964 static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
3965  const int vector_length,
3966  const int internal_vector_length) {
3967  const int vector_size = vector_length / internal_vector_length;
3968  int total_team_size(0);
3969  if (blksize <= 5)
3970  total_team_size = 32;
3971  else if (blksize <= 9)
3972  total_team_size = 32; // 64
3973  else if (blksize <= 12)
3974  total_team_size = 96;
3975  else if (blksize <= 16)
3976  total_team_size = 128;
3977  else if (blksize <= 20)
3978  total_team_size = 160;
3979  else
3980  total_team_size = 160;
3981  return total_team_size / vector_size;
3982 }
3983 
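A worked instance of the mapping above: blksize = 10 falls in the blksize <= 12 branch, so total_team_size = 96; with vector_length = 8 and internal_vector_length = 4, vector_size = 8 / 4 = 2, and the recommended team size is 96 / 2 = 48 threads per team.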
3984 template <>
3985 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3986  typedef KB::Mode::Team mode_type;
3987  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3988  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3989  static int recommended_team_size(const int blksize,
3990  const int vector_length,
3991  const int internal_vector_length) {
3992  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3993  }
3994 };
3995 template <>
3996 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3997  typedef KB::Mode::Team mode_type;
3998  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3999  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4000  static int recommended_team_size(const int blksize,
4001  const int vector_length,
4002  const int internal_vector_length) {
4003  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
4004  }
4005 };
4006 #endif
4007 
4008 #if defined(KOKKOS_ENABLE_HIP)
4009 static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
4010  const int vector_length,
4011  const int internal_vector_length) {
4012  const int vector_size = vector_length / internal_vector_length;
4013  int total_team_size(0);
4014  if (blksize <= 5)
4015  total_team_size = 32;
4016  else if (blksize <= 9)
4017  total_team_size = 32; // 64
4018  else if (blksize <= 12)
4019  total_team_size = 96;
4020  else if (blksize <= 16)
4021  total_team_size = 128;
4022  else if (blksize <= 20)
4023  total_team_size = 160;
4024  else
4025  total_team_size = 160;
4026  return total_team_size / vector_size;
4027 }
4028 
4029 template <>
4030 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
4031  typedef KB::Mode::Team mode_type;
4032  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4033  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4034  static int recommended_team_size(const int blksize,
4035  const int vector_length,
4036  const int internal_vector_length) {
4037  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
4038  }
4039 };
4040 template <>
4041 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
4042  typedef KB::Mode::Team mode_type;
4043  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4044  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4045  static int recommended_team_size(const int blksize,
4046  const int vector_length,
4047  const int internal_vector_length) {
4048  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
4049  }
4050 };
4051 #endif
4052 
4053 #if defined(KOKKOS_ENABLE_SYCL)
4054 static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
4055  const int vector_length,
4056  const int internal_vector_length) {
4057  const int vector_size = vector_length / internal_vector_length;
4058  int total_team_size(0);
4059  if (blksize <= 5)
4060  total_team_size = 32;
4061  else if (blksize <= 9)
4062  total_team_size = 32; // 64
4063  else if (blksize <= 12)
4064  total_team_size = 96;
4065  else if (blksize <= 16)
4066  total_team_size = 128;
4067  else if (blksize <= 20)
4068  total_team_size = 160;
4069  else
4070  total_team_size = 160;
4071  return total_team_size / vector_size;
4072 }
4073 
4074 template <>
4075 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
4076  typedef KB::Mode::Team mode_type;
4077  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4078  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4079  static int recommended_team_size(const int blksize,
4080  const int vector_length,
4081  const int internal_vector_length) {
4082  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4083  }
4084 };
4085 template <>
4086 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
4087  typedef KB::Mode::Team mode_type;
4088  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4089  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4090  static int recommended_team_size(const int blksize,
4091  const int vector_length,
4092  const int internal_vector_length) {
4093  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4094  }
4095 };
4096 #endif
4097 
4098 template <typename MatrixType>
4099 struct SolveTridiags {
4100  public:
4101  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4102  using execution_space = typename impl_type::execution_space;
4103 
4104  using local_ordinal_type = typename impl_type::local_ordinal_type;
4105  using size_type = typename impl_type::size_type;
4106  using impl_scalar_type = typename impl_type::impl_scalar_type;
4107  using magnitude_type = typename impl_type::magnitude_type;
4108  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
4109  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
4111  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4112  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
4113  using size_type_2d_view = typename impl_type::size_type_2d_view;
4115  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4116  using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
4117  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
4118  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
4119  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
4120 
4121  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4122 
4123  using internal_vector_type = typename impl_type::internal_vector_type;
4124  static constexpr int vector_length = impl_type::vector_length;
4125  static constexpr int internal_vector_length = impl_type::internal_vector_length;
4126 
4128  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4129  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
4130 
4132  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4133  using member_type = typename team_policy_type::member_type;
4134 
4135  private:
4136  // part interface
4137  local_ordinal_type n_subparts_per_part;
4138  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4139  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4140  const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4141  const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4142  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4143  const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4144  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4145  const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4146 
4147  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4148  const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4149 
4150  // block tridiags
4151  const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4152 
4153  // block tridiags values
4154  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4155  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4156  const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4157 
4158  const Unmanaged<internal_vector_type_3d_view> X_internal_vector_values_schur;
4159 
4160  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4161  const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4162 
4163  const local_ordinal_type vector_loop_size;
4164 
4165  // copy to multivectors : damping factor and Y_scalar_multivector
4166  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
4167 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4168  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4169 #else
4170  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4171 #endif
4172  const impl_scalar_type df;
4173  const bool compute_diff;
4174  // Schur solve only supports solving one vector at a time (currently).
4175  // If solving on a multivector, we loop over each vec in the solve.
4176  // This is the current vec being solved.
4177  local_ordinal_type active_schur_solve_vec;
4178 
4179  public:
4180  SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
4181  const BlockTridiags<MatrixType> &btdm,
4182  const vector_type_3d_view &pmv,
4183  const impl_scalar_type damping_factor,
4184  const bool is_norm_manager_active)
4185  : // interface
4186  n_subparts_per_part(interf.n_subparts_per_part)
4187  , partptr(interf.partptr)
4188  , packptr(interf.packptr)
4189  , packindices_sub(interf.packindices_sub)
4190  , packindices_schur(interf.packindices_schur)
4191  , part2packrowidx0(interf.part2packrowidx0)
4192  , part2packrowidx0_sub(interf.part2packrowidx0_sub)
4193  , lclrow(interf.lclrow)
4194  , packptr_sub(interf.packptr_sub)
4195  , partptr_sub(interf.partptr_sub)
4196  , pack_td_ptr_schur(btdm.pack_td_ptr_schur)
4197  ,
4198  // block tridiags and multivector
4199  pack_td_ptr(btdm.pack_td_ptr)
4200  , D_internal_vector_values((internal_vector_type *)btdm.values.data(),
4201  btdm.values.extent(0),
4202  btdm.values.extent(1),
4203  btdm.values.extent(2),
4204  vector_length / internal_vector_length)
4205  , X_internal_vector_values((internal_vector_type *)pmv.data(),
4206  pmv.extent(0),
4207  pmv.extent(1),
4208  pmv.extent(2),
4209  vector_length / internal_vector_length)
4210  , X_internal_scalar_values((btdm_scalar_type *)pmv.data(),
4211  pmv.extent(0),
4212  pmv.extent(1),
4213  pmv.extent(2),
4214  vector_length)
4215  , X_internal_vector_values_schur(btdm.X_internal_vector_values_schur)
4216  , D_internal_vector_values_schur((internal_vector_type *)btdm.values_schur.data(),
4217  btdm.values_schur.extent(0),
4218  btdm.values_schur.extent(1),
4219  btdm.values_schur.extent(2),
4220  vector_length / internal_vector_length)
4221  , e_internal_vector_values((internal_vector_type *)btdm.e_values.data(),
4222  btdm.e_values.extent(0),
4223  btdm.e_values.extent(1),
4224  btdm.e_values.extent(2),
4225  btdm.e_values.extent(3),
4226  vector_length / internal_vector_length)
4227  , vector_loop_size(vector_length / internal_vector_length)
4228  , Y_scalar_multivector()
4229  , Z_scalar_vector()
4230  , df(damping_factor)
4231  , compute_diff(is_norm_manager_active)
4232  , active_schur_solve_vec(0) {}
4233 
4234  public:
4236  KOKKOS_INLINE_FUNCTION
4237  void
4238  copyToFlatMultiVector(const member_type &member,
4239  const local_ordinal_type partidxbeg, // partidx for v = 0
4240  const local_ordinal_type npacks,
4241  const local_ordinal_type pri0,
4242  const local_ordinal_type v, // index with a loop of vector_loop_size
4243  const local_ordinal_type blocksize,
4244  const local_ordinal_type num_vectors) const {
4245  const local_ordinal_type vbeg = v * internal_vector_length;
4246  if (vbeg < npacks) {
4247  local_ordinal_type ri0_vals[internal_vector_length] = {};
4248  local_ordinal_type nrows_vals[internal_vector_length] = {};
4249  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4250  const local_ordinal_type partidx = partidxbeg + vv;
4251  ri0_vals[vi] = partptr(partidx);
4252  nrows_vals[vi] = partptr(partidx + 1) - ri0_vals[vi];
4253  }
4254 
4255  impl_scalar_type z_partial_sum(0);
4256  if (nrows_vals[0] == 1) {
4257  const local_ordinal_type j = 0, pri = pri0;
4258  {
4259  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4260  const local_ordinal_type ri0 = ri0_vals[vi];
4261  const local_ordinal_type nrows = nrows_vals[vi];
4262  if (j < nrows) {
4263  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
4264  [&](const local_ordinal_type &i) {
4265  const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4266  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4267  impl_scalar_type &y = Y_scalar_multivector(row, col);
4268  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4269  y += df * yd;
4270 
4271  { // if (compute_diff) {
4272 #if KOKKOS_VERSION >= 40799
4273  const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4274 #else
4275  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4276 #endif
4277  z_partial_sum += yd_abs * yd_abs;
4278  }
4279  }
4280  });
4281  }
4282  }
4283  }
4284  } else {
4285  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows_vals[0]),
4286  [&](const local_ordinal_type &j) {
4287  const local_ordinal_type pri = pri0 + j;
4288  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4289  const local_ordinal_type ri0 = ri0_vals[vi];
4290  const local_ordinal_type nrows = nrows_vals[vi];
4291  if (j < nrows) {
4292  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4293  for (local_ordinal_type i = 0; i < blocksize; ++i) {
4294  const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4295  impl_scalar_type &y = Y_scalar_multivector(row, col);
4296  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4297  y += df * yd;
4298 
4299  { // if (compute_diff) {
4300 #if KOKKOS_VERSION >= 40799
4301  const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4302 #else
4303  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4304 #endif
4305  z_partial_sum += yd_abs * yd_abs;
4306  }
4307  }
4308  }
4309  }
4310  }
4311  });
4312  }
4313  // if (compute_diff)
4314  Z_scalar_vector(member.league_rank()) += z_partial_sum;
4315  }
4316  }
4317 
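Per entry, copyToFlatMultiVector applies the damped correction and accumulates the squared difference used by the norm-based stopping test; restating the inner statements with \(\omega = \texttt{df}\) and \(x\) the packed solve result:

\[ y \leftarrow y + \omega\,(x - y), \qquad z \leftarrow z + |x - y|^2 \ \text{(summed over rows and columns)}, \]

so Z_scalar_vector ends up holding one partial squared 2-norm of the correction per team.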
4321  template <typename WWViewType>
4322  KOKKOS_INLINE_FUNCTION void
4323  solveSingleVector(const member_type &member,
4324  const local_ordinal_type &blocksize,
4325  const local_ordinal_type &i0,
4326  const local_ordinal_type &r0,
4327  const local_ordinal_type &nrows,
4328  const local_ordinal_type &v,
4329  const WWViewType &WW) const {
4330  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4331 
4332  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4333  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4334 
4335  // base pointers
4336  auto A = D_internal_vector_values.data();
4337  auto X = X_internal_vector_values.data();
4338 
4339  // constant
4340 #if KOKKOS_VERSION >= 40799
4341  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4342 #else
4343  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4344 #endif
4345 #if KOKKOS_VERSION >= 40799
4346  const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4347 #else
4348  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4349 #endif
4350  // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4351 
4352  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4353  const local_ordinal_type astep = D_internal_vector_values.stride(0);
4354  const local_ordinal_type as0 = D_internal_vector_values.stride(1); // blocksize*vector_length;
4355  const local_ordinal_type as1 = D_internal_vector_values.stride(2); // vector_length;
4356  const local_ordinal_type xstep = X_internal_vector_values.stride(0);
4357  const local_ordinal_type xs0 = X_internal_vector_values.stride(1); // vector_length;
4358 
4359  // move to starting point
4360  A += i0 * astep + v;
4361  X += r0 * xstep + v;
4362 
4363  // for (local_ordinal_type col=0;col<num_vectors;++col)
4364  if (nrows > 1) {
4365  // solve Lx = x
4366  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4367  member,
4368  KB::Diag::Unit,
4369  blocksize, blocksize,
4370  one,
4371  A, as0, as1,
4372  X, xs0);
4373 
4374  for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
4375  member.team_barrier();
4376  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4377  member,
4378  blocksize, blocksize,
4379  -one,
4380  A + 2 * astep, as0, as1,
4381  X, xs0,
4382  one,
4383  X + 1 * xstep, xs0);
4384  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4385  member,
4386  KB::Diag::Unit,
4387  blocksize, blocksize,
4388  one,
4389  A + 3 * astep, as0, as1,
4390  X + 1 * xstep, xs0);
4391 
4392  A += 3 * astep;
4393  X += 1 * xstep;
4394  }
4395 
4396  // solve Ux = x
4397  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4398  member,
4399  KB::Diag::NonUnit,
4400  blocksize, blocksize,
4401  one,
4402  A, as0, as1,
4403  X, xs0);
4404 
4405  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4406  A -= 3 * astep;
4407  member.team_barrier();
4408  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4409  member,
4410  blocksize, blocksize,
4411  -one,
4412  A + 1 * astep, as0, as1,
4413  X, xs0,
4414  one,
4415  X - 1 * xstep, xs0);
4416  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4417  member,
4418  KB::Diag::NonUnit,
4419  blocksize, blocksize,
4420  one,
4421  A, as0, as1,
4422  X - 1 * xstep, xs0);
4423  X -= 1 * xstep;
4424  }
4425  // for multiple rhs
4426  // X += xs1;
4427  } else {
4428  const local_ordinal_type ws0 = WW.stride(0);
4429  auto W = WW.data() + v;
4430  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
4431  member, blocksize, X, xs0, W, ws0);
4432  member.team_barrier();
4433  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4434  member,
4435  blocksize, blocksize,
4436  one,
4437  A, as0, as1,
4438  W, xs0,
4439  zero,
4440  X, xs0);
4441  }
4442  }
4443 
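The two loops of solveSingleVector are block forward and backward substitution over the factored line; schematically (restated from the code, with unit-lower factors L and upper factors U stored interleaved in D_internal_vector_values):

\[ x_1 \leftarrow L_{11}^{-1} b_1, \qquad x_k \leftarrow L_{kk}^{-1}\bigl(b_k - L_{k,k-1}\, x_{k-1}\bigr), \quad k = 2, \dots, n, \]
\[ x_n \leftarrow U_{nn}^{-1} x_n, \qquad x_k \leftarrow U_{kk}^{-1}\bigl(x_k - U_{k,k+1}\, x_{k+1}\bigr), \quad k = n-1, \dots, 1. \]

The nrows == 1 branch instead multiplies by the stored explicit inverse of the single diagonal block.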
4444  template <typename WWViewType>
4445  KOKKOS_INLINE_FUNCTION void
4446  solveMultiVector(const member_type &member,
4447  const local_ordinal_type & /* blocksize */,
4448  const local_ordinal_type &i0,
4449  const local_ordinal_type &r0,
4450  const local_ordinal_type &nrows,
4451  const local_ordinal_type &v,
4452  const WWViewType &WW) const {
4453  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4454 
4455  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4456  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4457 
4458  // constant
4459 #if KOKKOS_VERSION >= 40799
4460  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4461 #else
4462  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4463 #endif
4464 #if KOKKOS_VERSION >= 40799
4465  const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4466 #else
4467  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4468 #endif
4469 
4470  // subview pattern
4471  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4472  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4473  auto X2 = X1;
4474 
4475  local_ordinal_type i = i0, r = r0;
4476 
4477  if (nrows > 1) {
4478  // solve Lx = x
4479  KB::Trsm<member_type,
4480  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4481  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4482  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
4483  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
4484  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
4485  member.team_barrier();
4486  KB::Gemm<member_type,
4487  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4488  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4489  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
4490  KB::Trsm<member_type,
4491  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4492  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4493  X1.assign_data(X2.data());
4494  }
4495 
4496  // solve Ux = x
4497  KB::Trsm<member_type,
4498  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4499  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4500  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4501  i -= 3;
4502  A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
4503  X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
4504  member.team_barrier();
4505  KB::Gemm<member_type,
4506  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4507  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4508 
4509  A.assign_data(&D_internal_vector_values(i, 0, 0, v));
4510  KB::Trsm<member_type,
4511  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4512  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4513  X1.assign_data(X2.data());
4514  }
4515  } else {
4516  // matrix is already inverted
4517  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4518  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
4519  member.team_barrier();
4520  KB::Gemm<member_type,
4521  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4522  default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
4523  }
4524  }
4525 
4526  template <int B>
4527  struct SingleVectorTag {};
4528  template <int B>
4529  struct MultiVectorTag {};
4530 
4531  template <int B>
4532  struct SingleVectorSubLineTag {};
4533  template <int B>
4534  struct SingleVectorApplyCTag {};
4535  template <int B>
4536  struct SingleVectorSchurTag {};
4537  template <int B>
4538  struct SingleVectorApplyETag {};
4539  template <int B>
4540  struct CopyVectorToFlatTag {};
4541  template <int B>
4542  struct SingleZeroingTag {};
4543 
4544  template <int B>
4545  KOKKOS_INLINE_FUNCTION void
4546  operator()(const SingleVectorTag<B> &, const member_type &member) const {
4547  const local_ordinal_type packidx = member.league_rank();
4548  const local_ordinal_type partidx = packptr(packidx);
4549  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4550  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4551  const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4552  const local_ordinal_type r0 = part2packrowidx0(partidx);
4553  const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4554  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4555  const local_ordinal_type num_vectors = 1;
4556  internal_vector_scratch_type_3d_view
4557  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4558  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4559  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4560  });
4561  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4562  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4563  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4564  });
4565  }
4566 
4567  template <int B>
4568  KOKKOS_INLINE_FUNCTION void
4569  operator()(const MultiVectorTag<B> &, const member_type &member) const {
4570  const local_ordinal_type packidx = member.league_rank();
4571  const local_ordinal_type partidx = packptr(packidx);
4572  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4573  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4574  const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4575  const local_ordinal_type r0 = part2packrowidx0(partidx);
4576  const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4577  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4578  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4579 
4580  internal_vector_scratch_type_3d_view
4581  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4582  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4583  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4584  });
4585  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4586  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4587  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4588  });
4589  }
4590 
4591  template <int B>
4592  KOKKOS_INLINE_FUNCTION void
4593  operator()(const SingleVectorSubLineTag<B> &, const member_type &member) const {
4594  // btdm is packed with lines sorted from longest to shortest
4595  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4596 
4597  const local_ordinal_type subpartidx = packptr_sub(packidx);
4598  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4599  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4600  const local_ordinal_type partidx = subpartidx % n_parts;
4601 
4602  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
4603  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4604  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4605  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4606  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4607 
4608  //(void) i0;
4609  //(void) nrows;
4610  (void)npacks;
4611 
4612  internal_vector_scratch_type_3d_view
4613  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4614 
4615  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4616  auto X_internal_vec = Kokkos::subview(X_internal_vector_values, Kokkos::ALL(), Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL());
4617  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vec, WW);
4618  });
4619  }
4620 
4621  template <int B>
4622  KOKKOS_INLINE_FUNCTION void
4623  operator()(const SingleVectorApplyCTag<B> &, const member_type &member) const {
4624  // btdm is packed with lines sorted from longest to shortest
4625  // const local_ordinal_type packidx = packindices_schur(member.league_rank());
4626  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4627 
4628  const local_ordinal_type subpartidx = packptr_sub(packidx);
4629  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4630  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4631  const local_ordinal_type partidx = subpartidx % n_parts;
4632  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4633 
4634  // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4635  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4636  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4637  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4638 
4639  // Compute v_2 = v_2 - C v_1
4640 
4641  const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
4642  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
4643  const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0 + 2 : i0 + 2;
4644 
4645  (void)i0_schur;
4646  (void)i0_offset;
4647 
4648 #if KOKKOS_VERSION >= 40799
4649  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4650 #else
4651  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4652 #endif
4653 
4654  const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx) - 2 : 0;
4655  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx + 1) + 1;
4656 
4657  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4658 
4659  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4660  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4661 
4662  if (local_subpartidx == 0) {
4663  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4664  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4665  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4666  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4667 
4668  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4669  member,
4670  blocksize, blocksize,
4671  -one,
4672  C.data(), C.stride(0), C.stride(1),
4673  v_1.data(), v_1.stride(0),
4674  one,
4675  v_2.data(), v_2.stride(0));
4676  });
4677  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4678  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4679  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), active_schur_solve_vec, v);
4680  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4681  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4682 
4683  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4684  member,
4685  blocksize, blocksize,
4686  -one,
4687  C.data(), C.stride(0), C.stride(1),
4688  v_1.data(), v_1.stride(0),
4689  one,
4690  v_2.data(), v_2.stride(0));
4691  });
4692  } else {
4693  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4694  {
4695  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4696  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4697  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4698 
4699  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4700  member,
4701  blocksize, blocksize,
4702  -one,
4703  C.data(), C.stride(0), C.stride(1),
4704  v_1.data(), v_1.stride(0),
4705  one,
4706  v_2.data(), v_2.stride(0));
4707  }
4708  {
4709  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), active_schur_solve_vec, v);
4710  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4711  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4712 
4713  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4714  member,
4715  blocksize, blocksize,
4716  -one,
4717  C.data(), C.stride(0), C.stride(1),
4718  v_1.data(), v_1.stride(0),
4719  one,
4720  v_2.data(), v_2.stride(0));
4721  }
4722  });
4723  }
4724  }
4725 
4726  template <int B>
4727  KOKKOS_INLINE_FUNCTION void
4728  operator()(const SingleVectorSchurTag<B> &, const member_type &member) const {
4729  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4730 
4731  const local_ordinal_type partidx = packptr_sub(packidx);
4732 
4733  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4734 
4735  const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx, 0);
4736  const local_ordinal_type nrows = 2 * (n_subparts_per_part - 1);
4737 
4738  const local_ordinal_type r0_schur = nrows * member.league_rank();
4739 
4740  internal_vector_scratch_type_3d_view
4741  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4742 
4743  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4744  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4745  for (local_ordinal_type i = 0; i < 2; ++i) {
4746  copy3DView<local_ordinal_type>(member,
4747  Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL()),
4748  Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL()));
4749  }
4750  }
4751 
4752  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4753  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4754  });
4755 
4756  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4757  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4758  for (local_ordinal_type i = 0; i < 2; ++i) {
4759  copy3DView<local_ordinal_type>(member,
4760  Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL()),
4761  Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL()));
4762  }
4763  }
4764  }
4765 
4766  template <int B>
4767  KOKKOS_INLINE_FUNCTION void
4768  operator()(const SingleVectorApplyETag<B> &, const member_type &member) const {
4769  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4770 
4771  const local_ordinal_type subpartidx = packptr_sub(packidx);
4772  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4773  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4774  const local_ordinal_type partidx = subpartidx % n_parts;
4775  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4776 
4777  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4778  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4779 
4780  // Compute v_2 = v_2 - C v_1
4781 
4782 #if KOKKOS_VERSION >= 40799
4783  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4784 #else
4785  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4786 #endif
4787 
4788  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4789 
4790  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4791  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4792 
4793  if (local_subpartidx == 0) {
4794  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4795  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4796 
4797  for (local_ordinal_type row = 0; row < nrows; ++row) {
4798  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4799  auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4800 
4801  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4802  member,
4803  blocksize, blocksize,
4804  -one,
4805  E.data(), E.stride(0), E.stride(1),
4806  v_2.data(), v_2.stride(0),
4807  one,
4808  v_1.data(), v_1.stride(0));
4809  }
4810  });
4811  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4812  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4813  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4814 
4815  for (local_ordinal_type row = 0; row < nrows; ++row) {
4816  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4817  auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4818 
4819  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4820  member,
4821  blocksize, blocksize,
4822  -one,
4823  E.data(), E.stride(0), E.stride(1),
4824  v_2.data(), v_2.stride(0),
4825  one,
4826  v_1.data(), v_1.stride(0));
4827  }
4828  });
4829  } else {
4830  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4831  {
4832  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4833 
4834  for (local_ordinal_type row = 0; row < nrows; ++row) {
4835  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4836  auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4837 
4838  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4839  member,
4840  blocksize, blocksize,
4841  -one,
4842  E.data(), E.stride(0), E.stride(1),
4843  v_2.data(), v_2.stride(0),
4844  one,
4845  v_1.data(), v_1.stride(0));
4846  }
4847  }
4848  {
4849  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4850 
4851  for (local_ordinal_type row = 0; row < nrows; ++row) {
4852  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4853  auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4854 
4855  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4856  member,
4857  blocksize, blocksize,
4858  -one,
4859  E.data(), E.stride(0), E.stride(1),
4860  v_2.data(), v_2.stride(0),
4861  one,
4862  v_1.data(), v_1.stride(0));
4863  }
4864  }
4865  });
4866  }
4867  }
4868 
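Taken together, the four tags above perform a two-level (Schur-complement) solve of each packed line: with the unknowns split into sub-line interiors \(x_I\) and interface rows \(x_\Gamma\), the phases are, schematically,

\[ w_I = A_{II}^{-1} b_I, \qquad \tilde b_\Gamma = b_\Gamma - A_{\Gamma I}\, w_I, \qquad S\, x_\Gamma = \tilde b_\Gamma, \qquad x_I = w_I - E\, x_\Gamma, \]

where \(S = A_{\Gamma\Gamma} - A_{\Gamma I} A_{II}^{-1} A_{I\Gamma}\) was factored in the numeric phase and \(E = A_{II}^{-1} A_{I\Gamma}\) is the factor held in e_internal_vector_values; SingleVectorSubLineTag computes \(w_I\), SingleVectorApplyCTag the right-hand-side update, SingleVectorSchurTag the interface solve, and SingleVectorApplyETag the back-substitution.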
4869  template <int B>
4870  KOKKOS_INLINE_FUNCTION void
4871  operator()(const CopyVectorToFlatTag<B> &, const member_type &member) const {
4872  const local_ordinal_type packidx = member.league_rank();
4873  const local_ordinal_type partidx = packptr(packidx);
4874  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4875  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4876  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4877  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4878 
4879  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4880  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4881  });
4882  }
4883 
4884  template <int B>
4885  KOKKOS_INLINE_FUNCTION void
4886  operator()(const SingleZeroingTag<B> &, const member_type &member) const {
4887  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4888  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4889  });
4890  }
4891 
4892  void run(const impl_scalar_type_2d_view_tpetra &Y,
4893  const impl_scalar_type_1d_view &Z) {
4894  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4895  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4896 
4898  this->Y_scalar_multivector = Y;
4899  this->Z_scalar_vector = Z;
4900 
4901  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4902  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4903 
4904  const local_ordinal_type team_size =
4905  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4906  recommended_team_size(blocksize, vector_length, internal_vector_length);
4907  const int per_team_scratch = internal_vector_scratch_type_3d_view::shmem_size(blocksize, num_vectors, vector_loop_size);
4908 
4909 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4910  if (packindices_schur.extent(1) <= 0) { \
4911  if (num_vectors == 1) { \
4912  Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4913  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4914  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4915  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4916  policy, *this); \
4917  } else { \
4918  Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4919  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4920  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4921  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4922  policy, *this); \
4923  } \
4924  } else { \
4925  { \
4926  Kokkos::TeamPolicy<execution_space, SingleZeroingTag<B>> \
4927  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4928  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4929  policy, *this); \
4930  } \
4931  for (local_ordinal_type vec = 0; vec < num_vectors; vec++) { \
4932  this->active_schur_solve_vec = vec; \
4933  { \
4934  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4935  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4936  Kokkos::TeamPolicy<execution_space, SingleVectorSubLineTag<B>> \
4937  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4938  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4939  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4940  policy, *this); \
4941  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4942  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4943  } \
4944  { \
4945  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4946  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4947  Kokkos::TeamPolicy<execution_space, SingleVectorApplyCTag<B>> \
4948  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4949  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4950  policy, *this); \
4951  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4952  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4953  } \
4954  { \
4955  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4956  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4957  Kokkos::TeamPolicy<execution_space, SingleVectorSchurTag<B>> \
4958  policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4959  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4960  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4961  policy, *this); \
4962  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4963  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4964  } \
4965  { \
4966  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4967  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4968  Kokkos::TeamPolicy<execution_space, SingleVectorApplyETag<B>> \
4969  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4970  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4971  policy, *this); \
4972  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4973  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4974  } \
4975  } \
4976  { \
4977  Kokkos::TeamPolicy<execution_space, CopyVectorToFlatTag<B>> \
4978  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4979  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<CopyVectorToFlatTag>", \
4980  policy, *this); \
4981  } \
4982  } \
4983  break
4984  switch (blocksize) {
4985  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(3);
4986  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(5);
4987  case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(6);
4988  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(7);
4989  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4990  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4991  case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4992  case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4993  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4994  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4995  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4996  case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4997  default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(0);
4998  }
4999 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
5000 
5001  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
5002  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
5003  }
5004 };
5005 
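The BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS macro exists so that common block sizes become compile-time constants (the template parameter B) inside the kernels, with B == 0 as the generic run-time fallback. A minimal sketch of the same tag-dispatch pattern (hypothetical names, not Ifpack2 code):

#include <cstdio>

template <int B>
struct KernelTag {};  // B == 0 means "read the block size at run time"

struct Solver {
  int runtime_blocksize = 7;

  template <int B>
  void operator()(const KernelTag<B> &) const {
    const int blocksize = (B == 0 ? runtime_blocksize : B);
    std::printf("blocksize %d (compile-time constant: %s)\n",
                blocksize, B == 0 ? "no" : "yes");
  }

  void run() const {
#define DISPATCH(B)        \
  (*this)(KernelTag<B>{}); \
  break
    switch (runtime_blocksize) {
      case 3: DISPATCH(3);
      case 5: DISPATCH(5);
      case 7: DISPATCH(7);
      default: DISPATCH(0);
    }
#undef DISPATCH
  }
};

int main() {
  Solver{}.run();
  return 0;
}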
5009 template <typename MatrixType>
5010 int applyInverseJacobi( // importer
5011  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
5012  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
5013  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5014  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5015  const bool overlap_communication_and_computation,
5016  // tpetra interface
5017  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5018  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5019  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
5020  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5021  // local object interface
5022  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5023  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5024  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5025  /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
5026  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5027  // preconditioner parameters
5028  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5029  /* */ bool is_y_zero,
5030  const int max_num_sweeps,
5031  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5032  const int check_tol_every) {
5033  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
5034 
5035  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5036  using node_memory_space = typename impl_type::node_memory_space;
5037  using local_ordinal_type = typename impl_type::local_ordinal_type;
5038  using size_type = typename impl_type::size_type;
5039  using impl_scalar_type = typename impl_type::impl_scalar_type;
5040  using magnitude_type = typename impl_type::magnitude_type;
5041  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
5042  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
5043  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
5044  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
5045 
5046  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5047 
5048  // the tpetra importer and async importer must not both be active
5049  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5050  "Neither Tpetra importer nor Async importer is null.");
5051  // the max number of sweeps must be a positive number
5052  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5053  "Maximum number of sweeps must be >= 1.");
5054 
5055  // const parameters
5056  const bool is_seq_method_requested = !tpetra_importer.is_null();
5057  const bool is_async_importer_active = !async_importer.is_null();
5058 #if KOKKOS_VERSION >= 40799
5059  const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
5060 #else
5061  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5062 #endif
5063  const magnitude_type tolerance = tol * tol;
5064  const local_ordinal_type blocksize = btdm.values.extent(1);
5065  const local_ordinal_type num_vectors = Y.getNumVectors();
5066  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
5067 
5068  const impl_scalar_type zero(0.0);
5069 
5070  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
5071  "The seq method for applyInverseJacobi, "
5072  << "which in any case is for developer use only, "
5073  << "does not support norm-based termination.");
5074  const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
5075  Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
5076  TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
5077  std::invalid_argument,
5078  "The seq method for applyInverseJacobi, "
5079  << "which in any case is for developer use only, "
5080  << "only supports memory spaces accessible from host.");
5081 
5082  // if more workspace is needed, resize it
5083  const size_type work_span_required = num_blockrows * num_vectors * blocksize;
5084  if (work.span() < work_span_required)
5085  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
5086 
5087  // construct W
5088  const local_ordinal_type W_size = interf.packptr.extent(0) - 1;
5089  if (local_ordinal_type(W.extent(0)) < W_size)
5090  W = impl_scalar_type_1d_view("W", W_size);
5091 
5092  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5093  {
5094  if (is_seq_method_requested) {
5095  if (Z.getNumVectors() != Y.getNumVectors())
5096  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
5097  } else {
5098  if (is_async_importer_active) {
5099  // create comm data buffer and keep it here
5100  async_importer->createDataBuffer(num_vectors);
5101  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5102  }
5103  }
5104  }
5105 
5106  // wrap the workspace in a 3d view
5107  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
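// The construction of pmv above is the standard Kokkos idiom of wrapping an
// existing flat allocation in an unmanaged multi-dimensional view: a View
// built from a raw pointer does not allocate or reference-count, it merely
// aliases 'work'. A hedged, self-contained sketch of the idiom (the types
// and names here are illustrative, not this header's):
//
// \code
// #include <Kokkos_Core.hpp>
// using mem_space = Kokkos::DefaultExecutionSpace::memory_space;
// void wrap_workspace(const Kokkos::View<double*, mem_space>& flat,
//                     const int nrows, const int blocksize, const int nvec) {
//   // Unmanaged 3-D alias of the flat buffer; 'flat' owns the memory and
//   // must outlive 'pmv'.
//   Kokkos::View<double***, mem_space,
//                Kokkos::MemoryTraits<Kokkos::Unmanaged>>
//       pmv(flat.data(), nrows, blocksize, nvec);
//   (void)pmv;
// }
// \endcode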
5108  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5109  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5110  const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
5111  if (is_y_zero) Kokkos::deep_copy(YY, zero);
5112 
5113  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
5114  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
5115  damping_factor, is_norm_manager_active);
5116 
5117  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
5118 
5119  auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
5120  auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
5121 
5122  bool hasBlockCrsMatrix = !A_bcrs.is_null();
5123 
5124  // It is OK here to use the graph of the A_crs matrix together with a block size of 1
5125  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
5126 
5127  BlockHelperDetails::ComputeResidualVector<MatrixType>
5128  compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5129  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5130  hasBlockCrsMatrix);
5131 
5132  // set how often the norm manager checks for convergence
5133  if (is_norm_manager_active)
5134  norm_manager.setCheckFrequency(check_tol_every);
5135 
5136  // iterate
5137  int sweep = 0;
5138  for (; sweep < max_num_sweeps; ++sweep) {
5139  {
5140  if (is_y_zero) {
5141  // pmv := x(lclrow)
5142  multivector_converter.run(XX);
5143  } else {
5144  if (is_seq_method_requested) {
5145  // SEQ METHOD IS TESTING ONLY
5146 
5147  // y := x - R y
5148  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5149  compute_residual_vector.run(YY, XX, ZZ);
5150 
5151  // pmv := y(lclrow).
5152  multivector_converter.run(YY);
5153  } else {
5154  // fused y := x - R y and pmv := y(lclrow);
5155  // the real use case does not use overlapped comp and comm
5156  if (overlap_communication_and_computation || !is_async_importer_active) {
5157  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5158  // OverlapTag, compute_owned = true
5159  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
5160  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5161  if (is_async_importer_active) async_importer->cancel();
5162  break;
5163  }
5164  if (is_async_importer_active) {
5165  async_importer->syncRecv();
5166  // OverlapTag, compute_owned = false
5167  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
5168  }
5169  } else {
5170  if (is_async_importer_active)
5171  async_importer->syncExchange(YY);
5172  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5173  // AsyncTag
5174  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5175  }
5176  }
5177  }
5178  }
5179 
5180  // pmv := inv(D) pmv.
5181  {
5182  solve_tridiags.run(YY, W);
5183  }
5184  {
5185  if (is_norm_manager_active) {
5186  // the update was y(lclrow) = (b - a) y(lclrow) + a pmv, with a = damping_factor and b = 1 always.
5187  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5188  if (sweep + 1 == max_num_sweeps) {
5189  norm_manager.ireduce(sweep, true);
5190  norm_manager.checkDone(sweep + 1, tolerance, true);
5191  } else {
5192  norm_manager.ireduce(sweep);
5193  }
5194  }
5195  }
5196  is_y_zero = false;
5197  }
5198 
5199  // sqrt the norms for the caller's use.
5200  if (is_norm_manager_active) norm_manager.finalize();
5201 
5202  return sweep;
5203 }
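// Note the norm convention used above: the caller's tol is squared once
// (tolerance = tol * tol) and the NormManager works with squared 2-norms
// until finalize(), so the per-sweep checkDone calls need no square roots.
// A self-contained sketch of that trick, assuming a relative criterion of
// the form ||r|| <= tol * ||r0|| (the names here are illustrative):
//
// \code
// bool check_done(const double r_sq, const double r0_sq, const double tol) {
//   // ||r||^2 <= tol^2 * ||r0||^2  is equivalent to  ||r|| <= tol * ||r0||,
//   // but compares squared norms and avoids a sqrt every sweep.
//   return r_sq <= (tol * tol) * r0_sq;
// }
// \endcode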
5204 
5205 // Implementation of fused block Jacobi for a specific block size,
5206 // or (if B == 0) for a general block size.
5207 template <typename MatrixType, int B>
5208 int applyFusedBlockJacobi_Impl(
5209  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5210  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5211  const bool overlap_communication_and_computation,
5212  // tpetra interface
5213  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5214  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5215  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5216  // local object interface
5217  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5218  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5219  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5220  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5221  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5222  // preconditioner parameters
5223  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5224  /* */ bool is_y_zero,
5225  const int max_num_sweeps,
5226  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5227  const int check_tol_every) {
5228  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5229  using local_ordinal_type = typename impl_type::local_ordinal_type;
5230  using size_type = typename impl_type::size_type;
5231  using magnitude_type = typename impl_type::magnitude_type;
5232  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5233  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
5234 
5235  // the tpetra importer and async importer can't both be active
5236  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5237  "Neither Tpetra importer nor Async importer is null.");
5238  // the max number of sweeps must be a positive number
5239  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5240  "Maximum number of sweeps must be >= 1.");
5241 
5242  // const parameters
5243  const bool is_async_importer_active = !async_importer.is_null();
5244 #if KOKKOS_VERSION >= 40799
5245  const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
5246 #else
5247  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5248 #endif
5249  const magnitude_type tolerance = tol * tol;
5250  const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5251  const local_ordinal_type num_vectors = Y.getNumVectors();
5252  const local_ordinal_type num_blockrows = interf.nparts;
5253 
5254  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5255  {
5256  if (is_async_importer_active) {
5257  // create comm data buffer and keep it here
5258  async_importer->createDataBuffer(num_vectors);
5259  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5260  }
5261  }
5262 
5263  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5264  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5265 
5266  const bool two_pass_residual =
5267  overlap_communication_and_computation && is_async_importer_active;
5268 
5269  // Calculate the required work size and reallocate it if not already big enough.
5270  // Check that our assumptions about YY dimension are correct.
5271  TEUCHOS_TEST_FOR_EXCEPT_MSG(
5272  size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5273  "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) << " = " << YY.extent(0) * YY.extent(1) << ",\n"
5274  << "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors << " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n');
5275  size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5276  if (work.extent(0) < work_required) {
5277  work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required);
5278  }
5279 
5280  Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5281 
5282  // construct W
5283  if (W.extent(0) != size_t(num_blockrows))
5284  W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows);
5285 
5286  // Create the required functors upfront (this is inexpensive - all shallow copies)
5287  BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
5288  functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
5289  BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
5290  functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5291  BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
5292  functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5293 
5294  // set how often the norm manager checks for convergence
5295  if (is_norm_manager_active)
5296  norm_manager.setCheckFrequency(check_tol_every);
5297 
5298  // For double-buffering.
5299  // yy_buffers[current_y] has the current iterate of y.
5300  // yy_buffers[1-current_y] has the next iterate of y.
5301  Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5302  int current_y = 0;
5303 
5304  // iterate
5305  int sweep = 0;
5306  for (; sweep < max_num_sweeps; ++sweep) {
5307  if (is_y_zero) {
5308  // If y is initially zero, then we are just computing y := damping_factor * Dinv * x
5309  functor_solve_only.run(XX, y_buffers[1 - current_y]);
5310  } else {
5311  // the real use case does not use overlapped comp and comm
5312  if (overlap_communication_and_computation || !is_async_importer_active) {
5313  if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5314  if (two_pass_residual) {
5315  // Pass 1 computes owned residual and stores into new y buffer,
5316  // but doesn't apply Dinv or produce a norm yet
5317  functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1 - current_y]);
5318  } else {
5319  // This case happens when running on a single rank.
5320  // There are no remote columns, so residual and solve can happen in one step.
5321  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5322  }
5323  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5324  if (is_async_importer_active) async_importer->cancel();
5325  break;
5326  }
5327  if (is_async_importer_active) {
5328  async_importer->syncRecv();
5329  // Pass 2 finishes computing the residual, then applies Dinv and computes the norm.
5330  functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5331  }
5332  } else {
5333  if (is_async_importer_active)
5334  async_importer->syncExchange(y_buffers[current_y]);
5335  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5336  // Full residual, Dinv apply, and norm in one kernel
5337  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5338  }
5339  }
5340 
5341  // Compute global norm.
5342  if (is_norm_manager_active) {
5343  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5344  if (sweep + 1 == max_num_sweeps) {
5345  norm_manager.ireduce(sweep, true);
5346  norm_manager.checkDone(sweep + 1, tolerance, true);
5347  } else {
5348  norm_manager.ireduce(sweep);
5349  }
5350  }
5351  is_y_zero = false;
5352  // flip y buffers for the next iteration, or for the final copy-out if we reached max_num_sweeps.
5353  current_y = 1 - current_y;
5354  }
5355  if (current_y == 1) {
5356  // We finished iterating with y in the double buffer, so copy it to the user's vector.
5357  Kokkos::deep_copy(YY, y_doublebuf);
5358  }
5359 
5360  // sqrt the norms for the caller's use.
5361  if (is_norm_manager_active) norm_manager.finalize();
5362  return sweep;
5363 }
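// The loop above double-buffers the iterate: each sweep reads
// y_buffers[current_y] and writes y_buffers[1 - current_y], so the update
// never aliases its input, and a final deep_copy is needed only when the
// last iterate landed in the scratch buffer. A minimal sketch of the same
// pattern with plain std::vector standing in for the device views:
//
// \code
// #include <cstddef>
// #include <vector>
// void sweep_with_double_buffer(std::vector<double>& y, const int nsweeps) {
//   std::vector<double> scratch(y.size());
//   std::vector<double>* buf[2] = {&y, &scratch};
//   int cur = 0;
//   for (int s = 0; s < nsweeps; ++s) {
//     const std::vector<double>& in = *buf[cur];
//     std::vector<double>& out = *buf[1 - cur];
//     for (std::size_t i = 0; i < in.size(); ++i)
//       out[i] = 0.5 * in[i];  // stand-in for the residual/solve update
//     cur = 1 - cur;           // flip buffers for the next sweep
//   }
//   if (cur == 1) y = scratch; // final iterate is in scratch; copy back
// }
// \endcode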
5364 
5368 template <typename MatrixType>
5369 int applyFusedBlockJacobi(
5370  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5371  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5372  const bool overlap_communication_and_computation,
5373  // tpetra interface
5374  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5375  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5376  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5377  // local object interface
5378  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5379  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5380  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5381  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5382  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5383  // preconditioner parameters
5384  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5385  /* */ bool is_y_zero,
5386  const int max_num_sweeps,
5387  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5388  const int check_tol_every) {
5389  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5390  int blocksize = btdm.d_inv.extent(1);
5391  int sweep = 0;
5392 #define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) \
5393  { \
5394  sweep = applyFusedBlockJacobi_Impl<MatrixType, B>( \
5395  tpetra_importer, async_importer, overlap_communication_and_computation, \
5396  X, Y, W, interf, btdm, amd, work, \
5397  norm_manager, damping_factor, is_y_zero, \
5398  max_num_sweeps, tol, check_tol_every); \
5399  } \
5400  break
5401  switch (blocksize) {
5402  case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(3);
5403  case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(5);
5404  case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(7);
5405  case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(9);
5406  case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
5407  case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
5408  case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
5409  case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
5410  case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
5411  default: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(0);
5412  }
5413 #undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI
5414 
5415  return sweep;
5416 }
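// Both apply paths above use the same overlap recipe whenever an async
// importer is active: post the halo exchange, compute the contributions
// that need only owned columns, wait for the remote data, then finish the
// remote contributions. A hedged sketch of that control flow (the importer
// interface and callables here are placeholders, not this header's API):
//
// \code
// template <typename Importer, typename OwnedWork, typename RemoteWork>
// void overlapped_sweep(Importer& importer, OwnedWork owned_work,
//                       RemoteWork remote_work) {
//   importer.asyncSendRecv();  // start the non-blocking halo exchange
//   owned_work();              // overlap: rows touching only owned columns
//   importer.syncRecv();       // wait until remote column data has arrived
//   remote_work();             // finish rows that touch remote columns
// }
// \endcode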
5417 
5418 template <typename MatrixType>
5419 struct ImplObject {
5420  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5421  using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5422  using block_tridiags_type = BlockTridiags<MatrixType>;
5423  using amd_type = BlockHelperDetails::AmD<MatrixType>;
5424  using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
5425  using async_import_type = AsyncableImport<MatrixType>;
5426 
5427  // distributed objects
5428  Teuchos::RCP<const typename impl_type::tpetra_row_matrix_type> A;
5429  Teuchos::RCP<const typename impl_type::tpetra_crs_graph_type> G;
5430  Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
5431  Teuchos::RCP<async_import_type> async_importer;
5432  bool overlap_communication_and_computation;
5433 
5434  // copy of Y (mutable to penetrate const)
5435  mutable typename impl_type::tpetra_multivector_type Z;
5436  mutable typename impl_type::impl_scalar_type_1d_view W;
5437 
5438  // local objects
5439  part_interface_type part_interface;
5440  block_tridiags_type block_tridiags; // D
5441  amd_type a_minus_d; // R = A - D
5442 
5443  // whether to use fused block Jacobi path
5444  bool use_fused_jacobi;
5445 
5446  // vector workspace is used for the general block tridi case
5447  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace (1D view of vector)
5448  // scalar workspace is used for the fused block jacobi case
5449  mutable typename impl_type::impl_scalar_type_1d_view work_flat; // right hand side workspace (1D view of scalar)
5450  mutable norm_manager_type norm_manager;
5451 };
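// A hedged sketch of how ImplObject is meant to be used: a container holds
// one instance and forwards its members to the apply entry points above,
// picking the fused path when use_fused_jacobi is set. The wrapper function
// and its X/Y/damping arguments are illustrative, not this header's API;
// only the argument mapping mirrors the signatures above:
//
// \code
// template <typename MatrixType>
// int applySweeps(ImplObject<MatrixType>& impl,
//                 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type& X,
//                 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type& Y,
//                 const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type& damping,
//                 const int max_sweeps,
//                 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol) {
//   if (impl.use_fused_jacobi)
//     return applyFusedBlockJacobi<MatrixType>(
//         impl.tpetra_importer, impl.async_importer,
//         impl.overlap_communication_and_computation,
//         X, Y, impl.W, impl.part_interface, impl.block_tridiags,
//         impl.a_minus_d, impl.work_flat, impl.norm_manager,
//         damping, /*is_y_zero=*/true, max_sweeps, tol, /*check_tol_every=*/1);
//   return applyInverseJacobi<MatrixType>(
//       impl.A, impl.G, impl.tpetra_importer, impl.async_importer,
//       impl.overlap_communication_and_computation,
//       X, Y, impl.Z, impl.W, impl.part_interface, impl.block_tridiags,
//       impl.a_minus_d, impl.work, impl.norm_manager,
//       damping, /*is_y_zero=*/true, max_sweeps, tol, /*check_tol_every=*/1);
// }
// \endcode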
5452 
5453 } // namespace BlockTriDiContainerDetails
5454 
5455 } // namespace Ifpack2
5456 
5457 #endif