Ifpack2_BlockTriDiContainer_impl.hpp
// @HEADER
// *****************************************************************************
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
//
// Copyright 2009 NTESS and the Ifpack2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP

// #define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
// #define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF

#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#if KOKKOS_VERSION >= 40799
#include <KokkosKernels_ArithTraits.hpp>
#else
#include <Kokkos_ArithTraits.hpp>
#endif
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_SetIdentity_Decl.hpp>
#include <KokkosBatched_SetIdentity_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>

#include <memory>

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"
#include "Ifpack2_BlockComputeResidualAndSolve.hpp"

// TODO: expose this through a CMake variable (or enable this flag only when it is necessary)
// #define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
// #undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif

// Note: MPI-3 support on CUDA has not been fully verified.
#if MPI_VERSION >= 3
#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
#endif

// ::: Experiments :::
// Define either pinned memory or CUDA memory for MPI.
// If both macros are disabled, the Tpetra memory space is used, which is UVM space on CUDA.
// If the pinned-memory macro is defined, pinned memory is used instead of a device pointer.
// Pinned memory is enabled by default.
#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
// #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI

// If defined, all views are allocated on CUDA space instead of CUDA UVM space.
#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE

// If defined, btdm_scalar_type is used (if impl_scalar_type is double, btdm_scalar_type is float).
#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
#endif

// If defined, multiple execution space instances are used.
#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES

namespace Ifpack2 {

namespace BlockTriDiContainerDetails {

namespace KB = KokkosBatched;

using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;

template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
                                          MemoryTraitsType::is_random_access |
                                          flag>;

template <typename ViewType>
using Unmanaged = Kokkos::View<typename ViewType::data_type,
                               typename ViewType::array_layout,
                               typename ViewType::device_type,
                               MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
template <typename ViewType>
using Atomic = Kokkos::View<typename ViewType::data_type,
                            typename ViewType::array_layout,
                            typename ViewType::device_type,
                            MemoryTraits<typename ViewType::memory_traits, Kokkos::Atomic>>;
template <typename ViewType>
using Const = Kokkos::View<typename ViewType::const_data_type,
                           typename ViewType::array_layout,
                           typename ViewType::device_type,
                           typename ViewType::memory_traits>;
template <typename ViewType>
using ConstUnmanaged = Const<Unmanaged<ViewType>>;

template <typename ViewType>
using AtomicUnmanaged = Atomic<Unmanaged<ViewType>>;

template <typename ViewType>
using Scratch = Kokkos::View<typename ViewType::data_type,
                             typename ViewType::array_layout,
                             typename ViewType::execution_space::scratch_memory_space,
                             MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;

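// A minimal illustration (not part of the original source) of how these alias
// templates compose. Given a managed device view, Unmanaged<> strips ownership
// (useful for wrapping raw pointers and subviews handed into kernels), and
// ConstUnmanaged<> additionally makes the element type read-only:
//
//   using view_type = Kokkos::View<double *>;
//   view_type a("a", 100);
//   Unmanaged<view_type> a_u(a.data(), a.extent(0));       // no reference counting
//   ConstUnmanaged<view_type> a_cu(a.data(), a.extent(0)); // read-only, unmanaged
//
// Scratch<> rebinds a view type to team scratch memory for use inside
// TeamPolicy kernels via member.team_scratch(0).
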
// Scalar type used for the block tridiagonal factorization; it may be a lower
// precision than impl_scalar_type (see the small-scalar macro above).
template <typename T>
struct BlockTridiagScalarType {
  typedef T type;
};
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
template <>
struct BlockTridiagScalarType<double> {
  typedef float type;
};
// template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
#endif

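// For example (illustrative only): with the small-scalar macro defined,
//   static_assert(std::is_same<BlockTridiagScalarType<double>::type, float>::value, "");
// holds, so the tridiagonal factorization runs in single precision while the
// rest of the preconditioner stays in double; without the macro the type maps
// to itself and both run in the same precision.
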
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());

#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
  { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStop()); }
#else
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
#endif

template <typename MatrixType>
Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = !A_bcrs.is_null();

  // This is OK here to use the graph of the A_crs matrix and a block size of 1
  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());  // tpetra crs graph object

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap(), blocksize)));
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::rcp(new tpetra_import_type(src, tgt));
}
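
// Illustrative sketch (not from the original source): makePointMap expands a
// block map into a point map. With blocksize = 3, a domain map owning block
// GIDs {0, 1} yields a point map owning GIDs {0, 1, 2, 3, 4, 5}, so the
// importer built above moves individual point entries rather than whole
// blocks.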

// Partial replacement for forward-mode MultiVector::doImport.
// Permits overlapped communication and computation, but also supports a
// synchronous mode. Overlapped comm/comp can give quite poor performance on
// some platforms, so it cannot simply be used unconditionally.

template <typename MatrixType>
struct AsyncableImport {
 public:
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

 private:
#if !defined(HAVE_IFPACK2_MPI)
  typedef int MPI_Request;
  typedef int MPI_Comm;
#endif
  using scalar_type = typename impl_type::scalar_type;

  static int isend(const MPI_Comm comm, const char *buf, int count, int dest, int tag, MPI_Request *ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Isend(const_cast<char *>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int irecv(const MPI_Comm comm, char *buf, int count, int src, int tag, MPI_Request *ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int waitany(int count, MPI_Request *reqs, int *index) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
    return 0;
#endif
  }
  static int waitall(int count, MPI_Request *reqs) {
#ifdef HAVE_IFPACK2_MPI
    // MPI_Waitall takes an array of statuses, so MPI_STATUSES_IGNORE (not the
    // scalar MPI_STATUS_IGNORE) is the correct constant to ignore them.
    return MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE);
#else
    return 0;
#endif
  }

 public:
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;

  using int_1d_view_host = Kokkos::View<int *, Kokkos::HostSpace>;
  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type *, Kokkos::HostSpace>;

  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using size_type_1d_view_host = Kokkos::View<size_type *, Kokkos::HostSpace>;

#if defined(KOKKOS_ENABLE_CUDA)
  using impl_scalar_type_1d_view =
      typename std::conditional<std::is_same<execution_space, Kokkos::Cuda>::value,
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
                                Kokkos::View<impl_scalar_type *, Kokkos::CudaHostPinnedSpace>,
#elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
                                Kokkos::View<impl_scalar_type *, Kokkos::CudaSpace>,
#else  // no experimental macros are defined
                                typename impl_type::impl_scalar_type_1d_view,
#endif
                                typename impl_type::impl_scalar_type_1d_view>::type;
#else
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
#endif
  using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type *, Kokkos::HostSpace>;
  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

#ifdef HAVE_IFPACK2_MPI
  MPI_Comm comm;
#endif

  impl_scalar_type_2d_view_tpetra remote_multivector;
  local_ordinal_type blocksize;

  template <typename T>
  struct SendRecvPair {
    T send, recv;
  };

  // (s)end and (r)eceive data:
  SendRecvPair<int_1d_view_host> pids;  // mpi ranks
  SendRecvPair<std::vector<MPI_Request>> reqs;  // MPI_Request is an opaque handle (a pointer in many MPIs); a kokkos view cannot be used
  SendRecvPair<size_type_1d_view> offset;                   // offsets to local id list and data buffer
  SendRecvPair<size_type_1d_view_host> offset_host;         // offsets to local id list and data buffer
  SendRecvPair<local_ordinal_type_1d_view> lids;            // local id list
  SendRecvPair<impl_scalar_type_1d_view> buffer;            // data buffer
  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host;  // data buffer

  local_ordinal_type_1d_view dm2cm;  // permutation

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  using exec_instance_1d_std_vector = std::vector<execution_space>;
  exec_instance_1d_std_vector exec_instances;
#endif

  // for cuda
 public:
  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                       const size_type_1d_view &offs) {
    // wrap lens to kokkos view and deep copy to device
    Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(const_cast<size_t *>(lens.getRawPtr()), lens.size());
    const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);

    // exclusive scan
    const Kokkos::RangePolicy<execution_space> policy(0, offs.extent(0));
    const local_ordinal_type lens_size = lens_device.extent(0);
    Kokkos::parallel_scan(
        "AsyncableImport::RangePolicy::setOffsetValues",
        policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
          if (final)
            offs(i) = update;
          update += (i < lens_size ? lens_device[i] : 0);
        });
  }

  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
                           const size_type_1d_view_host &offs) {
    // exclusive scan directly on the host; no device copy of lens is needed
    offs(0) = 0;
    for (local_ordinal_type i = 1, iend = offs.extent(0); i < iend; ++i) {
      offs(i) = offs(i - 1) + lens[i - 1];
    }
  }
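
  // Worked example (illustrative, not part of the original source): for
  // lens = {3, 1, 4} and offs of extent 4, both scans above produce
  // offs = {0, 3, 4, 8}: offs(i+1) - offs(i) recovers lens[i], and the last
  // entry is the total length, which sizes the packed send/recv buffers.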

 private:
  void createMpiRequests(const tpetra_import_type &import) {
    Tpetra::Distributor &distributor = import.getDistributor();

    // copy pids from distributor
    const auto pids_from = distributor.getProcsFrom();
    pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
    memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int) * pids.recv.extent(0));

    const auto pids_to = distributor.getProcsTo();
    pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
    memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int) * pids.send.extent(0));

    // mpi requests
    reqs.recv.resize(pids.recv.extent(0));
    memset(reqs.recv.data(), 0, reqs.recv.size() * sizeof(MPI_Request));
    reqs.send.resize(pids.send.extent(0));
    memset(reqs.send.data(), 0, reqs.send.size() * sizeof(MPI_Request));

    // construct offsets
#if 0
    const auto lengths_to = distributor.getLengthsTo();
    offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValues(lengths_to, offset.send);
    offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);

    setOffsetValues(lengths_from, offset.recv);
    offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
#else
    const auto lengths_to = distributor.getLengthsTo();
    offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValuesHost(lengths_to, offset_host.send);
    // offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);

    setOffsetValuesHost(lengths_from, offset_host.recv);
    // offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
#endif
  }

  void createSendRecvIDs(const tpetra_import_type &import) {
    // For each remote PID, the list of LIDs to receive.
    const auto remote_lids = import.getRemoteLIDs();
    const local_ordinal_type_1d_view_host
        remote_lids_view_host(const_cast<local_ordinal_type *>(remote_lids.getRawPtr()), remote_lids.size());
    lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
    Kokkos::deep_copy(lids.recv, remote_lids_view_host);

    // For each export PID, the list of LIDs to send.
    auto epids = import.getExportPIDs();
    auto elids = import.getExportLIDs();
    TEUCHOS_ASSERT(epids.size() == elids.size());
    lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
    auto lids_send_host = Kokkos::create_mirror_view(lids.send);

    // naive search (not sure if pids or epids are sorted)
    for (local_ordinal_type cnt = 0, i = 0, iend = pids.send.extent(0); i < iend; ++i) {
      const auto pid_send_value = pids.send[i];
      for (local_ordinal_type j = 0, jend = epids.size(); j < jend; ++j)
        if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
      TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i + 1]);
    }
    Kokkos::deep_copy(lids.send, lids_send_host);
  }

  void createExecutionSpaceInstances() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
    // The following line creates 8 streams:
#if KOKKOS_VERSION >= 40699
    exec_instances =
        Kokkos::Experimental::partition_space(execution_space(), std::vector<int>(8, 1));
#else
    exec_instances =
        Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
#endif
#endif
  }

 public:
  // for cuda, all tag types are public
  struct ToBuffer {};
  struct ToMultiVector {};

  AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
                  const Teuchos::RCP<const tpetra_map_type> &tgt_map,
                  const local_ordinal_type blocksize_,
                  const local_ordinal_type_1d_view dm2cm_) {
    blocksize = blocksize_;
    dm2cm = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
    comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
#endif
    const tpetra_import_type import(src_map, tgt_map);

    createMpiRequests(import);
    createSendRecvIDs(import);
    createExecutionSpaceInstances();
  }

  void createDataBuffer(const local_ordinal_type &num_vectors) {
    const size_type extent_0 = lids.recv.extent(0) * blocksize;
    const size_type extent_1 = num_vectors;
    if (remote_multivector.extent(0) == extent_0 &&
        remote_multivector.extent(1) == extent_1) {
      // skip
    } else {
      remote_multivector =
          impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);

      const auto send_buffer_size = offset_host.send[offset_host.send.extent(0) - 1] * blocksize * num_vectors;
      const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0) - 1] * blocksize * num_vectors;

      buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
      buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);

      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
        buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
      }
    }
  }

  void cancel() {
#ifdef HAVE_IFPACK2_MPI
    waitall(reqs.recv.size(), reqs.recv.data());
    waitall(reqs.send.size(), reqs.send.data());
#endif
  }

  // ======================================================================
  // Async version using execution space instances
  // ======================================================================

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  template <typename PackTag>
  static void copy(const local_ordinal_type_1d_view &lids_,
                   const impl_scalar_type_1d_view &buffer_,
                   const local_ordinal_type ibeg_,
                   const local_ordinal_type iend_,
                   const impl_scalar_type_2d_view_tpetra &multivector_,
                   const local_ordinal_type blocksize_,
                   const execution_space &exec_instance_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize * ibeg_;

    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    local_ordinal_type vector_size(0);
    if (blocksize_ <= 4)
      vector_size = 4;
    else if (blocksize_ <= 8)
      vector_size = 8;
    else if (blocksize_ <= 16)
      vector_size = 16;
    else
      vector_size = 32;

    const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
    const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
    Kokkos::parallel_for(  //"AsyncableImport::TeamPolicy::copyViaCudaStream",
        Kokkos::Experimental::require(policy, work_item_property),
        KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
          const local_ordinal_type i = member.league_rank();
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
            auto aptr = abase + blocksize_ * (i + idiff * j);
            auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
            if (std::is_same<PackTag, ToBuffer>::value)
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                aptr[k] = bptr[k];
              });
            else
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                bptr[k] = aptr[k];
              });
          });
        });
  }

  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants and reallocate data buffers if necessary
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize * num_vectors;

    // 0. post receive async
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      } else {
        irecv(comm,
              reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
    }

    execution_space().fence();

    // 1. async memcpy
    for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
      // 1.0. enqueue pack buffer
      if (i < 8) exec_instances[i % 8].fence();
      copy<ToBuffer>(lids.send, buffer.send,
                     offset_host.send(i), offset_host.send(i + 1),
                     mv, blocksize,
                     // execution_space());
                     exec_instances[i % 8]);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        // if (i<8) exec_instances[i%8].fence();
        const local_ordinal_type num_vectors = mv.extent(1);
        const local_ordinal_type mv_blocksize = blocksize * num_vectors;

        Kokkos::deep_copy(exec_instances[i % 8],
                          Kokkos::subview(buffer_host.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                              offset_host.send(i) * mv_blocksize,
                                              offset_host.send(i + 1) * mv_blocksize)),
                          Kokkos::subview(buffer.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                              offset_host.send(i) * mv_blocksize,
                                              offset_host.send(i + 1) * mv_blocksize)));
      }
    }
    // execution_space().fence();
    for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
      // 1.1. sync the stream and isend
      if (i < 8) exec_instances[i % 8].fence();
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      } else {
        isend(comm,
              reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
    }

    // 2. poke communication
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif  // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void syncRecvVar1() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // 0. wait for receive async.
    for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.recv.extent(0)); ++i) {
      local_ordinal_type idx = i;

      // 0.0. wait any
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);

      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize * num_vectors;

        Kokkos::deep_copy(
            Kokkos::subview(buffer.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)),
            Kokkos::subview(buffer_host.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)));
      }

      // 0.1. unpack data after data is moved into a device
      copy<ToMultiVector>(lids.recv, buffer.recv,
                          offset_host.recv(idx), offset_host.recv(idx + 1),
                          remote_multivector, blocksize,
                          exec_instances[idx % 8]);
    }

    // 1. fire up all cuda events
    Kokkos::fence();

    // 2. cleanup all open comm
    waitall(reqs.send.size(), reqs.send.data());
#endif  // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
#endif  // defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)

  // ======================================================================
  // Generic version without using execution space instances
  // - the only difference between device and host architectures is the use
  //   of team or range policies.
  // ======================================================================
  template <typename PackTag>
  static void copy(const local_ordinal_type_1d_view &lids_,
                   const impl_scalar_type_1d_view &buffer_,
                   const local_ordinal_type &ibeg_,
                   const local_ordinal_type &iend_,
                   const impl_scalar_type_2d_view_tpetra &multivector_,
                   const local_ordinal_type blocksize_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize * ibeg_;
    if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      local_ordinal_type vector_size(0);
      if (blocksize_ <= 4)
        vector_size = 4;
      else if (blocksize_ <= 8)
        vector_size = 8;
      else if (blocksize_ <= 16)
        vector_size = 16;
      else
        vector_size = 32;
      const team_policy_type policy(idiff, 1, vector_size);
      Kokkos::parallel_for(
          "AsyncableImport::TeamPolicy::copy",
          policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
            const local_ordinal_type i = member.league_rank();
            Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
              auto aptr = abase + blocksize_ * (i + idiff * j);
              auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
              if (std::is_same<PackTag, ToBuffer>::value)
                Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                  aptr[k] = bptr[k];
                });
              else
                Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                  bptr[k] = aptr[k];
                });
            });
          });
    } else {
      const Kokkos::RangePolicy<execution_space> policy(0, idiff * num_vectors);
      Kokkos::parallel_for(
          "AsyncableImport::RangePolicy::copy",
          policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
            const local_ordinal_type i = ij % idiff;
            const local_ordinal_type j = ij / idiff;
            auto aptr = abase + blocksize_ * (i + idiff * j);
            auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
            auto from = std::is_same<PackTag, ToBuffer>::value ? bptr : aptr;
            auto to = std::is_same<PackTag, ToBuffer>::value ? aptr : bptr;
            memcpy(to, from, sizeof(impl_scalar_type) * blocksize_);
          });
    }
  }
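
  // Buffer layout sketch (illustrative, not part of the original source): for
  // a range of lids [ibeg, iend) with idiff = iend - ibeg, block size b, and
  // nvec vectors, the scalar for (lid index i, vector j, point k) lives at
  //   buffer[b * nvec * ibeg + b * (i + idiff * j) + k]
  // i.e. each per-PID segment is laid out vector-major, and every (i, j) pair
  // owns a contiguous run of b scalars; this is exactly what the aptr/bptr
  // arithmetic above indexes on both the pack and unpack paths.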

  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants and reallocate data buffers if necessary
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize * num_vectors;

    // receive async
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      } else {
        irecv(comm,
              reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
    }

    // send async
    for (local_ordinal_type i = 0, iend = pids.send.extent(0); i < iend; ++i) {
      copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i + 1),
                     mv, blocksize);
      Kokkos::fence();
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      } else {
        Kokkos::deep_copy(
            Kokkos::subview(buffer_host.send,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.send(i) * mv_blocksize,
                                offset_host.send(i + 1) * mv_blocksize)),
            Kokkos::subview(buffer.send,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.send(i) * mv_blocksize,
                                offset_host.send(i + 1) * mv_blocksize)));
        isend(comm,
              reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
    }

    // I find that issuing an Iprobe seems to nudge some MPIs into action,
    // which helps with overlapped comm/comp performance.
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void syncRecvVar0() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // receive async.
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      local_ordinal_type idx = i;
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize * num_vectors;
        Kokkos::deep_copy(
            Kokkos::subview(buffer.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)),
            Kokkos::subview(buffer_host.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)));
      }
      copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx + 1),
                          remote_multivector, blocksize);
    }
    // wait on the sends to match all Isends with a cleanup operation.
    waitall(reqs.send.size(), reqs.send.data());
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    asyncSendRecvVar1(mv);
#else
    asyncSendRecvVar0(mv);
#endif
#else
    asyncSendRecvVar0(mv);
#endif
  }
  void syncRecv() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    syncRecvVar1();
#else
    syncRecvVar0();
#endif
#else
    syncRecvVar0();
#endif
  }

  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
    asyncSendRecv(mv);
    syncRecv();
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
};
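
// Typical call sequence (a sketch under the design above, not a verbatim
// excerpt from the original source): the solver overlaps the halo exchange
// with local work by splitting the import into its two phases.
//
//   AsyncableImport<MatrixType> importer(src_map, tgt_map, blocksize, dm2cm);
//   importer.createDataBuffer(num_vectors);
//   importer.asyncSendRecv(x_owned);   // post Irecv/Isend, pack on streams
//   /* ... compute on owned rows while messages are in flight ... */
//   importer.syncRecv();               // wait, unpack into the remote MV
//   auto x_remote = importer.getRemoteMultiVectorLocalView();
//
// syncExchange(mv) performs both phases back to back when no overlap is wanted.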

template <typename ViewType1, typename ViewType2>
struct are_same_struct {
  ViewType1 keys1;
  ViewType2 keys2;

  are_same_struct(ViewType1 keys1_, ViewType2 keys2_)
      : keys1(keys1_)
      , keys2(keys2_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(int i, unsigned int &count) const {
    if (keys1(i) != keys2(i)) count++;
  }
};

template <typename ViewType1, typename ViewType2>
bool are_same(ViewType1 keys1, ViewType2 keys2) {
  unsigned int are_same_ = 0;

  Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
                          are_same_struct(keys1, keys2),
                          are_same_);
  return are_same_ == 0;
}

template <typename MatrixType>
Teuchos::RCP<AsyncableImport<MatrixType>>
createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
  using global_indices_array_device_type = Kokkos::View<const global_ordinal_type *, typename tpetra_map_type::device_type>;

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = !A_bcrs.is_null();

  // This is OK here to use the graph of the A_crs matrix and a block size of 1
  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());  // tpetra crs graph object

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto domain_map = g.getDomainMap();
  const auto column_map = g.getColMap();

  std::vector<global_ordinal_type> gids;

  Kokkos::Subview<global_indices_array_device_type, std::pair<int, int>> column_map_global_iD_last;

  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
  {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);

    global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
    global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();

    if (are_same(domain_map_global_iD, column_map_global_iD)) {
      // this should be the most likely path
      separate_remotes = true;
      need_owned_permutation = false;

      column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
                                                  std::pair<int, int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
    } else {
      // This loop is relatively expensive
      for (size_t i = 0; i < column_map->getLocalNumElements(); ++i) {
        const global_ordinal_type gid = column_map->getGlobalElement(i);
        if (!domain_map->isNodeGlobalElement(gid)) {
          found_first = true;
          gids.push_back(gid);
        } else if (found_first) {
          separate_remotes = false;
          break;
        }
        if (!found_first && !need_owned_permutation &&
            domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
          // The owned part of the domain and column maps are different
          // orderings. We *could* do a super efficient impl of this case in the
          // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
          // really, if a caller cares about speed, they wouldn't make different
          // local permutations like this. So we punt on the best impl and go for
          // a pretty good one: the permutation is done in place in
          // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
          // is the presumably worse memory access pattern of the input vector.
          need_owned_permutation = true;
        }
      }
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }
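
  // Illustrative example (not part of the original source): with domain map
  // GIDs {10, 11, 12} and column map GIDs {10, 11, 12, 40, 41}, all remotes
  // follow the owned block, so separate_remotes stays true and only {40, 41}
  // feed the parsimonious column map below. A column map such as
  // {10, 40, 11, 12, 41} interleaves a remote before an owned GID, which
  // forces separate_remotes = false and disables the async importer.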

  if (separate_remotes) {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
    const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
    const auto parsimonious_col_map =
        need_owned_permutation
            ? Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()))
            : Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
    if (parsimonious_col_map->getGlobalNumElements() > 0) {
      // make the importer only if needed.
      local_ordinal_type_1d_view dm2cm;
      if (need_owned_permutation) {
        dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
        const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
        for (size_t i = 0; i < domain_map->getLocalNumElements(); ++i)
          dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
        Kokkos::deep_copy(dm2cm, dm2cm_host);
      }
      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
      return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
    }
  }
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::null;
}

template <typename local_ordinal_type>
local_ordinal_type costTRSM(const local_ordinal_type block_size) {
  return block_size * block_size;
}

template <typename local_ordinal_type>
local_ordinal_type costGEMV(const local_ordinal_type block_size) {
  return 2 * block_size * block_size;
}

template <typename local_ordinal_type>
local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
  return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length - 1) * costGEMV(block_size);
}
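
// Worked example (illustrative, not part of the original source): for
// subline_length = 4 and block_size = 2, costTRSM = 4 and costGEMV = 8, so
// costTriDiagSolve = 2 * 4 * 4 + 2 * 3 * 8 = 80. The model charges two
// triangular solves per block row plus a GEMV per off-diagonal block across
// the forward and backward sweeps; the constants only need to be consistent
// relative to each other when comparing candidate splittings.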

template <typename local_ordinal_type>
local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
                                  const local_ordinal_type num_teams,
                                  const local_ordinal_type line_length,
                                  const local_ordinal_type block_size,
                                  const local_ordinal_type n_subparts_per_part) {
  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part - 1) * 2) / n_subparts_per_part);
  if (subline_length < 1) {
    return INT_MAX;
  }

  const local_ordinal_type p_n_lines = ceil(double(num_parts) / num_teams);
  const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part) * num_parts / num_teams);
  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part - 1) * num_parts / num_teams);

  const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
  const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part - 1) * 2, block_size);
  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length, block_size);
  const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);

  if (n_subparts_per_part == 1) {
    return p_costApplyAinv;
  }
  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
}

template <typename local_ordinal_type>
local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
                                         const local_ordinal_type num_teams,
                                         const local_ordinal_type line_length,
                                         const local_ordinal_type block_size) {
  // BMK: replaced theoretical model with empirical model
  // This is a linear regression based on data from a grid search.
  // The independent terms in the regression are:
  // - "parallelism surplus" - smaller when problem has enough lines to saturate GPU, larger otherwise
  // - log2 of the line length
  // - block size
  double parallelismSurplus = Kokkos::sqrt((double)num_teams / num_parts);
  double logLineLength = Kokkos::log2((double)line_length);
  (void)logLineLength;
  // Directly predict with linear model
#if defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU)
  // MI300-specific data
  double modeled = -9.2312 + 4.6946 * parallelismSurplus + 0.4095 * block_size + 0.966 * logLineLength;
  // Do not split lines if there is plenty of parallelism
  if (parallelismSurplus < 0.3)
    modeled = 1;
#elif defined(KOKKOS_ARCH_HOPPER) || defined(KOKKOS_ARCH_BLACKWELL)
  // Based on H100 data
  double modeled = -9.6053 + 4.7477 * parallelismSurplus + 0.2338 * block_size + 1.0794 * logLineLength;
  // On H100, performance degrades rapidly if small lines are split too many times
  double maxSplit = (double)line_length / 8;
  if (modeled > maxSplit)
    modeled = maxSplit;
#elif defined(KOKKOS_ENABLE_CUDA)
  // Based on V100 data, line splitting is profitable in fewer cases
  // (only when there are few, long lines)
  double modeled = 1;
  if (parallelismSurplus > 1 && line_length > 64)
    modeled = 4;
#elif defined(KOKKOS_ENABLE_HIP)
  // Based on MI250X data
  double modeled = -8.6214 + 7.3468 * parallelismSurplus + 0.3596 * block_size + 0.6673 * logLineLength;
#else
  // GPUs other than CUDA or HIP: default to simple model that works for V100
  double modeled = 1;
  if (parallelismSurplus > 1 && line_length > 64)
    modeled = 4;
#endif

  // Round to nearest integer
  local_ordinal_type n_subparts_per_part = 0.5 + modeled;
  // Do not split lines if there is plenty of parallelism available
  if (parallelismSurplus < 0.3)
    n_subparts_per_part = 1;
  // Clamp the result to valid range
  // Criteria for valid n_subparts_per_part (where connection_length is 2 for wide separators)
  // line_length >= n_subparts_per_part + (n_subparts_per_part - 1) * connection_length
  // Equivalently:
  // line_length >= n_subparts_per_part + n_subparts_per_part * 2 - 2
  // line_length >= 3 * n_subparts_per_part - 2
  local_ordinal_type min_subparts_per_part = 1;
  local_ordinal_type max_subparts_per_part = (line_length + 2) / 3;
  // Limit memory usage from too many sublines
  if (max_subparts_per_part > 16)
    max_subparts_per_part = 16;
  if (n_subparts_per_part < min_subparts_per_part)
    n_subparts_per_part = min_subparts_per_part;
  if (n_subparts_per_part > max_subparts_per_part)
    n_subparts_per_part = max_subparts_per_part;
  return n_subparts_per_part;
}
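
// Example evaluation (illustrative, not part of the original source): on an
// MI250X build with num_teams = 400, num_parts = 100, line_length = 128, and
// block_size = 5, parallelismSurplus = sqrt(400 / 100) = 2 and
// logLineLength = 7, so
//   modeled = -8.6214 + 7.3468 * 2 + 0.3596 * 5 + 0.6673 * 7 = 12.54,
// which rounds to 13 and survives the clamp (the cap is
// min((128 + 2) / 3, 16) = 16), so each 128-row line is split into 13
// sublines separated by 2-row Schur connections.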

template <typename ArgActiveExecutionMemorySpace>
struct SolveTridiagsDefaultModeAndAlgo;

template <typename MatrixType>
BlockHelperDetails::PartInterface<MatrixType>
createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                    const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                    const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type>> &partitions,
                    const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
  IFPACK2_BLOCKHELPER_TIMER("createPartInterface", createPartInterface);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
  using size_type = typename impl_type::size_type;

  auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);

  TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
  const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
  constexpr int vector_length = impl_type::vector_length;
  constexpr int internal_vector_length = impl_type::internal_vector_length;

  const auto comm = A->getRowMap()->getComm();

  BlockHelperDetails::PartInterface<MatrixType> interf;

  const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
  const bool jacobi = partitions.size() == 0 || partitions.size() == A_n_lclrows;
  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();

  typedef std::pair<local_ordinal_type, local_ordinal_type> size_idx_pair_type;
  std::vector<size_idx_pair_type> partsz(nparts);

  if (!jacobi) {
    for (local_ordinal_type i = 0; i < nparts; ++i)
      partsz[i] = size_idx_pair_type(partitions[i].size(), i);
    std::sort(partsz.begin(), partsz.end(),
              [](const size_idx_pair_type &x, const size_idx_pair_type &y) {
                return x.first > y.first;
              });
  }

  local_ordinal_type n_subparts_per_part;
  if (jacobi) {
    n_subparts_per_part = 1;
  } else {
    if (n_subparts_per_part_in == -1) {
      // If the number of subparts is set to -1, the user lets the algorithm
      // decide the value automatically.
      using execution_space = typename impl_type::execution_space;

      // Line splitting only benefits GPUs
      if constexpr (impl_type::node_type::is_gpu) {
        const int line_length = partsz[0].first;

        const local_ordinal_type team_size =
            SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
                recommended_team_size(blocksize, vector_length, internal_vector_length);

        const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
        n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
#endif
      } else {
        n_subparts_per_part = 1;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("Automatically chosen n_subparts_per_part = 1 for CPU backend\n");
#endif
      }
    } else {
      n_subparts_per_part = n_subparts_per_part_in;
    }
  }

  // Total number of sub lines:
  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
  // Total number of sub lines + the Schur complement blocks.
  // For a given line, 2 sub lines imply one Schur complement, 3 sub lines imply two Schur complements, etc.
  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part - 1);

#if defined(BLOCKTRIDICONTAINER_DEBUG)
  local_ordinal_type nrows = 0;
  if (jacobi)
    nrows = nparts;
  else
    for (local_ordinal_type i = 0; i < nparts; ++i) nrows += partitions[i].size();

  TEUCHOS_TEST_FOR_EXCEPT_MSG(nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
                                                                                             << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
#endif

  // permutation vector
  std::vector<local_ordinal_type> p;
  if (jacobi) {
    interf.max_partsz = 1;
    interf.max_subpartsz = 0;
    interf.n_subparts_per_part = 1;
    interf.nparts = nparts;
  } else {
    // reorder parts to maximize simd packing efficiency
    p.resize(nparts);

    for (local_ordinal_type i = 0; i < nparts; ++i)
      p[i] = partsz[i].second;

    interf.max_partsz = partsz[0].first;

    constexpr local_ordinal_type connection_length = 2;
    const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
    const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

    interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
    interf.n_subparts_per_part = n_subparts_per_part;
    interf.nparts = nparts;
  }
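
  // Splitting arithmetic example (illustrative, not part of the original
  // source): for max_partsz = 20, n_subparts_per_part = 3, and
  // connection_length = 2, sub_line_length = (20 - 2 * 2) / 3 = 5 and
  // last_sub_line_length = 20 - 2 * (2 + 5) = 6, so the longest line
  // decomposes as 5 + 2 + 5 + 2 + 6 rows: three sublines separated by two
  // 2-row Schur connections, giving max_subpartsz = 6.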

  // allocate parts
  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);

  // mirror to host and compute on host execution space
  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);

  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);

  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);

  // Determine parts.
  interf.row_contiguous = true;
  partptr(0) = 0;
  part2rowidx0(0) = 0;
  part2packrowidx0(0) = 0;
  local_ordinal_type pack_nrows = 0;
  local_ordinal_type pack_nrows_sub = 0;
  if (jacobi) {
    IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
    // Jacobi (all lines have length 1) means that A_n_lclrows == nparts,
    // so the mapping between parts and rows is trivial.
    // Note: we can leave interf.row_contiguous = true, since for all i: lclrow(i) == i
    for (local_ordinal_type i = 0; i <= nparts; ++i) {
      part2rowidx0(i) = i;
      partptr(i) = i;
    }
    for (local_ordinal_type i = 0; i < nparts; ++i) {
      rowidx2part(i) = i;
      lclrow(i) = i;
    }
    for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
      // assume no overlap
      if (ip % vector_length == 0) pack_nrows = 1;
      part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
    }
    part2rowidx0_sub(0) = 0;
    partptr_sub(0, 0) = 0;

    for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
      constexpr local_ordinal_type ipnrows = 1;
      const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);

      TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
                                 "In the part " << ip << ", the line length " << full_line_length << " does not match the expected length " << ipnrows << ".");

      constexpr local_ordinal_type connection_length = 2;

      if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
        TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
                                   "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");

1282  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1283  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1284 
1285  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1286 
1287  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1288  const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1289  const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1290  if (local_sub_ip != n_subparts_per_part - 1) {
1291  if (local_sub_ip != 0) {
1292  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1293  } else if (ip != 0) {
1294  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1295  }
1296  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1297  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1298  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1299 
1300  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1301  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1302 
1303 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1304  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1305  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1306 #endif
1307  } else {
1308  if (local_sub_ip != 0) {
1309  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1310  } else if (ip != 0) {
1311  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1312  }
1313  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1314 
1315  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1316 
1317 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1318  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1319 #endif
1320  }
1321  }
1322  }
1323 
1324 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1325  std::cout << "partptr_sub = " << std::endl;
1326  for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1327  for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1328  std::cout << partptr_sub(i, j) << " ";
1329  }
1330  std::cout << std::endl;
1331  }
1332  std::cout << "partptr_sub end" << std::endl;
1333 #endif
1334 
1335  {
1336  local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1337 
1338  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1339  for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1340  part2packrowidx0_sub(ip, 0) = 0;
1341  }
1342  for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1343  if (ipack != 0) {
1344  local_ordinal_type ip_min = ipack * vector_length;
1345  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1346  for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1347  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1348  }
1349  }
1350 
1351  for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1352  local_ordinal_type ip_min = ipack * vector_length;
1353  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1354 
1355  const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1356 
1357  constexpr local_ordinal_type connection_length = 2;
1358 
1359  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1360  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1361 
1362  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1363  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1364  if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1365 
1366  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1367 
1368  for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1369  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1370  }
1371  }
1372  }
1373 
1374  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1375  }
1376  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1377  } else {
1378  IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
1379  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1380  const auto *part = &partitions[p[ip]];
1381  const local_ordinal_type ipnrows = part->size();
1382  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip - 1]].size())));
1383  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1384  BlockHelperDetails::get_msg_prefix(comm)
1385  << "partition " << p[ip]
1386  << " is empty, which is not allowed.");
1387  // Assume no overlap.
1388  part2rowidx0(ip + 1) = part2rowidx0(ip) + ipnrows;
1389  // Since parts are ordered in decreasing size, the size of the first
1390  // part in a pack is the size for all parts in the pack.
1391  if (ip % vector_length == 0) pack_nrows = ipnrows;
1392  part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1393  const local_ordinal_type offset = partptr(ip);
1394  for (local_ordinal_type i = 0; i < ipnrows; ++i) {
1395  const auto lcl_row = (*part)[i];
1396  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1397  BlockHelperDetails::get_msg_prefix(comm)
1398  << "partitions[" << p[ip] << "]["
1399  << i << "] = " << lcl_row
1400  << " but input matrix implies limits of [0, " << A_n_lclrows - 1
1401  << "].");
1402  lclrow(offset + i) = lcl_row;
1403  rowidx2part(offset + i) = ip;
1404  if (interf.row_contiguous && offset + i > 0 && lclrow((offset + i) - 1) + 1 != lcl_row)
1405  interf.row_contiguous = false;
1406  }
1407  partptr(ip + 1) = offset + ipnrows;
1408 
1409 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1410  printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1411  printf("partptr(%d+1) = %d\n", ip, partptr(ip + 1));
1412 #endif
1413  }
1414 
1415  part2rowidx0_sub(0) = 0;
1416  partptr_sub(0, 0) = 0;
1417  // const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1418 
1419  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1420  const auto *part = &partitions[p[ip]];
1421  const local_ordinal_type ipnrows = part->size();
1422  const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1423 
1424  TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1425  "In the part " << ip);
1426 
1427  constexpr local_ordinal_type connection_length = 2;
1428 
1429  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1430  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
1431  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1432 
1433  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1434  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1435 
1436  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1437 
1438  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1439  const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1440  const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1441  if (local_sub_ip != n_subparts_per_part - 1) {
1442  if (local_sub_ip != 0) {
1443  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1444  } else if (ip != 0) {
1445  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1446  }
1447  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1448  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1449  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1450 
1451  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1452  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1453 
1454 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1455  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1456  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1457 #endif
1458  } else {
1459  if (local_sub_ip != 0) {
1460  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1461  } else if (ip != 0) {
1462  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1463  }
1464  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1465 
1466  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1467 
1468 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1469  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1470 #endif
1471  }
1472  }
1473  }
1474 
1475  {
1476  local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1477 
1478  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1479  for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1480  part2packrowidx0_sub(ip, 0) = 0;
1481  }
1482  for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1483  if (ipack != 0) {
1484  local_ordinal_type ip_min = ipack * vector_length;
1485  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1486  for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1487  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1488  }
1489  }
1490 
1491  for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1492  local_ordinal_type ip_min = ipack * vector_length;
1493  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1494 
1495  const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1496 
1497  constexpr local_ordinal_type connection_length = 2;
1498 
1499  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1500  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1501 
1502  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1503  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1504  if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1505 
1506  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1507 
1508  for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1509  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1510  }
1511  }
1512  }
1513 
1514  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1515  }
1516  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1517  }
1518 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1519  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1520 #endif
1521  if (lclrow(0) != 0) interf.row_contiguous = false;
1522 
1523  Kokkos::deep_copy(interf.partptr, partptr);
1524  Kokkos::deep_copy(interf.lclrow, lclrow);
1525 
1526  Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1527 
1528  // Assume no overlap. Thus:
1529  interf.part2rowidx0 = interf.partptr;
1530  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1531 
1532  interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1533  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1534 
1535  { // Fill packptr.
1536  IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
1537  local_ordinal_type npacks = 0;
1539  for (local_ordinal_type ip = 1; ip <= nparts; ++ip) // n_sub_parts_and_schur
1540  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1541  ++npacks;
1542 
1543  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1544  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1545  packptr(0) = 0;
1546  for (local_ordinal_type ip = 1, k = 1; ip <= nparts; ++ip)
1547  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1548  packptr(k++) = ip;
1549 
1550  Kokkos::deep_copy(interf.packptr, packptr);
1551 
1552  local_ordinal_type npacks_per_subpart = ceil(float(nparts) / vector_length);
1553  npacks = npacks_per_subpart * (part2packrowidx0_sub.extent(1) - 1);
1554 
1555  interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart * n_subparts_per_part);
1556  interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart, n_subparts_per_part - 1);
1557 
1558  const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1559  const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1560 
1561  // Fill packindices_sub and packindices_schur
1562  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part - 1; ++local_sub_ip) {
1563  for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1564  packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1565  packindices_schur(local_pack_ip, local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1566  }
1567  }
1568 
1569  for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1570  packindices_sub((n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip;
1571  }
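 // The loops above alternate sub-line and Schur packs in groups of
 // npacks_per_subpart: e.g., npacks_per_subpart = 2 with
 // n_subparts_per_part = 3 gives packindices_sub = [0, 1, 4, 5, 8, 9]
 // and packindices_schur = [[2, 6], [3, 7]].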
1572 
1573 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1574  std::cout << "packindices_sub = " << std::endl;
1575  for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1576  std::cout << packindices_sub(i) << " ";
1577  }
1578  std::cout << std::endl;
1579  std::cout << "packindices_sub end" << std::endl;
1580 
1581  std::cout << "packindices_schur = " << std::endl;
1582  for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1583  for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1584  std::cout << packindices_schur(i, j) << " ";
1585  }
1586  std::cout << std::endl;
1587  }
1588 
1589  std::cout << "packindices_schur end" << std::endl;
1590 #endif
1591 
1592  Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1593  Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1594 
1595  interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1596  const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1597  packptr_sub(0) = 0;
1598  for (local_ordinal_type k = 0; k < npacks + 1; ++k)
1599  packptr_sub(k) = packptr(k % npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
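 // For example, npacks_per_subpart = 2 with packptr = [0, 4, 8] yields
 // packptr_sub = [0, 4, 8, 12, 16, 20, ...]: the pack boundaries are
 // replicated with a stride of packptr(npacks_per_subpart) per sub-block.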
1600 
1601  Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1602  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1603  }
1604  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1605 
1606  return interf;
1607 }
1608 
1612 template <typename MatrixType>
1613 struct BlockTridiags {
1614  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1615  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1616  using size_type_1d_view = typename impl_type::size_type_1d_view;
1617  using size_type_2d_view = typename impl_type::size_type_2d_view;
1618  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1619  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1620  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1621 
1622  // flat_td_ptr(i) is the index into flat-array values of the start of the
1623  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1624  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1625  // vector_length is the position in the pack.
1626  size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1627  // List of local column indices into A from which to grab
1628  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1629  local_ordinal_type_1d_view A_colindsub;
1630  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1631  // tridiag's pack, and i % vector_length gives the position in the pack.
1632  vector_type_3d_view values;
1633  // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1634  // Schur's pack, and i % vector_length gives the position in the pack.
1635  vector_type_3d_view values_schur;
1636  // inv(A_00)*A_01 block values.
1637  vector_type_4d_view e_values;
1638 
1639  // The following are for fused block Jacobi only.
1640  // For block row i, diag_offsets(i) ... diag_offsets(i) + bs^2
1641  // is the range of scalars for the diagonal block.
1642  size_type_1d_view diag_offsets;
1643  // For fused residual+solve block Jacobi case,
1644  // this contains the diagonal block inverses in flat, local row indexing:
1645  // d_inv(row, :, :) gives the row-major block for row.
1646  btdm_scalar_type_3d_view d_inv;
1647 
1648  bool is_diagonal_only;
1649 
1650  BlockTridiags() = default;
1651  BlockTridiags(const BlockTridiags &b) = default;
1652 
1653  // Index into row-major block of a tridiag.
1654  template <typename idx_type>
1655  static KOKKOS_FORCEINLINE_FUNCTION
1656  idx_type
1657  IndexToRow(const idx_type &ind) { return (ind + 1) / 3; }
1658  // Given a row of a row-major tridiag, return the index of the first block
1659  // in that row.
1660  template <typename idx_type>
1661  static KOKKOS_FORCEINLINE_FUNCTION
1662  idx_type
1663  RowToIndex(const idx_type &row) { return row > 0 ? 3 * row - 1 : 0; }
1664  // Number of blocks in a tridiag having a given number of rows.
1665  template <typename idx_type>
1666  static KOKKOS_FORCEINLINE_FUNCTION
1667  idx_type
1668  NumBlocks(const idx_type &nrows) { return nrows > 0 ? 3 * nrows - 2 : 0; }
1669  // Number of blocks associated to a Schur complement having a given number of rows.
1670  template <typename idx_type>
1671  static KOKKOS_FORCEINLINE_FUNCTION
1672  idx_type
1673  NumBlocksSchur(const idx_type &nrows) { return nrows > 0 ? 3 * nrows + 2 : 0; }
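 // Example: a tridiag with nrows = 4 stores NumBlocks(4) = 10 blocks
 // row-major as [D0 U0 | L1 D1 U1 | L2 D2 U2 | L3 D3], so
 // RowToIndex(2) = 5 and IndexToRow(5) = 2.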
1674 };
1675 
1679 template <typename MatrixType>
1680 BlockTridiags<MatrixType>
1681 createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1682  IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
1683  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1684  using execution_space = typename impl_type::execution_space;
1685  using local_ordinal_type = typename impl_type::local_ordinal_type;
1686  using size_type = typename impl_type::size_type;
1687  using size_type_2d_view = typename impl_type::size_type_2d_view;
1688 
1689  constexpr int vector_length = impl_type::vector_length;
1690 
1691  BlockTridiags<MatrixType> btdm;
1692 
1693  const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1694 
1695  { // construct the flat index pointers into the tridiag values array.
1696  btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1697  const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part);
1698  Kokkos::parallel_scan(
1699  "createBlockTridiags::RangePolicy::flat_td_ptr",
1700  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1701  const local_ordinal_type partidx = i / (2 * interf.n_subparts_per_part);
1702  const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1703 
1704  if (final) {
1705  btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1706  }
1707  if (local_subpartidx != (2 * interf.n_subparts_per_part - 1)) {
1708  const local_ordinal_type nrows = interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 1) - interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 0);
1709  if (local_subpartidx % 2 == 0)
1710  update += btdm.NumBlocks(nrows);
1711  else
1712  update += btdm.NumBlocksSchur(nrows);
1713  }
1714  });
1715 
1716  const auto nblocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts - 1, 2 * interf.n_subparts_per_part - 1));
1717  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1718  }
1719 
1720  // And the packed index pointers.
1721  if (vector_length == 1) {
1722  btdm.pack_td_ptr = btdm.flat_td_ptr;
1723  } else {
1724  // const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1725 
1726  local_ordinal_type npacks_per_subpart = 0;
1727  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1728  Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1729  for (local_ordinal_type ip = 1; ip <= interf.nparts; ++ip) // n_sub_parts_and_schur
1730  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1731  ++npacks_per_subpart;
1732 
1733  btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1734  const Kokkos::RangePolicy<execution_space> policy(0, npacks_per_subpart);
1735 
1736  Kokkos::parallel_for(
1737  "createBlockTridiags::RangePolicy::pack_td_ptr",
1738  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1739  for (local_ordinal_type j = 0; j < 2 * interf.n_subparts_per_part; ++j) {
1740  const local_ordinal_type pack_id = (j == 2 * interf.n_subparts_per_part - 1) ? i + (j - 1) * npacks_per_subpart : i + j * npacks_per_subpart;
1741  const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id + 1) - interf.packptr_sub(pack_id);
1742 
1743  const local_ordinal_type parti = interf.packptr_sub(pack_id);
1744  const local_ordinal_type partidx = parti % interf.nparts;
1745 
1746  for (local_ordinal_type pti = 0; pti < nparts_in_pack; ++pti) {
1747  btdm.pack_td_ptr(partidx + pti, j) = btdm.flat_td_ptr(i, j);
1748  }
1749  }
1750  });
1751  }
1752 
1753  btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1754 
1755  const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1756  constexpr local_ordinal_type connection_length = 2;
1757 
1758  host_pack_td_ptr_schur(0, 0) = 0;
1759  for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1760  if (i % vector_length == 0) {
1761  if (i != 0)
1762  host_pack_td_ptr_schur(i, 0) = host_pack_td_ptr_schur(i - 1, host_pack_td_ptr_schur.extent(1) - 1);
1763  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part - 1; ++j) {
1764  host_pack_td_ptr_schur(i, j + 1) = host_pack_td_ptr_schur(i, j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part - 2 ? 1 : 0);
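 // Each Schur step spans NumBlocks(2) = 4 blocks plus one coupling block
 // per interior side; e.g., with n_subparts_per_part = 3 the strides are
 // 4 + 0 + 1 = 5 for j = 0 and 4 + 1 + 0 = 5 for j = 1.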
1765  }
1766  } else {
1767  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1768  host_pack_td_ptr_schur(i, j) = host_pack_td_ptr_schur(i - 1, j);
1769  }
1770  }
1771  }
1772 
1773  Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1774 
1775 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1776  const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1777  std::cout << "flat_td_ptr = " << std::endl;
1778  for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1779  for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1780  std::cout << host_flat_td_ptr(i, j) << " ";
1781  }
1782  std::cout << std::endl;
1783  }
1784  std::cout << "flat_td_ptr end" << std::endl;
1785 
1786  const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1787 
1788  std::cout << "pack_td_ptr = " << std::endl;
1789  for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1790  for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1791  std::cout << host_pack_td_ptr(i, j) << " ";
1792  }
1793  std::cout << std::endl;
1794  }
1795  std::cout << "pack_td_ptr end" << std::endl;
1796 
1797  std::cout << "pack_td_ptr_schur = " << std::endl;
1798  for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1799  for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1800  std::cout << host_pack_td_ptr_schur(i, j) << " ";
1801  }
1802  std::cout << std::endl;
1803  }
1804  std::cout << "pack_td_ptr_schur end" << std::endl;
1805 #endif
1806 
1807  // values and A_colindsub are created in the symbolic phase
1808  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1809 
1810  return btdm;
1811 }
1812 
1813 // Set the tridiags to be I to the full pack block size. That way, if a
1814 // tridiag within a pack is shorter than the longest one, the extra blocks are
1815 // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1816  // in the packed multivector are 0, and the tridiag LU reflects the extra I
1817 // blocks, then the solve proceeds as though the extra blocks aren't
1818 // present. Since this extra work is part of the SIMD calls, it's not actually
1819 // extra work. Instead, it means we don't have to put checks or masks in, or
1820 // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1821 // since the numeric phase fills in only the used entries, leaving these I
1822 // blocks intact.
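// For example, if a pack holds parts with 5, 5, 4, and 4 rows, the two
// short tridiags are padded with one identity diagonal block each; with the
// corresponding packed-multivector rows left at 0, the LU solve returns 0
// for the padding and the real rows are unaffected.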
1823 template <typename MatrixType>
1824 void setTridiagsToIdentity(const BlockTridiags<MatrixType> &btdm,
1825  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
1826  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1827  using execution_space = typename impl_type::execution_space;
1828  using local_ordinal_type = typename impl_type::local_ordinal_type;
1829  using size_type_2d_view = typename impl_type::size_type_2d_view;
1830 
1831  const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1832  const local_ordinal_type blocksize = btdm.values.extent(1);
1833 
1834  {
1835  const int vector_length = impl_type::vector_length;
1836  const int internal_vector_length = impl_type::internal_vector_length;
1837 
1838  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1839  using internal_vector_type = typename impl_type::internal_vector_type;
1840  using internal_vector_type_4d_view =
1841  typename impl_type::internal_vector_type_4d_view;
1842 
1843  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1844  const internal_vector_type_4d_view values(reinterpret_cast<internal_vector_type *>(btdm.values.data()),
1845  btdm.values.extent(0),
1846  btdm.values.extent(1),
1847  btdm.values.extent(2),
1848  vector_length / internal_vector_length);
1849  const local_ordinal_type vector_loop_size = values.extent(3);
1850 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1851  local_ordinal_type total_team_size(0);
1852  if (blocksize <= 5)
1853  total_team_size = 32;
1854  else if (blocksize <= 9)
1855  total_team_size = 64;
1856  else if (blocksize <= 12)
1857  total_team_size = 96;
1858  else if (blocksize <= 16)
1859  total_team_size = 128;
1860  else if (blocksize <= 20)
1861  total_team_size = 160;
1862  else
1863  total_team_size = 160;
1864  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1865  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1866 #elif defined(KOKKOS_ENABLE_HIP)
1867  // FIXME: HIP
1868  // These settings might be completely wrong
1869  // will have to do some experiments to decide
1870  // what makes sense on AMD GPUs
1871  local_ordinal_type total_team_size(0);
1872  if (blocksize <= 5)
1873  total_team_size = 32;
1874  else if (blocksize <= 9)
1875  total_team_size = 64;
1876  else if (blocksize <= 12)
1877  total_team_size = 96;
1878  else if (blocksize <= 16)
1879  total_team_size = 128;
1880  else if (blocksize <= 20)
1881  total_team_size = 160;
1882  else
1883  total_team_size = 160;
1884  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1885  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1886 #elif defined(KOKKOS_ENABLE_SYCL)
1887  // SYCL: FIXME
1888  local_ordinal_type total_team_size(0);
1889  if (blocksize <= 5)
1890  total_team_size = 32;
1891  else if (blocksize <= 9)
1892  total_team_size = 64;
1893  else if (blocksize <= 12)
1894  total_team_size = 96;
1895  else if (blocksize <= 16)
1896  total_team_size = 128;
1897  else if (blocksize <= 20)
1898  total_team_size = 160;
1899  else
1900  total_team_size = 160;
1901  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1902  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1903 #else
1904  // Host architecture: team size is always one
1905  const team_policy_type policy(packptr.extent(0) - 1, 1, 1);
1906 #endif
1907  Kokkos::parallel_for(
1908  "setTridiagsToIdentity::TeamPolicy",
1909  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1910  const local_ordinal_type k = member.league_rank();
1911  const local_ordinal_type ibeg = pack_td_ptr(packptr(k), 0);
1912  const local_ordinal_type iend = pack_td_ptr(packptr(k), pack_td_ptr.extent(1) - 1);
1913 
1914  const local_ordinal_type diff = iend - ibeg;
1915  const local_ordinal_type icount = diff / 3 + (diff % 3 > 0);
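 // diff counts this pack's tridiag blocks; a diagonal block sits at every
 // third entry (D U | L D U | ... | L D), so, e.g., a 4-row tridiag has
 // diff = 10 and icount = 10/3 + 1 = 4 diagonal blocks to set to I.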
1916  const btdm_scalar_type one(1);
1917  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
1918  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, icount), [&](const local_ordinal_type &ii) {
1919  const local_ordinal_type i = ibeg + ii * 3;
1920  for (local_ordinal_type j = 0; j < blocksize; ++j) {
1921  values(i, j, j, v) = one;
1922  }
1923  });
1924  });
1925  });
1926  }
1927 }
1928 
1932 template <typename MatrixType>
1933 void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1934  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1935  const BlockHelperDetails::PartInterface<MatrixType> &interf,
1936  BlockTridiags<MatrixType> &btdm,
1937  BlockHelperDetails::AmD<MatrixType> &amd,
1938  const bool overlap_communication_and_computation,
1939  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
1940  bool useSeqMethod,
1941  bool use_fused_jacobi) {
1942  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);
1943 
1944  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1945 
1946  using execution_space = typename impl_type::execution_space;
1947  using host_execution_space = typename impl_type::host_execution_space;
1948 
1949  using local_ordinal_type = typename impl_type::local_ordinal_type;
1950  using global_ordinal_type = typename impl_type::global_ordinal_type;
1951  using size_type = typename impl_type::size_type;
1952  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1953  using size_type_1d_view = typename impl_type::size_type_1d_view;
1954  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1955  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1956  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
1957  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1958  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1959 
1960  constexpr int vector_length = impl_type::vector_length;
1961 
1962  const auto comm = A->getRowMap()->getComm();
1963 
1964  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1965  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1966 
1967  bool hasBlockCrsMatrix = !A_bcrs.is_null();
1968  TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1969  const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows();
1970 
1971  // mirroring to host
1972  const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr);
1973  const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow);
1974  const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part);
1975  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1976  const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr);
1977 
1978  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1979 
1980  Kokkos::View<local_ordinal_type *, host_execution_space> col2row("col2row", A->getLocalNumCols());
1981 
1982  // find column to row map on host
1983 
1984  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1985  {
1986  const auto rowmap = g->getRowMap();
1987  const auto colmap = g->getColMap();
1988  const auto dommap = g->getDomainMap();
1989  TEUCHOS_ASSERT(!(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1990  rowmap->lazyPushToHost();
1991  colmap->lazyPushToHost();
1992  dommap->lazyPushToHost();
1993 
1994 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1995  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
1996  Kokkos::parallel_for(
1997  "performSymbolicPhase::RangePolicy::col2row",
1998  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1999  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
2001  if (dommap->isNodeGlobalElement(gid)) {
2002  const local_ordinal_type lc = colmap->getLocalElement(gid);
2003 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2004  TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
2005  BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
2006  << " gives an invalid local column.");
2007 #endif
2008  col2row(lc) = lr;
2009  }
2010  });
2011 #endif
2012  }
2013 
2014  // construct the D and R graphs in A = D + R.
2015  {
2016  const auto local_graph = g->getLocalGraphHost();
2017  const auto local_graph_rowptr = local_graph.row_map;
2018  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
2019  const auto local_graph_colidx = local_graph.entries;
2020 
2021  // assume no overlap.
2022 
2023  Kokkos::View<local_ordinal_type *, host_execution_space> lclrow2idx("lclrow2idx", nrows);
2024  {
2025  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2026  Kokkos::parallel_for(
2027  "performSymbolicPhase::RangePolicy::lclrow2idx",
2028  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
2029  lclrow2idx[lclrow(i)] = i;
2030  });
2031  }
2032 
2033  // count (block) nnzs in D and R.
2034  typedef BlockHelperDetails::SumReducer<size_type, 3, host_execution_space> sum_reducer_type;
2035  typename sum_reducer_type::value_type sum_reducer_value;
2036  {
2037  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2038  Kokkos::parallel_reduce
2039  // profiling interface does not work
2040  ( //"performSymbolicPhase::RangePolicy::count_nnz",
2041  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
2042  // LID -> index.
2043  const local_ordinal_type ri0 = lclrow2idx[lr];
2044  const local_ordinal_type pi0 = rowidx2part(ri0);
2045  for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) {
2046  const local_ordinal_type lc = local_graph_colidx(j);
2047  const local_ordinal_type lc2r = col2row[lc];
2048  bool incr_R = false;
2049  do { // breakable
2050  if (lc2r == (local_ordinal_type)-1) {
2051  incr_R = true;
2052  break;
2053  }
2054  const local_ordinal_type ri = lclrow2idx[lc2r];
2055  const local_ordinal_type pi = rowidx2part(ri);
2056  if (pi != pi0) {
2057  incr_R = true;
2058  break;
2059  }
2060  // Test for being in the tridiag. This is done in index space. In
2061  // LID space, tridiag LIDs in a row are not necessarily related by
2062  // {-1, 0, 1}.
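 // The test below is equivalent to |ri - ri0| <= 1, i.e., the block lies
 // on the sub-, main-, or super-diagonal of this part's tridiag.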
2063  if (ri0 + 1 >= ri && ri0 <= ri + 1)
2064  ++update.v[0]; // D_nnz
2065  else
2066  incr_R = true;
2067  } while (0);
2068  if (incr_R) {
2069  if (lc < nrows)
2070  ++update.v[1]; // R_nnz_owned
2071  else
2072  ++update.v[2]; // R_nnz_remote
2073  }
2074  }
2075  },
2076  sum_reducer_type(sum_reducer_value));
2077  }
2078  size_type D_nnz = sum_reducer_value.v[0];
2079  size_type R_nnz_owned = sum_reducer_value.v[1];
2080  size_type R_nnz_remote = sum_reducer_value.v[2];
2081 
2082  if (!overlap_communication_and_computation) {
2083  R_nnz_owned += R_nnz_remote;
2084  R_nnz_remote = 0;
2085  }
2086 
2087  // construct the D_00 graph.
2088  {
2089  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2090 
2091  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
2092  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2093 
2094 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2095  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2096 #endif
2097 
2098  const local_ordinal_type nparts = partptr.extent(0) - 1;
2099 
2100  {
2101  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2102  Kokkos::parallel_for(
2103  "performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2104  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
2105  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2106  local_ordinal_type offset = 0;
2107  for (local_ordinal_type ri0 = partptr(pi0); ri0 < partptr(pi0 + 1); ++ri0) {
2108  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2109  offset = 1;
2110  const local_ordinal_type lr0 = lclrow(ri0);
2111  const size_type j0 = local_graph_rowptr(lr0);
2112  for (size_type j = j0; j < local_graph_rowptr(lr0 + 1); ++j) {
2113  const local_ordinal_type lc = local_graph_colidx(j);
2114  const local_ordinal_type lc2r = col2row[lc];
2115  if (lc2r == (local_ordinal_type)-1) continue;
2116  const local_ordinal_type ri = lclrow2idx[lc2r];
2117  const local_ordinal_type pi = rowidx2part(ri);
2118  if (pi != pi0) continue;
2119  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
2120  const local_ordinal_type row_entry = j - j0;
2121  D_A_colindsub(flat_td_ptr(pi0, 0) + ((td_row_os + ri) - ri0)) = row_entry;
2122  }
2123  }
2124  });
2125  }
2126 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2127  for (size_t i = 0; i < D_A_colindsub.extent(0); ++i)
2128  TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2129 #endif
2130  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2131 
2132  // Allocate values.
2133  {
2134  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0) - 1, btdm.pack_td_ptr.extent(1) - 1);
2135  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2136  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2137 
2138  if (interf.n_subparts_per_part > 1) {
2139  const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0) - 1, btdm.pack_td_ptr_schur.extent(1) - 1);
2140  const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2141  btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2142  }
2143 
2144  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2145  }
2146  }
2147 
2148  // Construct the R graph.
2149  {
2150  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2151  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2152 
2153  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2154  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2155 
2156  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2157  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2158 
2159  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2160  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2161 
2162  {
2163  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2164  Kokkos::parallel_for(
2165  "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2166  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2167  const local_ordinal_type ri0 = lclrow2idx[lr];
2168  const local_ordinal_type pi0 = rowidx2part(ri0);
2169  const size_type j0 = local_graph_rowptr(lr);
2170  for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2171  const local_ordinal_type lc = local_graph_colidx(j);
2172  const local_ordinal_type lc2r = col2row[lc];
2173  if (lc2r != (local_ordinal_type)-1) {
2174  const local_ordinal_type ri = lclrow2idx[lc2r];
2175  const local_ordinal_type pi = rowidx2part(ri);
2176  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2177  continue;
2178  }
2179  }
2180  // exclusive scan will be performed later
2181  if (!overlap_communication_and_computation || lc < nrows) {
2182  ++R_rowptr(lr);
2183  } else {
2184  ++R_rowptr_remote(lr);
2185  }
2186  }
2187  });
2188  }
2189 
2190  // exclusive scan
2191  typedef BlockHelperDetails::ArrayValueType<size_type, 2> update_type;
2192  {
2193  Kokkos::RangePolicy<host_execution_space> policy(0, nrows + 1);
2194  Kokkos::parallel_scan(
2195  "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2196  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, update_type &update, const bool &final) {
2197  update_type val;
2198  val.v[0] = R_rowptr(lr);
2199  if (overlap_communication_and_computation)
2200  val.v[1] = R_rowptr_remote(lr);
2201 
2202  if (final) {
2203  R_rowptr(lr) = update.v[0];
2204  if (overlap_communication_and_computation)
2205  R_rowptr_remote(lr) = update.v[1];
2206 
2207  if (lr < nrows) {
2208  const local_ordinal_type ri0 = lclrow2idx[lr];
2209  const local_ordinal_type pi0 = rowidx2part(ri0);
2210 
2211  size_type cnt_rowptr = R_rowptr(lr);
2212  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
2213 
2214  const size_type j0 = local_graph_rowptr(lr);
2215  for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2216  const local_ordinal_type lc = local_graph_colidx(j);
2217  const local_ordinal_type lc2r = col2row[lc];
2218  if (lc2r != (local_ordinal_type)-1) {
2219  const local_ordinal_type ri = lclrow2idx[lc2r];
2220  const local_ordinal_type pi = rowidx2part(ri);
2221  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2222  continue;
2223  }
2224  const local_ordinal_type row_entry = j - j0;
2225  if (!overlap_communication_and_computation || lc < nrows)
2226  R_A_colindsub(cnt_rowptr++) = row_entry;
2227  else
2228  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2229  }
2230  }
2231  }
2232  update += val;
2233  });
2234  }
2235  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
2236  Kokkos::deep_copy(amd.rowptr, R_rowptr);
2237  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2238  if (overlap_communication_and_computation) {
2239  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
2240  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2241  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2242  }
2243 
2244  // Allocate or view values.
2245  if (hasBlockCrsMatrix)
2246  amd.tpetra_values = (const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst());
2247  else {
2248  amd.tpetra_values = (const_cast<crs_matrix_type *>(A_crs.get()))->getLocalValuesDevice(Tpetra::Access::ReadWrite);
2249  }
2250  }
2251 
2252  // Allocate view for E and initialize the values with B:
2253 
2254  if (interf.n_subparts_per_part > 1)
2255  btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2256  }
2257  // Precompute offsets of each A and x entry to speed up residual.
2258  // Applies if all of these are true:
2259  // - hasBlockCrsMatrix
2260  // - execution_space is a GPU
2261  // - !useSeqMethod (since this uses a different scheme for indexing A,x)
2262  //
2263  // Reading A, x take up to 4 and 6 levels of indirection respectively,
2264  // but precomputing the offsets reduces it to 2 for both (get index, then value)
2265  if (BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix) {
2266  bool is_async_importer_active = !async_importer.is_null();
2267  local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2268  bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2269  BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2270  }
2271 
2272  // If using fused block Jacobi path, allocate diagonal inverses here (d_inv) and find diagonal offsets.
2273  if (use_fused_jacobi) {
2274  btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize);
2275  auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2276  auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2277  btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2278  }
2279  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2280 }
2281 
2285 template <typename ArgActiveExecutionMemorySpace>
2286 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
2287 
2288 template <>
2289 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2290  typedef KB::Mode::Serial mode_type;
2291 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2292  typedef KB::Algo::Level3::CompactMKL algo_type;
2293 #else
2294  typedef KB::Algo::Level3::Blocked algo_type;
2295 #endif
2296  static int recommended_team_size(const int /* blksize */,
2297  const int /* vector_length */,
2298  const int /* internal_vector_length */) {
2299  return 1;
2300  }
2301 };
2302 
2303 #if defined(KOKKOS_ENABLE_CUDA)
2304 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
2305  const int vector_length,
2306  const int internal_vector_length) {
2307  const int vector_size = vector_length / internal_vector_length;
2308  int total_team_size(0);
2309  if (blksize <= 5)
2310  total_team_size = 32;
2311  else if (blksize <= 9)
2312  total_team_size = 32; // 64
2313  else if (blksize <= 12)
2314  total_team_size = 96;
2315  else if (blksize <= 16)
2316  total_team_size = 128;
2317  else if (blksize <= 20)
2318  total_team_size = 160;
2319  else
2320  total_team_size = 160;
2321  return 2 * total_team_size / vector_size;
2322 }
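// For example, blksize = 9 with vector_length = 8 and
// internal_vector_length = 2 gives vector_size = 4, total_team_size = 32,
// and a recommended team size of 2 * 32 / 4 = 16.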
2323 template <>
2324 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2325  typedef KB::Mode::Team mode_type;
2326  typedef KB::Algo::Level3::Unblocked algo_type;
2327  static int recommended_team_size(const int blksize,
2328  const int vector_length,
2329  const int internal_vector_length) {
2330  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2331  }
2332 };
2333 template <>
2334 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2335  typedef KB::Mode::Team mode_type;
2336  typedef KB::Algo::Level3::Unblocked algo_type;
2337  static int recommended_team_size(const int blksize,
2338  const int vector_length,
2339  const int internal_vector_length) {
2340  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2341  }
2342 };
2343 #endif
2344 
2345 #if defined(KOKKOS_ENABLE_HIP)
2346 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
2347  const int vector_length,
2348  const int internal_vector_length) {
2349  const int vector_size = vector_length / internal_vector_length;
2350  int total_team_size(0);
2351  if (blksize <= 5)
2352  total_team_size = 32;
2353  else if (blksize <= 9)
2354  total_team_size = 32; // 64
2355  else if (blksize <= 12)
2356  total_team_size = 96;
2357  else if (blksize <= 16)
2358  total_team_size = 128;
2359  else if (blksize <= 20)
2360  total_team_size = 160;
2361  else
2362  total_team_size = 160;
2363  return 2 * total_team_size / vector_size;
2364 }
2365 template <>
2366 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2367  typedef KB::Mode::Team mode_type;
2368  typedef KB::Algo::Level3::Unblocked algo_type;
2369  static int recommended_team_size(const int blksize,
2370  const int vector_length,
2371  const int internal_vector_length) {
2372  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2373  }
2374 };
2375 template <>
2376 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2377  typedef KB::Mode::Team mode_type;
2378  typedef KB::Algo::Level3::Unblocked algo_type;
2379  static int recommended_team_size(const int blksize,
2380  const int vector_length,
2381  const int internal_vector_length) {
2382  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2383  }
2384 };
2385 #endif
2386 
2387 #if defined(KOKKOS_ENABLE_SYCL)
2388 static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
2389  const int vector_length,
2390  const int internal_vector_length) {
2391  const int vector_size = vector_length / internal_vector_length;
2392  int total_team_size(0);
2393  if (blksize <= 5)
2394  total_team_size = 32;
2395  else if (blksize <= 9)
2396  total_team_size = 32; // 64
2397  else if (blksize <= 12)
2398  total_team_size = 96;
2399  else if (blksize <= 16)
2400  total_team_size = 128;
2401  else if (blksize <= 20)
2402  total_team_size = 160;
2403  else
2404  total_team_size = 160;
2405  return 2 * total_team_size / vector_size;
2406 }
2407 template <>
2408 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2409  typedef KB::Mode::Team mode_type;
2410  typedef KB::Algo::Level3::Unblocked algo_type;
2411  static int recommended_team_size(const int blksize,
2412  const int vector_length,
2413  const int internal_vector_length) {
2414  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2415  }
2416 };
2417 template <>
2418 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2419  typedef KB::Mode::Team mode_type;
2420  typedef KB::Algo::Level3::Unblocked algo_type;
2421  static int recommended_team_size(const int blksize,
2422  const int vector_length,
2423  const int internal_vector_length) {
2424  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2425  }
2426 };
2427 #endif
2428 
2429 template <typename impl_type, typename WWViewType>
2430 KOKKOS_INLINE_FUNCTION void
2431 solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2432  const typename impl_type::local_ordinal_type & /* blocksize */,
2433  const typename impl_type::local_ordinal_type &i0,
2434  const typename impl_type::local_ordinal_type &r0,
2435  const typename impl_type::local_ordinal_type &nrows,
2436  const typename impl_type::local_ordinal_type &v,
2437  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2438  const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2439  const WWViewType &WW,
2440  const bool skip_first_pass = false) {
2441  using execution_space = typename impl_type::execution_space;
2442  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2443  using member_type = typename team_policy_type::member_type;
2444  using local_ordinal_type = typename impl_type::local_ordinal_type;
2445 
2446  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2447 
2448  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2449  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2450 
2451  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2452 
2453  // constant
2454 #if KOKKOS_VERSION >= 40799
2455  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2456 #else
2457  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2458 #endif
2459 #if KOKKOS_VERSION >= 40799
2460  const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2461 #else
2462  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2463 #endif
2464 
2465  // subview pattern
2466  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2467  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2468  auto X2 = X1;
2469 
2470  local_ordinal_type i = i0, r = r0;
2471 
2472  if (nrows > 1) {
2473  // solve Lx = x
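 // Forward sweep: x_r <- L_rr^{-1} (x_r - A_{r,r-1} x_{r-1}), where L_rr is
 // the unit-diagonal lower factor of row r's diagonal block (offset i + 3
 // in the pack) and A_{r,r-1} the subdiagonal block (offset i + 2).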
2474  if (skip_first_pass) {
2475  i += (nrows - 2) * 3;
2476  r += (nrows - 2);
2477  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2478  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2479  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2480  KB::Trsm<member_type,
2481  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2482  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2483  X1.assign_data(X2.data());
2484  i += 3;
2485  } else {
2486  KB::Trsm<member_type,
2487  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2488  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2489  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
2490  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2491  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2492  member.team_barrier();
2493  KB::Gemm<member_type,
2494  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2495  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2496  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2497  KB::Trsm<member_type,
2498  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2499  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2500  X1.assign_data(X2.data());
2501  }
2502  }
2503 
2504  // solve Ux = x
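 // Backward sweep: x_r <- U_rr^{-1} (x_r - A_{r,r+1} x_{r+1}), with the
 // non-unit upper factor U_rr at offset i and the superdiagonal block at
 // offset i + 1.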
2505  KB::Trsm<member_type,
2506  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2507  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2508  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2509  i -= 3;
2510  A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
2511  X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
2512  member.team_barrier();
2513  KB::Gemm<member_type,
2514  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2515  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2516 
2517  A.assign_data(&D_internal_vector_values(i, 0, 0, v));
2518  KB::Trsm<member_type,
2519  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2520  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2521  X1.assign_data(X2.data());
2522  }
2523  } else {
2524  // matrix is already inverted
2525  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2526  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
2527  member.team_barrier();
2528  KB::Gemm<member_type,
2529  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2530  default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
2531  }
2532 }
2533 
2534 template <typename impl_type, typename WWViewType, typename XViewType>
2535 KOKKOS_INLINE_FUNCTION void
2536 solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2537  const typename impl_type::local_ordinal_type &blocksize,
2538  const typename impl_type::local_ordinal_type &i0,
2539  const typename impl_type::local_ordinal_type &r0,
2540  const typename impl_type::local_ordinal_type &nrows,
2541  const typename impl_type::local_ordinal_type &v,
2542  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2543  const XViewType &X_internal_vector_values, // Unmanaged<typename impl_type::internal_vector_type_4d_view>
2544  const WWViewType &WW) {
2545  using execution_space = typename impl_type::execution_space;
2546  // using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2547  // using member_type = typename team_policy_type::member_type;
2548  using local_ordinal_type = typename impl_type::local_ordinal_type;
2549 
2550  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2551 
2552  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2553  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2554 
2555  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2556 
2557  // base pointers
2558  auto A = D_internal_vector_values.data();
2559  auto X = X_internal_vector_values.data();
2560 
2561  // constant
2562 #if KOKKOS_VERSION >= 40799
2563  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2564 #else
2565  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2566 #endif
2567 #if KOKKOS_VERSION >= 40799
2568  const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2569 #else
2570  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2571 #endif
2572  // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2573 
2574  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2575  const local_ordinal_type astep = D_internal_vector_values.stride(0);
2576  const local_ordinal_type as0 = D_internal_vector_values.stride(1); // blocksize*vector_length;
2577  const local_ordinal_type as1 = D_internal_vector_values.stride(2); // vector_length;
2578  const local_ordinal_type xstep = X_internal_vector_values.stride(0);
2579  const local_ordinal_type xs0 = X_internal_vector_values.stride(1); // vector_length;
2580 
2581  // move to starting point
2582  A += i0 * astep + v;
2583  X += r0 * xstep + v;
2584 
2585  // for (local_ordinal_type col=0;col<num_vectors;++col)
2586  if (nrows > 1) {
2587  // solve Lx = x
2588  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2589  member,
2590  KB::Diag::Unit,
2591  blocksize, blocksize,
2592  one,
2593  A, as0, as1,
2594  X, xs0);
2595 
2596  for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
2597  member.team_barrier();
2598  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2599  member,
2600  blocksize, blocksize,
2601  -one,
2602  A + 2 * astep, as0, as1,
2603  X, xs0,
2604  one,
2605  X + 1 * xstep, xs0);
2606  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2607  member,
2608  KB::Diag::Unit,
2609  blocksize, blocksize,
2610  one,
2611  A + 3 * astep, as0, as1,
2612  X + 1 * xstep, xs0);
2613 
2614  A += 3 * astep;
2615  X += 1 * xstep;
2616  }
2617 
2618  // solve Ux = x
2619  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2620  member,
2621  KB::Diag::NonUnit,
2622  blocksize, blocksize,
2623  one,
2624  A, as0, as1,
2625  X, xs0);
2626 
2627  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2628  A -= 3 * astep;
2629  member.team_barrier();
2630  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2631  member,
2632  blocksize, blocksize,
2633  -one,
2634  A + 1 * astep, as0, as1,
2635  X, xs0,
2636  one,
2637  X - 1 * xstep, xs0);
2638  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2639  member,
2640  KB::Diag::NonUnit,
2641  blocksize, blocksize,
2642  one,
2643  A, as0, as1,
2644  X - 1 * xstep, xs0);
2645  X -= 1 * xstep;
2646  }
2647  // for multiple rhs
2648  // X += xs1;
2649  } else {
2650  const local_ordinal_type ws0 = WW.stride(0);
2651  auto W = WW.data() + v;
2652  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
2653  member, blocksize, X, xs0, W, ws0);
2654  member.team_barrier();
2655  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2656  member,
2657  blocksize, blocksize,
2658  one,
2659  A, as0, as1,
2660  W, ws0,
2661  zero,
2662  X, xs0);
2663  }
2664 }
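// Pointer-arithmetic sketch of the addressing used above (helper name is
// illustrative): for a rank-4 view V(block, i, j, lane) whose trailing lane
// extent is stride-1, element (b, i, j, v) lives at
//   V.data() + b * V.stride(0) + i * V.stride(1) + j * V.stride(2) + v,
// which is why the kernel bakes the lane offset v into the base pointer once
// and then walks blocks with astep/xstep and block entries with as0/as1.
template <typename ViewType>
KOKKOS_INLINE_FUNCTION typename ViewType::pointer_type
lane_base_pointer_sketch(const ViewType &V,
                         const typename ViewType::size_type block,
                         const int lane) {
  return V.data() + block * V.stride(0) + lane;
}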
2665 
2666 template <typename local_ordinal_type, typename ViewType>
2667 void writeBTDValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2668 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2669  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2670  std::ofstream myfile;
2671  myfile.open(fileName);
2672 
2673  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2674  local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2675  const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2676  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2677 
2678  const local_ordinal_type block_size = scalar_values.extent(1);
2679 
2680  const local_ordinal_type n_rows_per_part = (n_blocks_per_part + 2) / 3 * block_size;
2681  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2682 
2683  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2684 
2685  myfile << "%%MatrixMarket matrix coordinate real general" << std::endl;
2686  myfile << "%%nnz = " << nnz;
2687  myfile << " block size = " << block_size;
2688  myfile << " number of blocks = " << n_blocks;
2689  myfile << " number of parts = " << n_parts;
2690  myfile << " number of blocks per part = " << n_blocks_per_part;
2691  myfile << " number of rows = " << n_rows;
2692  myfile << " number of cols = " << n_rows;
2693  myfile << " number of packs = " << n_packs << std::endl;
2694 
2695  myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2696 
2697  local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2698  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2699  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2700  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2701  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2702  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2703  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2704  continue;
2705  if (i_block_in_part % 3 == 0) {
2706  current_row_offset = i_block_in_part / 3 * block_size;
2707  current_col_offset = i_block_in_part / 3 * block_size;
2708  } else if (i_block_in_part % 3 == 1) {
2709  current_row_offset = (i_block_in_part - 1) / 3 * block_size;
2710  current_col_offset = ((i_block_in_part - 1) / 3 + 1) * block_size;
2711  } else if (i_block_in_part % 3 == 2) {
2712  current_row_offset = ((i_block_in_part - 2) / 3 + 1) * block_size;
2713  current_col_offset = (i_block_in_part - 2) / 3 * block_size;
2714  }
2715  current_row_offset += current_part_idx * n_rows_per_part;
2716  current_col_offset += current_part_idx * n_rows_per_part;
2717  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2718  for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2719  current_row = current_row_offset + i_in_block + 1;
2720  current_col = current_col_offset + j_in_block + 1;
2721  myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2722  }
2723  }
2724  }
2725  }
2726  }
2727 
2728  myfile.close();
2729 #endif
2730 }
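// Sketch of the index mapping the writer above relies on (helper is
// illustrative): within a part, packed block b encodes one tridiagonal entry
// per residue of b modulo 3, with k = b / 3: the diagonal (k, k), the
// super-diagonal (k, k + 1), or the sub-diagonal (k + 1, k), matching the
// row/col offsets computed in the loop.
inline void packed_index_to_block_coords_sketch(const int b, int &brow, int &bcol) {
  const int k = b / 3;
  if (b % 3 == 0) {
    brow = k;
    bcol = k;
  } else if (b % 3 == 1) {
    brow = k;
    bcol = k + 1;
  } else {
    brow = k + 1;
    bcol = k;
  }
}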
2731 
2732 template <typename local_ordinal_type, typename ViewType>
2733 void write4DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2734 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2735  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2736  std::ofstream myfile;
2737  myfile.open(fileName);
2738 
2739  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2740  const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2741  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2742 
2743  const local_ordinal_type block_size = scalar_values.extent(1);
2744  const local_ordinal_type n_cols = scalar_values.extent(2);
2745 
2746  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2747  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2748 
2749  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2750 
2751  myfile << "%%MatrixMarket matrix array real general" << std::endl;
2752  myfile << "%%block size = " << block_size;
2753  myfile << " number of blocks = " << n_blocks;
2754  myfile << " number of parts = " << n_parts;
2755  myfile << " number of blocks per part = " << n_blocks_per_part;
2756  myfile << " number of rows = " << n_rows;
2757  myfile << " number of cols = " << n_cols;
2758  myfile << " number of packs = " << n_packs << std::endl;
2759 
2760  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2761 
2762  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2763  (void)current_row_offset;
2764  (void)current_part_idx;
2765  for (local_ordinal_type j_in_block = 0; j_in_block < n_cols; ++j_in_block) {
2766  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2767  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2768  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2769  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2770  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2771 
2772  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2773  continue;
2774  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2775  myfile << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2776  }
2777  }
2778  }
2779  }
2780  }
2781  myfile.close();
2782 #endif
2783 }
2784 
2785 template <typename local_ordinal_type, typename ViewType>
2786 void write5DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2787 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2788  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2789  std::ofstream myfile;
2790  myfile.open(fileName);
2791 
2792  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2793  const local_ordinal_type n_blocks = scalar_values.extent(1) * n_parts_per_pack;
2794  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2795 
2796  const local_ordinal_type block_size = scalar_values.extent(2);
2797  const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2798  const local_ordinal_type n_cols = n_blocks_cols * block_size;
2799 
2800  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2801  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2802 
2803  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2804 
2805  myfile << "%%MatrixMarket matrix array real general" << std::endl;
2806  myfile << "%%block size = " << block_size;
2807  myfile << " number of blocks = " << n_blocks;
2808  myfile << " number of parts = " << n_parts;
2809  myfile << " number of blocks per part = " << n_blocks_per_part;
2810  myfile << " number of rows = " << n_rows;
2811  myfile << " number of cols = " << n_cols;
2812  myfile << " number of packs = " << n_packs << std::endl;
2813 
2814  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2815 
2816  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2817  (void)current_row_offset;
2818  (void)current_part_idx;
2819  for (local_ordinal_type i_block_col = 0; i_block_col < n_blocks_cols; ++i_block_col) {
2820  for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2821  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2822  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2823  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2824  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2825  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2826 
2827  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(1))
2828  continue;
2829  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2830  myfile << scalar_values(i_block_col, current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2831  }
2832  }
2833  }
2834  }
2835  }
2836  }
2837  myfile.close();
2838 #endif
2839 }
2840 
2841 template <typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2842 KOKKOS_INLINE_FUNCTION void
2843 copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2844  /*
2845  // Kokkos::Experimental::local_deep_copy
2846  auto teamVectorRange =
2847  Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2848  member, view1.extent(0), view1.extent(1), view1.extent(2));
2849 
2850  Kokkos::parallel_for
2851  (teamVectorRange,
2852  [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2853  view1(i,j,k) = view2(i,j,k);
2854  });
2855  */
2856  Kokkos::Experimental::local_deep_copy(member, view1, view2);
2857 }
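// Usage sketch for copy3DView (function and view names are illustrative):
// every thread of the team calls it collectively, e.g. to stage one packed
// block into team scratch, followed by a barrier so the whole team sees the
// staged data.
template <typename local_ordinal_type, typename member_type,
          typename ScratchViewType, typename SourceViewType>
KOKKOS_INLINE_FUNCTION void
stage_block_to_scratch_sketch(const member_type &member,
                              const local_ordinal_type &i0,
                              const ScratchViewType &scratch,
                              const SourceViewType &values) {
  copy3DView<local_ordinal_type>(member, scratch,
                                 Kokkos::subview(values, i0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
  member.team_barrier();
}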
2858 template <typename MatrixType, int ScratchLevel>
2859 struct ExtractAndFactorizeTridiags {
2860  public:
2861  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2862  // a functor cannot have both device_type and execution_space; having both causes a specialization error in Kokkos
2863  using execution_space = typename impl_type::execution_space;
2864  using memory_space = typename impl_type::memory_space;
2866  using local_ordinal_type = typename impl_type::local_ordinal_type;
2867  using size_type = typename impl_type::size_type;
2868  using impl_scalar_type = typename impl_type::impl_scalar_type;
2869  using magnitude_type = typename impl_type::magnitude_type;
2871  using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
2872  using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
2874  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2875  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2876  using size_type_1d_view = typename impl_type::size_type_1d_view;
2877  using size_type_2d_view = typename impl_type::size_type_2d_view;
2878  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2880  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2881  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2882  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2883  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2884  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2885  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2886  using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view;
2887  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
2888  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2889  using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2890  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2891  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2892  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2893  using local_crs_graph_type = typename impl_type::local_crs_graph_type;
2894  using colinds_view = typename local_crs_graph_type::entries_type;
2895 
2896  using internal_vector_type = typename impl_type::internal_vector_type;
2897  static constexpr int vector_length = impl_type::vector_length;
2898  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2899  static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2900  static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2901  // half_vector_length is used for block Jacobi factorization.
2902  // Shared memory requirement is twice as large (per vector lane) as for general tridi factorization, so
2903  // reducing vector length (if possible) keeps the shared requirement constant. This avoids the performance
2904  // cliff of switching from level 0 to level 1 scratch.
2905  static constexpr int half_vector_length = impl_type::half_vector_length;
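// Byte-accounting sketch for the comment above (helper is illustrative and
// unused elsewhere): two scalar slabs over half_vector_length lanes occupy
// the same bytes as one slab over vector_length lanes whenever
// half_vector_length == vector_length / 2, so halving the lanes keeps the
// per-team scratch requirement flat.
static constexpr std::size_t scratch_bytes_sketch(const std::size_t bs,
                                                  const std::size_t lanes,
                                                  const std::size_t slabs) {
  return slabs * lanes * bs * bs * sizeof(btdm_scalar_type);
}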
2906 
2908  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2909  using member_type = typename team_policy_type::member_type;
2910 
2911  private:
2912  // part interface
2913  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2914  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2915  const local_ordinal_type max_partsz;
2916  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
2917  using size_type_1d_view_tpetra = Kokkos::View<size_t *, typename impl_type::node_device_type>;
2918  ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2919  ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2920  ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2921  // block tridiags
2922  const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2923  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2924  const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2925  const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2926  const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2927  const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2928  const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2929  const Unmanaged<size_type_1d_view> diag_offsets;
2930  // shared information
2931  const local_ordinal_type blocksize, blocksize_square;
2932  // diagonal safety
2933  const magnitude_type tiny;
2934  const local_ordinal_type vector_loop_size;
2935 
2936  bool hasBlockCrsMatrix;
2937 
2938  public:
2939  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2940  const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2941  const Teuchos::RCP<const row_matrix_type> &A_,
2942  const Teuchos::RCP<const crs_graph_type> &G_,
2943  const magnitude_type &tiny_)
2944  : // interface
2945  partptr(interf_.partptr)
2946  , lclrow(interf_.lclrow)
2947  , packptr(interf_.packptr)
2948  , packindices_sub(interf_.packindices_sub)
2949  , packptr_sub(interf_.packptr_sub)
2950  , partptr_sub(interf_.partptr_sub)
2951  , part2packrowidx0_sub(interf_.part2packrowidx0_sub)
2952  , packindices_schur(interf_.packindices_schur)
2953  , max_partsz(interf_.max_partsz)
2954  ,
2955  // block tridiags
2956  pack_td_ptr(btdm_.pack_td_ptr)
2957  , flat_td_ptr(btdm_.flat_td_ptr)
2958  , pack_td_ptr_schur(btdm_.pack_td_ptr_schur)
2959  , A_colindsub(btdm_.A_colindsub)
2960  , internal_vector_values((internal_vector_type *)btdm_.values.data(),
2961  btdm_.values.extent(0),
2962  btdm_.values.extent(1),
2963  btdm_.values.extent(2),
2964  vector_length / internal_vector_length)
2965  , internal_vector_values_schur((internal_vector_type *)btdm_.values_schur.data(),
2966  btdm_.values_schur.extent(0),
2967  btdm_.values_schur.extent(1),
2968  btdm_.values_schur.extent(2),
2969  vector_length / internal_vector_length)
2970  , e_internal_vector_values((internal_vector_type *)btdm_.e_values.data(),
2971  btdm_.e_values.extent(0),
2972  btdm_.e_values.extent(1),
2973  btdm_.e_values.extent(2),
2974  btdm_.e_values.extent(3),
2975  vector_length / internal_vector_length)
2976  , scalar_values((btdm_scalar_type *)btdm_.values.data(),
2977  btdm_.values.extent(0),
2978  btdm_.values.extent(1),
2979  btdm_.values.extent(2),
2980  vector_length)
2981  , scalar_values_schur((btdm_scalar_type *)btdm_.values_schur.data(),
2982  btdm_.values_schur.extent(0),
2983  btdm_.values_schur.extent(1),
2984  btdm_.values_schur.extent(2),
2985  vector_length)
2986  , e_scalar_values((btdm_scalar_type *)btdm_.e_values.data(),
2987  btdm_.e_values.extent(0),
2988  btdm_.e_values.extent(1),
2989  btdm_.e_values.extent(2),
2990  btdm_.e_values.extent(3),
2991  vector_length)
2992  , d_inv(btdm_.d_inv)
2993  , diag_offsets(btdm_.diag_offsets)
2994  , blocksize(btdm_.values.extent(1))
2995  , blocksize_square(blocksize * blocksize)
2996  ,
2997  // diagonal weight to avoid zero pivots
2998  tiny(tiny_)
2999  , vector_loop_size(vector_length / internal_vector_length) {
3000  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
3001  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
3002 
3003  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
3004  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
3005 
3006  hasBlockCrsMatrix = !A_bcrs.is_null();
3007 
3008  A_block_rowptr = G_->getLocalGraphDevice().row_map;
3009  if (hasBlockCrsMatrix) {
3010  A_values = const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst();
3011  } else {
3012  A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
3013  A_values = A_crs->getLocalValuesDevice(Tpetra::Access::ReadOnly);
3014  }
3015  }
3016 
3017  private:
3018  KOKKOS_INLINE_FUNCTION
3019  void
3020  extract(local_ordinal_type partidx,
3021  local_ordinal_type local_subpartidx,
3022  local_ordinal_type npacks) const {
3023 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3024  printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
3025 #endif
3026  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3027  const size_type kps = pack_td_ptr(partidx, local_subpartidx);
3028  local_ordinal_type kfs[vector_length] = {};
3029  local_ordinal_type ri0[vector_length] = {};
3030  local_ordinal_type nrows[vector_length] = {};
3031 
3032  for (local_ordinal_type vi = 0; vi < npacks; ++vi, ++partidx) {
3033  kfs[vi] = flat_td_ptr(partidx, local_subpartidx);
3034  ri0[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 0);
3035  nrows[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 1) - ri0[vi];
3036 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3037  printf("kfs[%d] = %d;\n", vi, kfs[vi]);
3038  printf("ri0[%d] = %d;\n", vi, ri0[vi]);
3039  printf("nrows[%d] = %d;\n", vi, nrows[vi]);
3040 #endif
3041  }
3042  local_ordinal_type tr_min = 0;
3043  local_ordinal_type tr_max = nrows[0];
3044  if (local_subpartidx % 2 == 1) {
3045  tr_min -= 1;
3046  tr_max += 1;
3047  }
3048 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3049  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3050 #endif
3051  for (local_ordinal_type tr = tr_min, j = 0; tr < tr_max; ++tr) {
3052  for (local_ordinal_type e = 0; e < 3; ++e) {
3053  if (hasBlockCrsMatrix) {
3054  const impl_scalar_type *block[vector_length] = {};
3055  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3056  const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
3057 
3058  block[vi] = &A_values(Aj * blocksize_square);
3059  }
3060  const size_type pi = kps + j;
3061 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3062  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
3063 #endif
3064  ++j;
3065  for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3066  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3067  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
3068  auto &v = internal_vector_values(pi, ii, jj, 0);
3069  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3070  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
3071  }
3072  }
3073  }
3074  } else {
3075  const size_type pi = kps + j;
3076 
3077  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3078  const size_type Aj_c = A_colindsub(kfs[vi] + j);
3079 
3080  for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3081  auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr) * blocksize + ii);
3082 
3083  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3084  scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c * blocksize + jj);
3085  }
3086  }
3087  }
3088  ++j;
3089  }
3090  if (nrows[0] == 1) break;
3091  if (local_subpartidx % 2 == 0) {
3092  if (e == 1 && (tr == 0 || tr + 1 == nrows[0])) break;
3093  for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3094  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr + 1 == nrows[vi])) {
3095  npacks = vi;
3096  break;
3097  }
3098  }
3099  } else {
3100  if (e == 0 && (tr == -1 || tr == nrows[0])) break;
3101  for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3102  if (e == 0 && (nrows[vi] == 1 || tr == nrows[vi])) {
3103  npacks = vi;
3104  break;
3105  }
3106  }
3107  }
3108  }
3109  }
3110  }
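// Little-block addressing sketch for the extraction above (illustrative,
// row-major case only): tlb::getFlatIndex hides how the blocksize x
// blocksize block stored at A_values(Aj * blocksize_square) is laid out;
// for a LayoutRight little block it reduces to
static KOKKOS_INLINE_FUNCTION int flat_index_row_major_sketch(const int ii, const int jj, const int bs) {
  return ii * bs + jj;
}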
3111 
3112  KOKKOS_INLINE_FUNCTION
3113  void
3114  extract(const member_type &member,
3115  const local_ordinal_type &partidxbeg,
3116  local_ordinal_type local_subpartidx,
3117  const local_ordinal_type &npacks,
3118  const local_ordinal_type &vbeg) const {
3119 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3120  printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3121 #endif
3122  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3123  local_ordinal_type kfs_vals[internal_vector_length] = {};
3124  local_ordinal_type ri0_vals[internal_vector_length] = {};
3125  local_ordinal_type nrows_vals[internal_vector_length] = {};
3126 
3127  const size_type kps = pack_td_ptr(partidxbeg, local_subpartidx);
3128  for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3129  kfs_vals[vi] = flat_td_ptr(partidxbeg + vi, local_subpartidx);
3130  ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 0);
3131  nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 1) - ri0_vals[vi];
3132 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3133  printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3134  printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3135  printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3136 #endif
3137  }
3138 
3139  local_ordinal_type j_vals[internal_vector_length] = {};
3140 
3141  local_ordinal_type tr_min = 0;
3142  local_ordinal_type tr_max = nrows_vals[0];
3143  if (local_subpartidx % 2 == 1) {
3144  tr_min -= 1;
3145  tr_max += 1;
3146  }
3147 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3148  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3149 #endif
3150  for (local_ordinal_type tr = tr_min; tr < tr_max; ++tr) {
3151  for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3152  const local_ordinal_type nrows = nrows_vals[vi];
3153  if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows + 1)) {
3154  auto &j = j_vals[vi];
3155  const local_ordinal_type kfs = kfs_vals[vi];
3156  const local_ordinal_type ri0 = ri0_vals[vi];
3157  local_ordinal_type lbeg, lend;
3158  if (local_subpartidx % 2 == 0) {
3159  lbeg = (tr == tr_min ? 1 : 0);
3160  lend = (tr == nrows - 1 ? 2 : 3);
3161  } else {
3162  lbeg = 0;
3163  lend = 3;
3164  if (tr == tr_min) {
3165  lbeg = 1;
3166  lend = 2;
3167  } else if (tr == nrows) {
3168  lbeg = 0;
3169  lend = 1;
3170  }
3171  }
3172  if (hasBlockCrsMatrix) {
3173  for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3174  const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3175  const impl_scalar_type *block = &A_values(Aj * blocksize_square);
3176  const size_type pi = kps + j;
3177 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3178  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3179 #endif
3180  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3181  [&](const local_ordinal_type &ii) {
3182  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3183  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii, jj, blocksize)]);
3184  }
3185  });
3186  }
3187  } else {
3188  for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3189  const size_type Aj_c = A_colindsub(kfs + j);
3190  const size_type pi = kps + j;
3191  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3192  [&](const local_ordinal_type &ii) {
3193  auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr) * blocksize + ii);
3194  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3195  scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c * blocksize + jj);
3196  }
3197  });
3198  }
3199  }
3200  }
3201  }
3202  }
3203  }
3204 
3205  template <typename AAViewType,
3206  typename WWViewType>
3207  KOKKOS_INLINE_FUNCTION void
3208  factorize_subline(const member_type &member,
3209  const local_ordinal_type &i0,
3210  const local_ordinal_type &nrows,
3211  const local_ordinal_type &v,
3212  const AAViewType &AA,
3213  const WWViewType &WW) const {
3214  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3215 
3216  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3217  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3218 
3219  // constant
3220 #if KOKKOS_VERSION >= 40799
3221  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3222 #else
3223  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3224 #endif
3225 
3226 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3227  printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3228 #endif
3229 
3230  // subview pattern
3231  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3232  KB::LU<member_type,
3233  default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3234 
3235  if (nrows > 1) {
3236  auto B = A;
3237  auto C = A;
3238  local_ordinal_type i = i0;
3239  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
3240 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3241  printf("tr = %d, i = %d;\n", tr, i);
3242 #endif
3243  B.assign_data(&AA(i + 1, 0, 0, v));
3244  KB::Trsm<member_type,
3245  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3246  default_mode_type, default_algo_type>::invoke(member, one, A, B);
3247  C.assign_data(&AA(i + 2, 0, 0, v));
3248  KB::Trsm<member_type,
3249  KB::Side::Right, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3250  default_mode_type, default_algo_type>::invoke(member, one, A, C);
3251  A.assign_data(&AA(i + 3, 0, 0, v));
3252 
3253  member.team_barrier();
3254  KB::Gemm<member_type,
3255  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3256  default_mode_type, default_algo_type>::invoke(member, -one, C, B, one, A);
3257  KB::LU<member_type,
3258  default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3259  }
3260  } else {
3261  // for block jacobi invert a matrix here
3262  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3263  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, A, W);
3264  KB::SetIdentity<member_type, default_mode_type>::invoke(member, A);
3265  member.team_barrier();
3266  KB::Trsm<member_type,
3267  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3268  default_mode_type, default_algo_type>::invoke(member, one, W, A);
3269  KB::Trsm<member_type,
3270  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3271  default_mode_type, default_algo_type>::invoke(member, one, W, A);
3272  }
3273  }
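// A 1x1-block sketch of the recurrence above (host-only, illustrative):
// with scalar blocks the LU of the diagonal is trivial, the Trsm against L
// is a no-op, the Trsm against U divides the sub-diagonal by the pivot, and
// the Gemm update becomes the classic tridiagonal pivot update.
static void factorize_subline_scalar_sketch(double *d, double *u, double *l,
                                            const int nrows) {
  for (int tr = 1; tr < nrows; ++tr) {
    l[tr] /= d[tr - 1];          // C <- C * U^{-1}
    d[tr] -= l[tr] * u[tr - 1];  // A <- A - C * B
  }
}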
3274 
3275  public:
3276  struct ExtractAndFactorizeSubLineTag {};
3277  struct ExtractAndFactorizeFusedJacobiTag {};
3278  struct ExtractBCDTag {};
3279  struct ComputeETag {};
3280  struct ComputeSchurTag {};
3281  struct FactorizeSchurTag {};
3282 
3283  KOKKOS_INLINE_FUNCTION
3284  void
3285  operator()(const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3286  // btdm is packed and sorted, largest part first
3287  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3288 
3289  const local_ordinal_type subpartidx = packptr_sub(packidx);
3290  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3291  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3292  const local_ordinal_type partidx = subpartidx % n_parts;
3293 
3294  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3295  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3296  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3297 
3298  internal_vector_scratch_type_3d_view
3299  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3300 
3301 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3302  printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3303  printf("vector_loop_size = %d\n", vector_loop_size);
3304 #endif
3305 
3306  if (vector_loop_size == 1) {
3307  extract(partidx, local_subpartidx, npacks);
3308  factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3309  } else {
3310  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3311  [&](const local_ordinal_type &v) {
3312  const local_ordinal_type vbeg = v * internal_vector_length;
3313 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3314  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3315 #endif
3316  if (vbeg < npacks)
3317  extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3318  // this is not safe if the vector loop size differs from the vector length of
3319  // the team policy; we always enforce that match when constructing the policy
3320  member.team_barrier();
3321  factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3322  });
3323  }
3324  }
3325 
3326  KOKKOS_INLINE_FUNCTION
3327  void
3328  operator()(const ExtractAndFactorizeFusedJacobiTag &, const member_type &member) const {
3329  using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3330  using default_mode_type = typename default_mode_and_algo_type::mode_type;
3331  using default_algo_type = typename default_mode_and_algo_type::algo_type;
3332  // When fused block Jacobi can be used, the mapping between local rows and parts is trivial (i <-> i)
3333  // We can simply pull the diagonal entry from A into d_inv
3334  btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3335  btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3336 #if KOKKOS_VERSION >= 40799
3337  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3338 #else
3339  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3340 #endif
3341  const local_ordinal_type nrows = lclrow.extent(0);
3342  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, half_vector_length),
3343  [&](const local_ordinal_type &v) {
3344  local_ordinal_type row = member.league_rank() * half_vector_length + v;
3345  // diag_offsets holds the offset of each row's diagonal block within A_values
3346  auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3347  auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3348  if (row < nrows) {
3349  // View the diagonal block of A in row as 2D row-major
3350  const impl_scalar_type *A_diag = A_values.data() + diag_offsets(row);
3351  // Copy the diag into scratch slice W1
3352  // (copying elements directly is better than KokkosBatched copy)
3353  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3354  [&](int i) {
3355  W1.data()[i] = A_diag[i];
3356  });
3357  // and set W2 to identity in preparation to invert with 2 x Trsm
3358  KB::SetIdentity<member_type, default_mode_type>::invoke(member, W2);
3359  } else {
3360  // if this vector lane has no block to invert, then set W1 to identity
3361  // so that LU still has a matrix to work on. LU uses team barriers so
3362  // having some lanes run it and some not will deadlock.
3363  KB::SetIdentity<member_type, default_mode_type>::invoke(member, W1);
3364  }
3365  member.team_barrier();
3366  // LU factorize in-place
3367  KB::LU<member_type, default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, W1, tiny);
3368  member.team_barrier();
3369  KB::Trsm<member_type,
3370  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3371  default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3372  KB::Trsm<member_type,
3373  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3374  default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3375  member.team_barrier();
3376  if (row < nrows) {
3377  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3378  [&](int i) {
3379  auto d_inv_block = &d_inv(row, 0, 0);
3380  d_inv_block[i] = W2.data()[i];
3381  });
3382  }
3383  });
3384  }
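// Host-only sketch of the inversion trick above (helper is illustrative; no
// pivoting and no `tiny` shift): factor W1 = L * U in place, then solve
// L * (U * X) = I so that X = W1^{-1} accumulates in W2, exactly the
// LU-then-two-Trsm sequence the kernel issues per vector lane.
static void invert_via_lu_sketch(double *W1, double *W2, const int n) {
  // W1, W2 are n x n row-major; start from W2 = I
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
      W2[i * n + j] = (i == j) ? 1.0 : 0.0;
  // in-place unpivoted LU of W1 (unit-lower L below the diagonal)
  for (int k = 0; k < n; ++k)
    for (int i = k + 1; i < n; ++i) {
      W1[i * n + k] /= W1[k * n + k];
      for (int j = k + 1; j < n; ++j)
        W1[i * n + j] -= W1[i * n + k] * W1[k * n + j];
    }
  // forward solve L * Y = I, column by column (unit diagonal)
  for (int c = 0; c < n; ++c)
    for (int i = 0; i < n; ++i)
      for (int k = 0; k < i; ++k)
        W2[i * n + c] -= W1[i * n + k] * W2[k * n + c];
  // backward solve U * X = Y
  for (int c = 0; c < n; ++c)
    for (int i = n - 1; i >= 0; --i) {
      for (int k = i + 1; k < n; ++k)
        W2[i * n + c] -= W1[i * n + k] * W2[k * n + c];
      W2[i * n + c] /= W1[i * n + i];
    }
}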
3385 
3386  KOKKOS_INLINE_FUNCTION
3387  void
3388  operator()(const ExtractBCDTag &, const member_type &member) const {
3389  // btdm is packed and sorted, largest part first
3390  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3391  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3392  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3393 
3394  const local_ordinal_type subpartidx = packptr_sub(packidx);
3395  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3396  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3397  const local_ordinal_type partidx = subpartidx % n_parts;
3398 
3399  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3400  // const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3401  // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3402 
3403  if (vector_loop_size == 1) {
3404  extract(partidx, local_subpartidx, npacks);
3405  } else {
3406  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3407  [&](const local_ordinal_type &v) {
3408  const local_ordinal_type vbeg = v * internal_vector_length;
3409 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3410  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3411  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3412 #endif
3413  if (vbeg < npacks)
3414  extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3415  });
3416  }
3417 
3418  member.team_barrier();
3419 
3420  const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3421  const size_type kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 1;
3422 
3423  const local_ordinal_type r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3424  const local_ordinal_type r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3425 
3426 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3427  printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3428 #endif
3429 
3430  // Need to copy D to e_internal_vector_values.
3431  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3432  Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3433 
3434  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3435  Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3436  }
3437 
3438  KOKKOS_INLINE_FUNCTION
3439  void
3440  operator()(const ComputeETag &, const member_type &member) const {
3441  // btdm is packed and sorted, largest part first
3442  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3443 
3444  const local_ordinal_type subpartidx = packptr_sub(packidx);
3445  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3446  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3447  const local_ordinal_type partidx = subpartidx % n_parts;
3448 
3449  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3450  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3451  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
3452  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3453  const local_ordinal_type num_vectors = blocksize;
3454 
3455  (void)npacks;
3456 
3457  internal_vector_scratch_type_3d_view
3458  WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3459  if (local_subpartidx == 0) {
3460  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3461  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3462  });
3463  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
3464  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3465  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3466  });
3467  } else {
3468  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3469  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3470  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3471  });
3472  }
3473  }
3474 
3475  KOKKOS_INLINE_FUNCTION
3476  void
3477  operator()(const ComputeSchurTag &, const member_type &member) const {
3478  // btdm is packed and sorted, largest part first
3479  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3480  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3481  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3482 
3483  const local_ordinal_type subpartidx = packptr_sub(packidx);
3484  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3485  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3486  const local_ordinal_type partidx = subpartidx % n_parts;
3487 
3488  // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3489  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3490  // const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3491  // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3492 
3493  // Compute S = D - C E
3494 
3495  const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
3496  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
3497  const local_ordinal_type i0_offset = i0 + 2;  // same offset for the first and the interior Schur sublines
3498 
3499  for (local_ordinal_type i = 0; i < 4; ++i) { // pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
3500  copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3501  Kokkos::subview(internal_vector_values, i0_offset + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3502  }
3503 
3504  member.team_barrier();
3505 
3506 #if KOKKOS_VERSION >= 40799
3507  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3508 #else
3509  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3510 #endif
3511 
3512  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx) + 1;
3513  const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 2;
3514 
3515  const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3516  const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3517 
3518  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3519 
3520  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3521  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3522 
3523  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3524  for (size_type i = 0; i < pack_td_ptr_schur(partidx, local_subpartidx_schur + 1) - pack_td_ptr_schur(partidx, local_subpartidx_schur); ++i) {
3525  local_ordinal_type e_r, e_c, c_kps;
3526 
3527  if (local_subpartidx_schur == 0) {
3528  if (i == 0) {
3529  e_r = e_r1;
3530  e_c = 0;
3531  c_kps = c_kps1;
3532  } else if (i == 3) {
3533  e_r = e_r2;
3534  e_c = 1;
3535  c_kps = c_kps2;
3536  } else if (i == 4) {
3537  e_r = e_r2;
3538  e_c = 0;
3539  c_kps = c_kps2;
3540  } else {
3541  continue;
3542  }
3543  } else {
3544  if (i == 0) {
3545  e_r = e_r1;
3546  e_c = 1;
3547  c_kps = c_kps1;
3548  } else if (i == 1) {
3549  e_r = e_r1;
3550  e_c = 0;
3551  c_kps = c_kps1;
3552  } else if (i == 4) {
3553  e_r = e_r2;
3554  e_c = 1;
3555  c_kps = c_kps2;
3556  } else if (i == 5) {
3557  e_r = e_r2;
3558  e_c = 0;
3559  c_kps = c_kps2;
3560  } else {
3561  continue;
3562  }
3563  }
3564 
3565  auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx, local_subpartidx_schur) + i, Kokkos::ALL(), Kokkos::ALL(), v);
3566  auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3567  auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3568  KB::Gemm<member_type,
3569  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3570  default_mode_type, default_algo_type>::invoke(member, -one, C, E, one, S);
3571  }
3572  });
3573  }
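// Scalar sketch of the update above (illustrative): each Schur entry is
// assembled as S = D - C * E, where E holds the subline solves produced
// under ComputeETag and C is the coupling block of the original tridiag.
static double schur_update_sketch(const double D, const double C, const double E) {
  return D - C * E;
}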
3574 
3575  KOKKOS_INLINE_FUNCTION
3576  void
3577  operator()(const FactorizeSchurTag &, const member_type &member) const {
3578  const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3579 
3580  const local_ordinal_type subpartidx = packptr_sub(packidx);
3581 
3582  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3583  const local_ordinal_type partidx = subpartidx % n_parts;
3584 
3585  const local_ordinal_type i0 = pack_td_ptr_schur(partidx, 0);
3586  const local_ordinal_type nrows = 2 * (pack_td_ptr_schur.extent(1) - 1);
3587 
3588  internal_vector_scratch_type_3d_view
3589  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3590 
3591 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3592  printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3593 #endif
3594 
3595  if (vector_loop_size == 1) {
3596  factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3597  } else {
3598  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3599  [&](const local_ordinal_type &v) {
3600  factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3601  });
3602  }
3603  }
3604 
3605  void run() {
3606  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3607  const local_ordinal_type team_size =
3608  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3609  recommended_team_size(blocksize, vector_length, internal_vector_length);
3610  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3611  shmem_size(blocksize, blocksize, vector_loop_size);
3612 
3613  {
3614 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3615  printf("Start ExtractAndFactorizeSubLineTag\n");
3616 #endif
3617  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3618  Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeSubLineTag>
3619  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3620 
3621  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3622  writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3623 
3624  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3625  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3626  policy, *this);
3627  execution_space().fence();
3628 
3629  writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3630 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3631  printf("End ExtractAndFactorizeSubLineTag\n");
3632 #endif
3633  }
3634 
3635  if (packindices_schur.extent(1) > 0) {
3636  {
3637 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3638  printf("Start ExtractBCDTag\n");
3639 #endif
3640 #if KOKKOS_VERSION >= 40799
3641  Kokkos::deep_copy(e_scalar_values, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3642 #else
3643  Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3644 #endif
3645 #if KOKKOS_VERSION >= 40799
3646  Kokkos::deep_copy(scalar_values_schur, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3647 #else
3648  Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3649 #endif
3650 
3651  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3652 
3653  {
3654  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3655  Kokkos::TeamPolicy<execution_space, ExtractBCDTag>
3656  policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3657 
3658  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3659  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3660  policy, *this);
3661  execution_space().fence();
3662  }
3663 
3664 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3665  printf("End ExtractBCDTag\n");
3666 #endif
3667  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3668 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3669  printf("Start ComputeETag\n");
3670 #endif
3671  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3672  {
3673  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3674  Kokkos::TeamPolicy<execution_space, ComputeETag>
3675  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3676 
3677  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3678  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3679  policy, *this);
3680  execution_space().fence();
3681  }
3682  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3683 
3684 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3685  printf("End ComputeETag\n");
3686 #endif
3687  }
3688 
3689  {
3690 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3691  printf("Start ComputeSchurTag\n");
3692 #endif
3693  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3694  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3695  Kokkos::TeamPolicy<execution_space, ComputeSchurTag>
3696  policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3697 
3698  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3699  policy, *this);
3700  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3701  execution_space().fence();
3702 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3703  printf("End ComputeSchurTag\n");
3704 #endif
3705  }
3706 
3707  {
3708 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3709  printf("Start FactorizeSchurTag\n");
3710 #endif
3711  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3712  Kokkos::TeamPolicy<execution_space, FactorizeSchurTag>
3713  policy(packindices_schur.extent(0), team_size, vector_loop_size);
3714  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3715  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3716  policy, *this);
3717  execution_space().fence();
3718  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3719 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3720  printf("End FactorizeSchurTag\n");
3721 #endif
3722  }
3723  }
3724 
3725  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3726  }
3727 
3728  void run_fused_jacobi() {
3729  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3730  const local_ordinal_type team_size =
3731  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3732  recommended_team_size(blocksize, half_vector_length, 1);
3733  const local_ordinal_type per_team_scratch =
3734  btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3735  {
3736  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3737  Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3738  policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3739 
3740  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3741  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3742  policy, *this);
3743  }
3744  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3745  }
3746 };
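// A sketch of the view-aliasing pattern used in the ExtractAndFactorizeTridiags
// constructor above (function and template names are illustrative): a single
// allocation of SIMD packs is also viewed at scalar granularity by widening
// the trailing extent, valid because each pack is a contiguous array of VL
// scalar lanes.
template <typename ScalarType, int VL, typename VectorView>
auto alias_packed_view_as_scalars_sketch(const VectorView &values) {
  return Kokkos::View<ScalarType ****, typename VectorView::device_type,
                      Kokkos::MemoryUnmanaged>(
      reinterpret_cast<ScalarType *>(values.data()),
      values.extent(0), values.extent(1), values.extent(2), VL);
}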
3747 
3751 template <typename MatrixType>
3752 void performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3753  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3754  const BlockHelperDetails::PartInterface<MatrixType> &interf,
3755  BlockTridiags<MatrixType> &btdm,
3756  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3757  bool use_fused_jacobi) {
3758  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3759  using execution_space = typename impl_type::execution_space;
3760  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3761  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3762  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3763 
3764  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3765 
3766  int blocksize = btdm.values.extent(1);
3767  // Both Kokkos policy vector length and SIMD type vector length are hardcoded in KokkosBatched.
3768  // For large block sizes, we have to fall back to level 1 scratch.
3769  int scratch_required;
3770  if (!use_fused_jacobi) {
3771  // General path scratch requirement
3772  scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3773  } else {
3774  // Block Jacobi scratch requirement: measured in scalars, and uses twice as much (in bytes) per vector lane as the general path.
3775  scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3776  }
3777 
3778  int max_scratch = team_policy_type::scratch_size_max(0);
3779 
3780  if (scratch_required < max_scratch) {
3781  // Can use level 0 scratch
3782  ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3783  if (!use_fused_jacobi)
3784  function.run();
3785  else
3786  function.run_fused_jacobi();
3787  } else {
3788  // Not enough level 0 scratch, so fall back to level 1
3789  ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3790  if (!use_fused_jacobi)
3791  function.run();
3792  else
3793  function.run_fused_jacobi();
3794  }
3795  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3796 }
3797 
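The level-0 versus level-1 scratch decision made by performNumericPhase above can be reproduced in isolation. Below is a minimal standalone sketch (not Ifpack2 API; all names are illustrative): it sizes a blocksize x blocksize x nvec team-scratch view, compares the request against the level-0 budget reported by the TeamPolicy, and launches with whichever level fits.

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using exec_space   = Kokkos::DefaultExecutionSpace;
    using policy_type  = Kokkos::TeamPolicy<exec_space>;
    using scratch_view = Kokkos::View<double***, exec_space::scratch_memory_space,
                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;

    const int blocksize = 16, nvec = 8;
    const size_t required   = scratch_view::shmem_size(blocksize, blocksize, nvec);
    const size_t level0_max = static_cast<size_t>(policy_type::scratch_size_max(0));

    // Same decision as performNumericPhase: level 0 if the request fits, else level 1.
    const int scratch_level = (required < level0_max) ? 0 : 1;
    std::printf("need %zu bytes, level-0 max %zu -> level %d scratch\n",
                required, level0_max, scratch_level);

    policy_type policy(128, Kokkos::AUTO());
    policy.set_scratch_size(scratch_level, Kokkos::PerTeam(required));
    Kokkos::parallel_for(
        "scratch_demo", policy,
        KOKKOS_LAMBDA(const policy_type::member_type& member) {
          scratch_view WW(member.team_scratch(scratch_level), blocksize, blocksize, nvec);
          WW(0, 0, 0) = 1.0;  // touch the allocation
        });
  }
  Kokkos::finalize();
  return 0;
}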
3801 template <typename MatrixType>
3802 struct MultiVectorConverter {
3803  public:
3804  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3805  using execution_space = typename impl_type::execution_space;
3806  using memory_space = typename impl_type::memory_space;
3807 
3808  using local_ordinal_type = typename impl_type::local_ordinal_type;
3809  using impl_scalar_type = typename impl_type::impl_scalar_type;
3810  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3811  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3812  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3813  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3814  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3815  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3816  static constexpr int vector_length = impl_type::vector_length;
3817 
3818  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3819 
3820  private:
3821  // part interface
3822  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3823  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3824  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3825  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3826  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3827  const local_ordinal_type blocksize;
3828  const local_ordinal_type num_vectors;
3829 
3830  // packed multivector output (or input)
3831  vector_type_3d_view packed_multivector;
3832  const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3833 
3834  template <typename TagType>
3835  KOKKOS_INLINE_FUNCTION void copy_multivectors(const local_ordinal_type &j,
3836  const local_ordinal_type &vi,
3837  const local_ordinal_type &pri,
3838  const local_ordinal_type &ri0) const {
3839  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3840  for (local_ordinal_type i = 0; i < blocksize; ++i)
3841  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3842  }
3843 
3844  public:
3845  MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3846  const vector_type_3d_view &pmv)
3847  : partptr(interf.partptr)
3848  , packptr(interf.packptr)
3849  , part2packrowidx0(interf.part2packrowidx0)
3850  , part2rowidx0(interf.part2rowidx0)
3851  , lclrow(interf.lclrow)
3852  , blocksize(pmv.extent(1))
3853  , num_vectors(pmv.extent(2))
3854  , packed_multivector(pmv) {}
3855 
3856  // TODO: modify this routine similarly to the team-level functions
3857  KOKKOS_INLINE_FUNCTION
3858  void
3859  operator()(const local_ordinal_type &packidx) const {
3860  local_ordinal_type partidx = packptr(packidx);
3861  local_ordinal_type npacks = packptr(packidx + 1) - partidx;
3862  const local_ordinal_type pri0 = part2packrowidx0(partidx);
3863 
3864  local_ordinal_type ri0[vector_length] = {};
3865  local_ordinal_type nrows[vector_length] = {};
3866  for (local_ordinal_type v = 0; v < npacks; ++v, ++partidx) {
3867  ri0[v] = part2rowidx0(partidx);
3868  nrows[v] = part2rowidx0(partidx + 1) - ri0[v];
3869  }
3870  for (local_ordinal_type j = 0; j < nrows[0]; ++j) {
3871  local_ordinal_type cnt = 1;
3872  for (; cnt < npacks && j != nrows[cnt]; ++cnt)
3873  ;
3874  npacks = cnt;
3875  const local_ordinal_type pri = pri0 + j;
3876  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3877  for (local_ordinal_type i = 0; i < blocksize; ++i)
3878  for (local_ordinal_type v = 0; v < npacks; ++v)
3879  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0[v] + j) + i, col));
3880  }
3881  }
3882 
3883  KOKKOS_INLINE_FUNCTION
3884  void
3885  operator()(const member_type &member) const {
3886  const local_ordinal_type packidx = member.league_rank();
3887  const local_ordinal_type partidx_begin = packptr(packidx);
3888  const local_ordinal_type npacks = packptr(packidx + 1) - partidx_begin;
3889  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3890  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3891  const local_ordinal_type partidx = partidx_begin + v;
3892  const local_ordinal_type ri0 = part2rowidx0(partidx);
3893  const local_ordinal_type nrows = part2rowidx0(partidx + 1) - ri0;
3894 
3895  if (nrows == 1) {
3896  const local_ordinal_type pri = pri0;
3897  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
3898  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3899  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0) + i, col));
3900  });
3901  }
3902  } else {
3903  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3904  const local_ordinal_type pri = pri0 + j;
3905  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3906  for (local_ordinal_type i = 0; i < blocksize; ++i)
3907  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3908  });
3909  }
3910  });
3911  }
3912 
3913  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3914  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3915  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3916 
3917  scalar_multivector = scalar_multivector_;
3918  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3919  const local_ordinal_type vl = vector_length;
3920  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3921  Kokkos::parallel_for("MultiVectorConverter::TeamPolicy", policy, *this);
3922  } else {
3923  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3924  Kokkos::parallel_for("MultiVectorConverter::RangePolicy", policy, *this);
3925  }
3926  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3927  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3928  }
3929 };
3930 
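For readers unfamiliar with the interleaved layout targeted here, the following host-only sketch (illustrative, not Ifpack2 code) spells out the indexing performed by MultiVectorConverter::operator(): up to vector_length parts, sorted longest-first, are interleaved so that SIMD lane v of packed entry (j, i, col) holds row i of block row j of part v, with source rows permuted through lclrow. lanes plays the role of vector_length.

#include <vector>

void pack_multivector(const std::vector<double>& scalar_mv,  // (blocksize * nrows_total) x ncols, row-major
                      int ncols, int blocksize, int lanes, int npacks,
                      const std::vector<int>& lclrow,  // local row permutation
                      const std::vector<int>& ri0,     // first block row of each part in the pack
                      const std::vector<int>& nrows,   // block rows per part, descending
                      std::vector<double>& packed) {   // (j, i, col, lane), lane fastest
  int active = npacks;
  for (int j = 0; j < nrows[0]; ++j) {
    int cnt = 1;
    for (; cnt < active && j != nrows[cnt]; ++cnt) {
    }
    active = cnt;  // shorter parts drop out as j grows, exactly as in operator() above
    for (int col = 0; col < ncols; ++col)
      for (int i = 0; i < blocksize; ++i)
        for (int v = 0; v < active; ++v) {
          const int src = (blocksize * lclrow[ri0[v] + j] + i) * ncols + col;
          const int dst = ((j * blocksize + i) * ncols + col) * lanes + v;
          packed[dst] = scalar_mv[src];
        }
  }
}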
3931 ///
3932 /// solve tridiags
3933 ///
3934 template <typename T> struct SolveTridiagsDefaultModeAndAlgo;
3935 template <>
3936 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3937  typedef KB::Mode::Serial mode_type;
3938  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3939 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3940  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3941 #else
3942  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3943 #endif
3944  static int recommended_team_size(const int /* blksize */,
3945  const int /* vector_length */,
3946  const int /* internal_vector_length */) {
3947  return 1;
3948  }
3949 };
3950 
3951 #if defined(KOKKOS_ENABLE_CUDA)
3952 static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
3953  const int vector_length,
3954  const int internal_vector_length) {
3955  const int vector_size = vector_length / internal_vector_length;
3956  int total_team_size(0);
3957  if (blksize <= 5)
3958  total_team_size = 32;
3959  else if (blksize <= 9)
3960  total_team_size = 32; // 64
3961  else if (blksize <= 12)
3962  total_team_size = 96;
3963  else if (blksize <= 16)
3964  total_team_size = 128;
3965  else if (blksize <= 20)
3966  total_team_size = 160;
3967  else
3968  total_team_size = 160;
3969  return total_team_size / vector_size;
3970 }
3971 
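A worked example (illustrative only) of the heuristic above, assuming a typical configuration with vector_length = 8 and internal_vector_length = 4: vector_size = 8 / 4 = 2; blksize = 7 hits the blksize <= 9 branch, so total_team_size = 32; the returned team size is 32 / 2 = 16, i.e. 16 threads times 2 vector lanes fills one 32-thread warp.

// Sketch only; not part of Ifpack2.
inline int solve_tridiags_team_size_example() {
  return SolveTridiagsRecommendedCudaTeamSize(/*blksize=*/7,
                                              /*vector_length=*/8,
                                              /*internal_vector_length=*/4);  // == 16
}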
3972 template <>
3973 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3974  typedef KB::Mode::Team mode_type;
3975  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3976  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3977  static int recommended_team_size(const int blksize,
3978  const int vector_length,
3979  const int internal_vector_length) {
3980  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3981  }
3982 };
3983 template <>
3984 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3985  typedef KB::Mode::Team mode_type;
3986  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3987  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3988  static int recommended_team_size(const int blksize,
3989  const int vector_length,
3990  const int internal_vector_length) {
3991  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3992  }
3993 };
3994 #endif
3995 
3996 #if defined(KOKKOS_ENABLE_HIP)
3997 static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
3998  const int vector_length,
3999  const int internal_vector_length) {
4000  const int vector_size = vector_length / internal_vector_length;
4001  int total_team_size(0);
4002  if (blksize <= 5)
4003  total_team_size = 32;
4004  else if (blksize <= 9)
4005  total_team_size = 32; // 64
4006  else if (blksize <= 12)
4007  total_team_size = 96;
4008  else if (blksize <= 16)
4009  total_team_size = 128;
4010  else if (blksize <= 20)
4011  total_team_size = 160;
4012  else
4013  total_team_size = 160;
4014  return total_team_size / vector_size;
4015 }
4016 
4017 template <>
4018 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
4019  typedef KB::Mode::Team mode_type;
4020  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4021  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4022  static int recommended_team_size(const int blksize,
4023  const int vector_length,
4024  const int internal_vector_length) {
4025  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
4026  }
4027 };
4028 template <>
4029 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
4030  typedef KB::Mode::Team mode_type;
4031  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4032  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4033  static int recommended_team_size(const int blksize,
4034  const int vector_length,
4035  const int internal_vector_length) {
4036  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
4037  }
4038 };
4039 #endif
4040 
4041 #if defined(KOKKOS_ENABLE_SYCL)
4042 static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
4043  const int vector_length,
4044  const int internal_vector_length) {
4045  const int vector_size = vector_length / internal_vector_length;
4046  int total_team_size(0);
4047  if (blksize <= 5)
4048  total_team_size = 32;
4049  else if (blksize <= 9)
4050  total_team_size = 32; // 64
4051  else if (blksize <= 12)
4052  total_team_size = 96;
4053  else if (blksize <= 16)
4054  total_team_size = 128;
4055  else if (blksize <= 20)
4056  total_team_size = 160;
4057  else
4058  total_team_size = 160;
4059  return total_team_size / vector_size;
4060 }
4061 
4062 template <>
4063 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
4064  typedef KB::Mode::Team mode_type;
4065  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4066  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4067  static int recommended_team_size(const int blksize,
4068  const int vector_length,
4069  const int internal_vector_length) {
4070  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4071  }
4072 };
4073 template <>
4074 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
4075  typedef KB::Mode::Team mode_type;
4076  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4077  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4078  static int recommended_team_size(const int blksize,
4079  const int vector_length,
4080  const int internal_vector_length) {
4081  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4082  }
4083 };
4084 #endif
4085 
4086 template <typename MatrixType>
4087 struct SolveTridiags {
4088  public:
4089  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4090  using execution_space = typename impl_type::execution_space;
4091 
4092  using local_ordinal_type = typename impl_type::local_ordinal_type;
4093  using size_type = typename impl_type::size_type;
4094  using impl_scalar_type = typename impl_type::impl_scalar_type;
4095  using magnitude_type = typename impl_type::magnitude_type;
4096  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
4097  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
4099  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4100  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
4101  using size_type_2d_view = typename impl_type::size_type_2d_view;
4103  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4104  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
4105  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
4106  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
4107 
4108  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4109 
4110  using internal_vector_type = typename impl_type::internal_vector_type;
4111  static constexpr int vector_length = impl_type::vector_length;
4112  static constexpr int internal_vector_length = impl_type::internal_vector_length;
4113 
4115  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4116  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
4117 
4119  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4120  using member_type = typename team_policy_type::member_type;
4121 
4122  private:
4123  // part interface
4124  local_ordinal_type n_subparts_per_part;
4125  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4126  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4127  const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4128  const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4129  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4130  const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4131  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4132  const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4133 
4134  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4135  const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4136 
4137  // block tridiags
4138  const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4139 
4140  // block tridiags values
4141  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4142  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4143  const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4144 
4145  internal_vector_type_4d_view X_internal_vector_values_schur;
4146 
4147  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4148  const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4149 
4150  const local_ordinal_type vector_loop_size;
4151 
4152  // copy to multivectors : damping factor and Y_scalar_multivector
4153  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
4154 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4155  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4156 #else
4157  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4158 #endif
4159  const impl_scalar_type df;
4160  const bool compute_diff;
4161 
4162  public:
4163  SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
4164  const BlockTridiags<MatrixType> &btdm,
4165  const vector_type_3d_view &pmv,
4166  const impl_scalar_type damping_factor,
4167  const bool is_norm_manager_active)
4168  : // interface
4169  n_subparts_per_part(interf.n_subparts_per_part)
4170  , partptr(interf.partptr)
4171  , packptr(interf.packptr)
4172  , packindices_sub(interf.packindices_sub)
4173  , packindices_schur(interf.packindices_schur)
4174  , part2packrowidx0(interf.part2packrowidx0)
4175  , part2packrowidx0_sub(interf.part2packrowidx0_sub)
4176  , lclrow(interf.lclrow)
4177  , packptr_sub(interf.packptr_sub)
4178  , partptr_sub(interf.partptr_sub)
4179  , pack_td_ptr_schur(btdm.pack_td_ptr_schur)
4180  ,
4181  // block tridiags and multivector
4182  pack_td_ptr(btdm.pack_td_ptr)
4183  , D_internal_vector_values((internal_vector_type *)btdm.values.data(),
4184  btdm.values.extent(0),
4185  btdm.values.extent(1),
4186  btdm.values.extent(2),
4187  vector_length / internal_vector_length)
4188  , X_internal_vector_values((internal_vector_type *)pmv.data(),
4189  pmv.extent(0),
4190  pmv.extent(1),
4191  pmv.extent(2),
4192  vector_length / internal_vector_length)
4193  , X_internal_scalar_values((btdm_scalar_type *)pmv.data(),
4194  pmv.extent(0),
4195  pmv.extent(1),
4196  pmv.extent(2),
4197  vector_length)
4198  , X_internal_vector_values_schur(do_not_initialize_tag("X_internal_vector_values_schur"),
4199  2 * (n_subparts_per_part - 1) * part2packrowidx0_sub.extent(0),
4200  pmv.extent(1),
4201  pmv.extent(2),
4202  vector_length / internal_vector_length)
4203  , D_internal_vector_values_schur((internal_vector_type *)btdm.values_schur.data(),
4204  btdm.values_schur.extent(0),
4205  btdm.values_schur.extent(1),
4206  btdm.values_schur.extent(2),
4207  vector_length / internal_vector_length)
4208  , e_internal_vector_values((internal_vector_type *)btdm.e_values.data(),
4209  btdm.e_values.extent(0),
4210  btdm.e_values.extent(1),
4211  btdm.e_values.extent(2),
4212  btdm.e_values.extent(3),
4213  vector_length / internal_vector_length)
4214  , vector_loop_size(vector_length / internal_vector_length)
4215  , Y_scalar_multivector()
4216  , Z_scalar_vector()
4217  , df(damping_factor)
4218  , compute_diff(is_norm_manager_active) {}
4219 
4220  public:
4222  KOKKOS_INLINE_FUNCTION
4223  void
4224  copyToFlatMultiVector(const member_type &member,
4225  const local_ordinal_type partidxbeg, // partidx for v = 0
4226  const local_ordinal_type npacks,
4227  const local_ordinal_type pri0,
4228  const local_ordinal_type v, // index within a loop of length vector_loop_size
4229  const local_ordinal_type blocksize,
4230  const local_ordinal_type num_vectors) const {
4231  const local_ordinal_type vbeg = v * internal_vector_length;
4232  if (vbeg < npacks) {
4233  local_ordinal_type ri0_vals[internal_vector_length] = {};
4234  local_ordinal_type nrows_vals[internal_vector_length] = {};
4235  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4236  const local_ordinal_type partidx = partidxbeg + vv;
4237  ri0_vals[vi] = partptr(partidx);
4238  nrows_vals[vi] = partptr(partidx + 1) - ri0_vals[vi];
4239  }
4240 
4241  impl_scalar_type z_partial_sum(0);
4242  if (nrows_vals[0] == 1) {
4243  const local_ordinal_type j = 0, pri = pri0;
4244  {
4245  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4246  const local_ordinal_type ri0 = ri0_vals[vi];
4247  const local_ordinal_type nrows = nrows_vals[vi];
4248  if (j < nrows) {
4249  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
4250  [&](const local_ordinal_type &i) {
4251  const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4252  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4253  impl_scalar_type &y = Y_scalar_multivector(row, col);
4254  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4255  y += df * yd;
4256 
4257  { // if (compute_diff) {
4258 #if KOKKOS_VERSION >= 40799
4259  const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4260 #else
4261  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4262 #endif
4263  z_partial_sum += yd_abs * yd_abs;
4264  }
4265  }
4266  });
4267  }
4268  }
4269  }
4270  } else {
4271  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows_vals[0]),
4272  [&](const local_ordinal_type &j) {
4273  const local_ordinal_type pri = pri0 + j;
4274  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4275  const local_ordinal_type ri0 = ri0_vals[vi];
4276  const local_ordinal_type nrows = nrows_vals[vi];
4277  if (j < nrows) {
4278  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4279  for (local_ordinal_type i = 0; i < blocksize; ++i) {
4280  const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4281  impl_scalar_type &y = Y_scalar_multivector(row, col);
4282  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4283  y += df * yd;
4284 
4285  { // if (compute_diff) {
4286 #if KOKKOS_VERSION >= 40799
4287  const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4288 #else
4289  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4290 #endif
4291  z_partial_sum += yd_abs * yd_abs;
4292  }
4293  }
4294  }
4295  }
4296  }
4297  });
4298  }
4299  // if (compute_diff)
4300  Z_scalar_vector(member.league_rank()) += z_partial_sum;
4301  }
4302  }
4303 
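The per-entry update in copyToFlatMultiVector above is a damped Jacobi correction fused with a norm accumulation. A scalar sketch (illustrative, not Ifpack2 API): the packed solve result x is folded into y by a damped step, and the squared correction is returned so the caller can accumulate it, as the Z_scalar_vector accumulator does for the norm-based termination test.

inline double damped_update(double& y, const double x, const double df) {
  const double yd = x - y;  // correction proposed by the tridiagonal solve
  y += df * yd;             // y <- y + df * (x - y); df is the damping factor
  return yd * yd;           // contribution to ||x - y||^2
}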
4307  template <typename WWViewType>
4308  KOKKOS_INLINE_FUNCTION void
4309  solveSingleVector(const member_type &member,
4310  const local_ordinal_type &blocksize,
4311  const local_ordinal_type &i0,
4312  const local_ordinal_type &r0,
4313  const local_ordinal_type &nrows,
4314  const local_ordinal_type &v,
4315  const WWViewType &WW) const {
4316  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4317 
4318  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4319  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4320 
4321  // base pointers
4322  auto A = D_internal_vector_values.data();
4323  auto X = X_internal_vector_values.data();
4324 
4325  // constant
4326 #if KOKKOS_VERSION >= 40799
4327  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4328 #else
4329  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4330 #endif
4331 #if KOKKOS_VERSION >= 40799
4332  const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4333 #else
4334  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4335 #endif
4336  // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4337 
4338  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4339  const local_ordinal_type astep = D_internal_vector_values.stride(0);
4340  const local_ordinal_type as0 = D_internal_vector_values.stride(1); // blocksize*vector_length;
4341  const local_ordinal_type as1 = D_internal_vector_values.stride(2); // vector_length;
4342  const local_ordinal_type xstep = X_internal_vector_values.stride(0);
4343  const local_ordinal_type xs0 = X_internal_vector_values.stride(1); // vector_length;
4344 
4345  // move to starting point
4346  A += i0 * astep + v;
4347  X += r0 * xstep + v;
4348 
4349  // for (local_ordinal_type col=0;col<num_vectors;++col)
4350  if (nrows > 1) {
4351  // solve Lx = x
4352  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4353  member,
4354  KB::Diag::Unit,
4355  blocksize, blocksize,
4356  one,
4357  A, as0, as1,
4358  X, xs0);
4359 
4360  for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
4361  member.team_barrier();
4362  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4363  member,
4364  blocksize, blocksize,
4365  -one,
4366  A + 2 * astep, as0, as1,
4367  X, xs0,
4368  one,
4369  X + 1 * xstep, xs0);
4370  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4371  member,
4372  KB::Diag::Unit,
4373  blocksize, blocksize,
4374  one,
4375  A + 3 * astep, as0, as1,
4376  X + 1 * xstep, xs0);
4377 
4378  A += 3 * astep;
4379  X += 1 * xstep;
4380  }
4381 
4382  // solve Ux = x
4383  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4384  member,
4385  KB::Diag::NonUnit,
4386  blocksize, blocksize,
4387  one,
4388  A, as0, as1,
4389  X, xs0);
4390 
4391  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4392  A -= 3 * astep;
4393  member.team_barrier();
4394  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4395  member,
4396  blocksize, blocksize,
4397  -one,
4398  A + 1 * astep, as0, as1,
4399  X, xs0,
4400  one,
4401  X - 1 * xstep, xs0);
4402  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4403  member,
4404  KB::Diag::NonUnit,
4405  blocksize, blocksize,
4406  one,
4407  A, as0, as1,
4408  X - 1 * xstep, xs0);
4409  X -= 1 * xstep;
4410  }
4411  // for multiple rhs
4412  // X += xs1;
4413  } else {
4414  const local_ordinal_type ws0 = WW.stride(0);
4415  auto W = WW.data() + v;
4416  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
4417  member, blocksize, X, xs0, W, ws0);
4418  member.team_barrier();
4419  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4420  member,
4421  blocksize, blocksize,
4422  one,
4423  A, as0, as1,
4424  W, xs0,
4425  zero,
4426  X, xs0);
4427  }
4428  }
4429 
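To see the sweep structure of solveSingleVector without the block and SIMD machinery, here is a structure-only sketch on scalar "blocks" (illustrative; the real code applies batched Trsv/Gemv to blocksize x blocksize factors). Assumed storage, matching the A += 3*astep walk above: consecutive triples per row, F[3r] = factored diagonal u_r, F[3r+1] = superdiagonal c_r, F[3r+2] = lower factor l_{r+1}. With scalar entries the unit-lower Trsv degenerates to a no-op, leaving the familiar forward/backward substitution.

#include <vector>

inline void tridiag_lu_solve(const std::vector<double>& F,  // 3*nrows - 2 factor entries
                             std::vector<double>& x,        // rhs in, solution out
                             const int nrows) {
  // Forward sweep: x_r -= l_r * x_{r-1}  (the GEMV with A + 2*astep in the block code).
  for (int r = 1; r < nrows; ++r)
    x[r] -= F[3 * (r - 1) + 2] * x[r - 1];
  // Backward sweep: divide by U's diagonal, then eliminate the superdiagonal coupling
  // (the GEMV with A + 1*astep in the block code).
  x[nrows - 1] /= F[3 * (nrows - 1)];
  for (int r = nrows - 2; r >= 0; --r)
    x[r] = (x[r] - F[3 * r + 1] * x[r + 1]) / F[3 * r];
}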
4430  template <typename WWViewType>
4431  KOKKOS_INLINE_FUNCTION void
4432  solveMultiVector(const member_type &member,
4433  const local_ordinal_type & /* blocksize */,
4434  const local_ordinal_type &i0,
4435  const local_ordinal_type &r0,
4436  const local_ordinal_type &nrows,
4437  const local_ordinal_type &v,
4438  const WWViewType &WW) const {
4439  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4440 
4441  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4442  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4443 
4444  // constant
4445 #if KOKKOS_VERSION >= 40799
4446  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4447 #else
4448  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4449 #endif
4450 #if KOKKOS_VERSION >= 40799
4451  const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4452 #else
4453  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4454 #endif
4455 
4456  // subview pattern
4457  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4458  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4459  auto X2 = X1;
4460 
4461  local_ordinal_type i = i0, r = r0;
4462 
4463  if (nrows > 1) {
4464  // solve Lx = x
4465  KB::Trsm<member_type,
4466  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4467  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4468  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
4469  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
4470  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
4471  member.team_barrier();
4472  KB::Gemm<member_type,
4473  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4474  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4475  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
4476  KB::Trsm<member_type,
4477  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4478  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4479  X1.assign_data(X2.data());
4480  }
4481 
4482  // solve Ux = x
4483  KB::Trsm<member_type,
4484  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4485  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4486  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4487  i -= 3;
4488  A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
4489  X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
4490  member.team_barrier();
4491  KB::Gemm<member_type,
4492  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4493  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4494 
4495  A.assign_data(&D_internal_vector_values(i, 0, 0, v));
4496  KB::Trsm<member_type,
4497  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4498  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4499  X1.assign_data(X2.data());
4500  }
4501  } else {
4502  // matrix is already inverted
4503  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4504  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
4505  member.team_barrier();
4506  KB::Gemm<member_type,
4507  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4508  default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
4509  }
4510  }
4511 
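A brief note on the view-rebinding idiom in solveMultiVector above: assign_data swaps the base pointer of an existing Kokkos::View while keeping its extents and strides, so the row loop walks the packed storage without constructing a new subview each iteration. A minimal sketch (illustrative):

#include <Kokkos_Core.hpp>

template <typename View4D>
KOKKOS_INLINE_FUNCTION void walk_diag_blocks(const View4D& vals, const int v, const int n) {
  auto blk = Kokkos::subview(vals, 0, Kokkos::ALL(), Kokkos::ALL(), v);
  for (int i = 1; i < n; ++i) {
    blk.assign_data(&vals(i, 0, 0, v));  // same shape/strides, new base pointer
    // ... apply a batched kernel to blk here ...
  }
}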
4512  template <int B>
4513  struct SingleVectorTag {};
4514  template <int B>
4515  struct MultiVectorTag {};
4516 
4517  template <int B>
4518  struct SingleVectorSubLineTag {};
4519  template <int B>
4520  struct MultiVectorSubLineTag {};
4521  template <int B>
4522  struct SingleVectorApplyCTag {};
4523  template <int B>
4524  struct MultiVectorApplyCTag {};
4525  template <int B>
4526  struct SingleVectorSchurTag {};
4527  template <int B>
4528  struct MultiVectorSchurTag {};
4529  template <int B>
4530  struct SingleVectorApplyETag {};
4531  template <int B>
4532  struct MultiVectorApplyETag {};
4533  template <int B>
4534  struct SingleVectorCopyToFlatTag {};
4535  template <int B>
4536  struct SingleZeroingTag {};
4537 
4538  template <int B>
4539  KOKKOS_INLINE_FUNCTION void
4540  operator()(const SingleVectorTag<B> &, const member_type &member) const {
4541  const local_ordinal_type packidx = member.league_rank();
4542  const local_ordinal_type partidx = packptr(packidx);
4543  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4544  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4545  const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4546  const local_ordinal_type r0 = part2packrowidx0(partidx);
4547  const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4548  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4549  const local_ordinal_type num_vectors = 1;
4550  internal_vector_scratch_type_3d_view
4551  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4552  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4553  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4554  });
4555  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4556  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4557  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4558  });
4559  }
4560 
4561  template <int B>
4562  KOKKOS_INLINE_FUNCTION void
4563  operator()(const MultiVectorTag<B> &, const member_type &member) const {
4564  const local_ordinal_type packidx = member.league_rank();
4565  const local_ordinal_type partidx = packptr(packidx);
4566  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4567  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4568  const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4569  const local_ordinal_type r0 = part2packrowidx0(partidx);
4570  const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4571  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4572  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4573 
4574  internal_vector_scratch_type_3d_view
4575  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4576  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4577  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4578  });
4579  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4580  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4581  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4582  });
4583  }
4584 
4585  template <int B>
4586  KOKKOS_INLINE_FUNCTION void
4587  operator()(const SingleVectorSubLineTag<B> &, const member_type &member) const {
4588  // btdm is packed and its parts are sorted from largest to smallest
4589  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4590 
4591  const local_ordinal_type subpartidx = packptr_sub(packidx);
4592  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4593  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4594  const local_ordinal_type partidx = subpartidx % n_parts;
4595 
4596  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
4597  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4598  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4599  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4600  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4601 
4602  //(void) i0;
4603  //(void) nrows;
4604  (void)npacks;
4605 
4606  internal_vector_scratch_type_3d_view
4607  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4608 
4609  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4610  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4611  });
4612  }
4613 
4614  template <int B>
4615  KOKKOS_INLINE_FUNCTION void
4616  operator()(const SingleVectorApplyCTag<B> &, const member_type &member) const {
4617  // btdm is packed and its parts are sorted from largest to smallest
4618  // const local_ordinal_type packidx = packindices_schur(member.league_rank());
4619  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4620 
4621  const local_ordinal_type subpartidx = packptr_sub(packidx);
4622  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4623  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4624  const local_ordinal_type partidx = subpartidx % n_parts;
4625  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4626 
4627  // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4628  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4629  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4630  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4631 
4632  internal_vector_scratch_type_3d_view
4633  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4634 
4635  // Compute v_2 = v_2 - C v_1
4636 
4637  const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
4638  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
4639  const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0 + 2 : i0 + 2;
4640 
4641  (void)i0_schur;
4642  (void)i0_offset;
4643 
4644 #if KOKKOS_VERSION >= 40799
4645  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4646 #else
4647  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4648 #endif
4649 
4650  const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx) - 2 : 0;
4651  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx + 1) + 1;
4652 
4653  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4654 
4655  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4656  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4657 
4658  if (local_subpartidx == 0) {
4659  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4660  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), 0, v);
4661  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4662  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4663 
4664  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4665  member,
4666  blocksize, blocksize,
4667  -one,
4668  C.data(), C.stride(0), C.stride(1),
4669  v_1.data(), v_1.stride(0),
4670  one,
4671  v_2.data(), v_2.stride(0));
4672  });
4673  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4674  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4675  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4676  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4677  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4678 
4679  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4680  member,
4681  blocksize, blocksize,
4682  -one,
4683  C.data(), C.stride(0), C.stride(1),
4684  v_1.data(), v_1.stride(0),
4685  one,
4686  v_2.data(), v_2.stride(0));
4687  });
4688  } else {
4689  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4690  {
4691  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), 0, v);
4692  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4693  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4694 
4695  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4696  member,
4697  blocksize, blocksize,
4698  -one,
4699  C.data(), C.stride(0), C.stride(1),
4700  v_1.data(), v_1.stride(0),
4701  one,
4702  v_2.data(), v_2.stride(0));
4703  }
4704  {
4705  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4706  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4707  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4708 
4709  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4710  member,
4711  blocksize, blocksize,
4712  -one,
4713  C.data(), C.stride(0), C.stride(1),
4714  v_1.data(), v_1.stride(0),
4715  one,
4716  v_2.data(), v_2.stride(0));
4717  }
4718  });
4719  }
4720  }
4721 
4722  template <int B>
4723  KOKKOS_INLINE_FUNCTION void
4724  operator()(const SingleVectorSchurTag<B> &, const member_type &member) const {
4725  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4726 
4727  const local_ordinal_type partidx = packptr_sub(packidx);
4728 
4729  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4730 
4731  const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx, 0);
4732  const local_ordinal_type nrows = 2 * (n_subparts_per_part - 1);
4733 
4734  const local_ordinal_type r0_schur = nrows * member.league_rank();
4735 
4736  internal_vector_scratch_type_3d_view
4737  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4738 
4739  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4740  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4741  for (local_ordinal_type i = 0; i < 2; ++i) {
4742  copy3DView<local_ordinal_type>(member,
4743  Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4744  Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4745  }
4746  }
4747 
4748  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4749  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4750  });
4751 
4752  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4753  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4754  for (local_ordinal_type i = 0; i < 2; ++i) {
4755  copy3DView<local_ordinal_type>(member,
4756  Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4757  Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4758  }
4759  }
4760  }
4761 
4762  template <int B>
4763  KOKKOS_INLINE_FUNCTION void
4764  operator()(const SingleVectorApplyETag<B> &, const member_type &member) const {
4765  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4766 
4767  const local_ordinal_type subpartidx = packptr_sub(packidx);
4768  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4769  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4770  const local_ordinal_type partidx = subpartidx % n_parts;
4771  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4772 
4773  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4774  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4775 
4776  internal_vector_scratch_type_3d_view
4777  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4778 
4779  // Compute v_1 = v_1 - E v_2
4780 
4781 #if KOKKOS_VERSION >= 40799
4782  const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4783 #else
4784  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4785 #endif
4786 
4787  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4788 
4789  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4790  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4791 
4792  if (local_subpartidx == 0) {
4793  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4794  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4795 
4796  for (local_ordinal_type row = 0; row < nrows; ++row) {
4797  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4798  auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4799 
4800  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4801  member,
4802  blocksize, blocksize,
4803  -one,
4804  E.data(), E.stride(0), E.stride(1),
4805  v_2.data(), v_2.stride(0),
4806  one,
4807  v_1.data(), v_1.stride(0));
4808  }
4809  });
4810  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4811  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4812  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4813 
4814  for (local_ordinal_type row = 0; row < nrows; ++row) {
4815  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4816  auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4817 
4818  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4819  member,
4820  blocksize, blocksize,
4821  -one,
4822  E.data(), E.stride(0), E.stride(1),
4823  v_2.data(), v_2.stride(0),
4824  one,
4825  v_1.data(), v_1.stride(0));
4826  }
4827  });
4828  } else {
4829  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4830  {
4831  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4832 
4833  for (local_ordinal_type row = 0; row < nrows; ++row) {
4834  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4835  auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4836 
4837  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4838  member,
4839  blocksize, blocksize,
4840  -one,
4841  E.data(), E.stride(0), E.stride(1),
4842  v_2.data(), v_2.stride(0),
4843  one,
4844  v_1.data(), v_1.stride(0));
4845  }
4846  }
4847  {
4848  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4849 
4850  for (local_ordinal_type row = 0; row < nrows; ++row) {
4851  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4852  auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4853 
4854  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4855  member,
4856  blocksize, blocksize,
4857  -one,
4858  E.data(), E.stride(0), E.stride(1),
4859  v_2.data(), v_2.stride(0),
4860  one,
4861  v_1.data(), v_1.stride(0));
4862  }
4863  }
4864  });
4865  }
4866  }
4867 
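Taken together, the SingleVectorSubLineTag, SingleVectorApplyCTag, SingleVectorSchurTag, and SingleVectorApplyETag kernels above realize a two-level solve of each line that has been split into sub-lines. In 2x2 block form (a sketch of the algebra in our own notation; the code handles unsymmetric coupling and many interior/interface blocks per part):

\begin{aligned}
\begin{pmatrix} A & B \\ C & D \end{pmatrix}
\begin{pmatrix} x_1 \\ x_2 \end{pmatrix}
&= \begin{pmatrix} f_1 \\ f_2 \end{pmatrix}
&& \text{(1 = sub-line interiors, 2 = interface rows)} \\
\text{SubLineTag:}\quad v_1 &= A^{-1} f_1
&& \text{(independent block-tridiagonal solves)} \\
\text{ApplyCTag:}\quad v_2 &= f_2 - C\,v_1 \\
\text{SchurTag:}\quad x_2 &= S^{-1} v_2, \quad S = D - C A^{-1} B
&& \text{(small coupled interface system)} \\
\text{ApplyETag:}\quad x_1 &= v_1 - E\,x_2, \quad E = A^{-1} B
&& (E \text{ precomputed in the numeric phase})
\end{aligned}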
4868  template <int B>
4869  KOKKOS_INLINE_FUNCTION void
4870  operator()(const SingleVectorCopyToFlatTag<B> &, const member_type &member) const {
4871  const local_ordinal_type packidx = member.league_rank();
4872  const local_ordinal_type partidx = packptr(packidx);
4873  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4874  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4875  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4876  const local_ordinal_type num_vectors = 1;
4877 
4878  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4879  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4880  });
4881  }
4882 
4883  template <int B>
4884  KOKKOS_INLINE_FUNCTION void
4885  operator()(const SingleZeroingTag<B> &, const member_type &member) const {
4886  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4887  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4888  });
4889  }
4890 
4891  void run(const impl_scalar_type_2d_view_tpetra &Y,
4892  const impl_scalar_type_1d_view &Z) {
4893  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4894  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4895 
4897  this->Y_scalar_multivector = Y;
4898  this->Z_scalar_vector = Z;
4899 
4900  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4901  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4902 
4903  const local_ordinal_type team_size =
4904  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4905  recommended_team_size(blocksize, vector_length, internal_vector_length);
4906  const int per_team_scratch = internal_vector_scratch_type_3d_view::shmem_size(blocksize, num_vectors, vector_loop_size);
4907 
4908 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4909 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4910  if (num_vectors == 1) { \
4911  const Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4912  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4913  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4914  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)), *this); \
4915  } else { \
4916  const Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4917  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4918  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4919  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)), *this); \
4920  } \
4921  break
4922 #else
4923 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4924  if (num_vectors == 1) { \
4925  if (packindices_schur.extent(1) <= 0) { \
4926  Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4927  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4928  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4929  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4930  policy, *this); \
4931  } else { \
4932  { \
4933  Kokkos::TeamPolicy<execution_space, SingleZeroingTag<B>> \
4934  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4935  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4936  policy, *this); \
4937  } \
4938  { \
4939  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4940  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4941  Kokkos::TeamPolicy<execution_space, SingleVectorSubLineTag<B>> \
4942  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4943  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4944  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4945  policy, *this); \
4946  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4947  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4948  } \
4949  { \
4950  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4951  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4952  Kokkos::TeamPolicy<execution_space, SingleVectorApplyCTag<B>> \
4953  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4954  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4955  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4956  policy, *this); \
4957  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4958  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4959  } \
4960  { \
4961  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4962  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4963  Kokkos::TeamPolicy<execution_space, SingleVectorSchurTag<B>> \
4964  policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4965  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4966  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4967  policy, *this); \
4968  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4969  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4970  } \
4971  { \
4972  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4973  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4974  Kokkos::TeamPolicy<execution_space, SingleVectorApplyETag<B>> \
4975  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4976  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4977  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4978  policy, *this); \
4979  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4980  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4981  } \
4982  { \
4983  Kokkos::TeamPolicy<execution_space, SingleVectorCopyToFlatTag<B>> \
4984  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4985  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4986  policy, *this); \
4987  } \
4988  } \
4989  } else { \
4990  Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4991  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4992  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4993  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4994  policy, *this); \
4995  } \
4996  break
4997 #endif
4998  switch (blocksize) {
4999  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(3);
5000  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(5);
5001  case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(6);
5002  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(7);
5003  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
5004  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
5005  case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
5006  case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
5007  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
5008  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
5009  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
5010  case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
5011  default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(0);
5012  }
5013 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
5014 
5015  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
5016  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
5017  }
5018 };
5019 
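The blocksize switch in run() above promotes a runtime block size to the compile-time parameter B, so the batched kernels can be fully unrolled for the common sizes, with B = 0 as the read-the-extent-at-runtime fallback. A minimal sketch of the dispatch pattern (illustrative; dispatch_on_blocksize is not an Ifpack2 function):

#include <type_traits>

template <typename Functor>
void dispatch_on_blocksize(const int blocksize, Functor&& f) {
  switch (blocksize) {
    case 3: f(std::integral_constant<int, 3>{}); break;
    case 5: f(std::integral_constant<int, 5>{}); break;
    case 7: f(std::integral_constant<int, 7>{}); break;
    // ... one case per specialized size ...
    default: f(std::integral_constant<int, 0>{}); break;  // 0 = generic fallback
  }
}

// usage: dispatch_on_blocksize(bs, [&](auto B) { launch_kernel<decltype(B)::value>(); });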
5023 template <typename MatrixType>
5024 int applyInverseJacobi( // importer
5025  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
5026  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
5027  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5028  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5029  const bool overlap_communication_and_computation,
5030  // tpetra interface
5031  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5032  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5033  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
5034  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5035  // local object interface
5036  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5037  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5038  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5039  /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
5040  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5041  // preconditioner parameters
5042  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5043  /* */ bool is_y_zero,
5044  const int max_num_sweeps,
5045  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5046  const int check_tol_every) {
5047  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
5048 
5049  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5050  using node_memory_space = typename impl_type::node_memory_space;
5051  using local_ordinal_type = typename impl_type::local_ordinal_type;
5052  using size_type = typename impl_type::size_type;
5053  using impl_scalar_type = typename impl_type::impl_scalar_type;
5054  using magnitude_type = typename impl_type::magnitude_type;
5055  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
5056  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
5057  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
5058  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
5059 
5060  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5061 
5062  // the tpetra importer and async importer cannot both be active
5063  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5064  "Both the Tpetra importer and the async importer are non-null; at most one may be active.");
5065  // the maximum number of sweeps must be a positive number
5066  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5067  "Maximum number of sweeps must be >= 1.");
5068 
5069  // const parameters
5070  const bool is_seq_method_requested = !tpetra_importer.is_null();
5071  const bool is_async_importer_active = !async_importer.is_null();
5072 #if KOKKOS_VERSION >= 40799
5073  const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
5074 #else
5075  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5076 #endif
5077  const magnitude_type tolerance = tol * tol;
5078  const local_ordinal_type blocksize = btdm.values.extent(1);
5079  const local_ordinal_type num_vectors = Y.getNumVectors();
5080  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
5081 
5082  const impl_scalar_type zero(0.0);
5083 
5084  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
5085  "The seq method for applyInverseJacobi, "
5086  << "which in any case is for developer use only, "
5087  << "does not support norm-based termination.");
5088  const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
5089  Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
5090  TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
5091  std::invalid_argument,
5092  "The seq method for applyInverseJacobi, "
5093  << "which in any case is for developer use only, "
5094  << "only supports memory spaces accessible from host.");
5095 
5096  // if more workspace is needed, resize it
5097  const size_type work_span_required = num_blockrows * num_vectors * blocksize;
5098  if (work.span() < work_span_required)
5099  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
5100 
5101  // construct W
5102  const local_ordinal_type W_size = interf.packptr.extent(0) - 1;
5103  if (local_ordinal_type(W.extent(0)) < W_size)
5104  W = impl_scalar_type_1d_view("W", W_size);
5105 
5106  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5107  {
5108  if (is_seq_method_requested) {
5109  if (Z.getNumVectors() != Y.getNumVectors())
5110  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
5111  } else {
5112  if (is_async_importer_active) {
5113  // create comm data buffer and keep it here
5114  async_importer->createDataBuffer(num_vectors);
5115  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5116  }
5117  }
5118  }
5119 
5120  // wrap the workspace with 3d view
5121  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
5122  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5123  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5124  const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
5125  if (is_y_zero) Kokkos::deep_copy(YY, zero);
5126 
5127  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
5128  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
5129  damping_factor, is_norm_manager_active);
5130 
5131  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
5132 
5133  auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
5134  auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
5135 
5136  bool hasBlockCrsMatrix = !A_bcrs.is_null();
5137 
5138  // It is OK here to use the graph of the A_crs matrix with a block size of 1
5139  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
5140 
5141  BlockHelperDetails::ComputeResidualVector<MatrixType>
5142  compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5143  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5144  hasBlockCrsMatrix);
5145 
5146  // set how often the norm manager checks for convergence
5147  if (is_norm_manager_active)
5148  norm_manager.setCheckFrequency(check_tol_every);
5149 
5150  // iterate
5151  int sweep = 0;
5152  for (; sweep < max_num_sweeps; ++sweep) {
5153  {
5154  if (is_y_zero) {
5155  // pmv := x(lclrow)
5156  multivector_converter.run(XX);
5157  } else {
5158  if (is_seq_method_requested) {
5159  // SEQ METHOD IS TESTING ONLY
5160 
5161  // y := x - R y
5162  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5163  compute_residual_vector.run(YY, XX, ZZ);
5164 
5165  // pmv := y(lclrow).
5166  multivector_converter.run(YY);
5167  } else {
5168  // fused y := x - R y and pmv := y(lclrow);
5169  // real use case does not use overlap comp and comm
5170  if (overlap_communication_and_computation || !is_async_importer_active) {
5171  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5172  // OverlapTag, compute_owned = true
5173  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
5174  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5175  if (is_async_importer_active) async_importer->cancel();
5176  break;
5177  }
5178  if (is_async_importer_active) {
5179  async_importer->syncRecv();
5180  // OverlapTag, compute_owned = false
5181  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
5182  }
5183  } else {
5184  if (is_async_importer_active)
5185  async_importer->syncExchange(YY);
5186  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5187  // AsyncTag
5188  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5189  }
5190  }
5191  }
5192  }
5193 
5194  // pmv := inv(D) pmv.
5195  {
5196  solve_tridiags.run(YY, W);
5197  }
5198  {
5199  if (is_norm_manager_active) {
5200  // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
5201  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5202  if (sweep + 1 == max_num_sweeps) {
5203  norm_manager.ireduce(sweep, true);
5204  norm_manager.checkDone(sweep + 1, tolerance, true);
5205  } else {
5206  norm_manager.ireduce(sweep);
5207  }
5208  }
5209  }
5210  is_y_zero = false;
5211  }
5212 
5213  // sqrt the norms for the caller's use.
5214  if (is_norm_manager_active) norm_manager.finalize();
5215 
5216  return sweep;
5217 }
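// --- Editor's illustrative sketch (not part of Ifpack2) ---------------------
// The sweep loop above overlaps the halo exchange with local work: it posts
// nonblocking sends/receives of y, computes the owned-row part of the
// residual while messages are in flight, and only then waits and applies the
// remote-column contribution. A minimal sketch of that ordering, with
// hypothetical stand-in types and functions:
#if 0
struct HaloExchange {
  void asyncSendRecv() { /* post nonblocking sends/recvs of boundary y */ }
  void syncRecv() { /* block until all receives have completed */ }
};

void compute_owned_residual() { /* r = x - R_owned * y, owned columns only */ }
void compute_remote_residual() { /* r -= R_remote * y_remote, then solve */ }

void one_overlapped_sweep(HaloExchange &halo) {
  halo.asyncSendRecv();       // (1) start the exchange
  compute_owned_residual();   // (2) useful work while messages travel
  halo.syncRecv();            // (3) wait for remote columns of y
  compute_remote_residual();  // (4) finish the residual with remote data
}
#endif
// ----------------------------------------------------------------------------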
5218 
5219 // Implementation of fused block Jacobi for a specific block size,
5220 // or (if B == 0) for a general block size.
5221 template <typename MatrixType, int B>
5222 int applyFusedBlockJacobi_Impl(
5223  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5224  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5225  const bool overlap_communication_and_computation,
5226  // tpetra interface
5227  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5228  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5229  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5230  // local object interface
5231  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5232  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5233  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5234  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5235  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5236  // preconditioner parameters
5237  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5238  /* */ bool is_y_zero,
5239  const int max_num_sweeps,
5240  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5241  const int check_tol_every) {
5242  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5243  using local_ordinal_type = typename impl_type::local_ordinal_type;
5244  using size_type = typename impl_type::size_type;
5245  using magnitude_type = typename impl_type::magnitude_type;
5246  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5247  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
5248 
5249  // the tpetra importer and async importer can't both be active
5250  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5251  "Both the Tpetra importer and the async importer are non-null; at most one may be active.");
5252  // the maximum number of sweeps must be a positive number
5253  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5254  "Maximum number of sweeps must be >= 1.");
5255 
5256  // const parameters
5257  const bool is_async_importer_active = !async_importer.is_null();
5258 #if KOKKOS_VERSION >= 40799
5259  const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
5260 #else
5261  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5262 #endif
5263  const magnitude_type tolerance = tol * tol;
5264  const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5265  const local_ordinal_type num_vectors = Y.getNumVectors();
5266  const local_ordinal_type num_blockrows = interf.nparts;
5267 
5268  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5269  {
5270  if (is_async_importer_active) {
5271  // create comm data buffer and keep it here
5272  async_importer->createDataBuffer(num_vectors);
5273  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5274  }
5275  }
5276 
5277  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5278  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5279 
5280  const bool two_pass_residual =
5281  overlap_communication_and_computation && is_async_importer_active;
5282 
5283  // Calculate the required work size and reallocate it if not already big enough.
5284  // Check that our assumptions about YY dimension are correct.
5285  TEUCHOS_TEST_FOR_EXCEPT_MSG(
5286  size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5287  "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) << " = " << YY.extent(0) * YY.extent(1) << ",\n"
5288  << "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors << " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n');
5289  size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5290  if (work.extent(0) < work_required) {
5291  work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required);
5292  }
5293 
5294  Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5295 
5296  // construct W
5297  if (W.extent(0) != size_t(num_blockrows))
5298  W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows);
5299 
5300  // Create the required functors upfront (this is inexpensive - all shallow copies)
5301  BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
5302  functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
5303  BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
5304  functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5305  BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
5306  functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5307 
5308  // set how often the norm manager checks for convergence
5309  if (is_norm_manager_active)
5310  norm_manager.setCheckFrequency(check_tol_every);
5311 
5312  // For double-buffering:
5313  // y_buffers[current_y] holds the current iterate of y;
5314  // y_buffers[1 - current_y] receives the next iterate of y.
5315  Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5316  int current_y = 0;
5317 
5318  // iterate
5319  int sweep = 0;
5320  for (; sweep < max_num_sweeps; ++sweep) {
5321  if (is_y_zero) {
5322  // If y is initially zero, then we are just computing y := damping_factor * Dinv * x
5323  functor_solve_only.run(XX, y_buffers[1 - current_y]);
5324  } else {
5325  // real use case does not use overlap comp and comm
5326  if (overlap_communication_and_computation || !is_async_importer_active) {
5327  if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5328  if (two_pass_residual) {
5329  // Pass 1 computes owned residual and stores into new y buffer,
5330  // but doesn't apply Dinv or produce a norm yet
5331  functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1 - current_y]);
5332  } else {
5333  // This case happens when running with a single rank.
5334  // There are no remote columns, so residual and solve can happen in one step.
5335  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5336  }
5337  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5338  if (is_async_importer_active) async_importer->cancel();
5339  break;
5340  }
5341  if (is_async_importer_active) {
5342  async_importer->syncRecv();
5343  // Pass 2 finishes computing the residual, then applies Dinv and computes the norm.
5344  functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5345  }
5346  } else {
5347  if (is_async_importer_active)
5348  async_importer->syncExchange(y_buffers[current_y]);
5349  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5350  // Full residual, Dinv apply, and norm in one kernel
5351  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5352  }
5353  }
5354 
5355  // Compute global norm.
5356  if (is_norm_manager_active) {
5357  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5358  if (sweep + 1 == max_num_sweeps) {
5359  norm_manager.ireduce(sweep, true);
5360  norm_manager.checkDone(sweep + 1, tolerance, true);
5361  } else {
5362  norm_manager.ireduce(sweep);
5363  }
5364  }
5365  is_y_zero = false;
5366  // flip the y buffers for the next iteration (or for termination, if max_num_sweeps was reached).
5367  current_y = 1 - current_y;
5368  }
5369  if (current_y == 1) {
5370  // We finished iterating with y in the double buffer, so copy it to the user's vector.
5371  Kokkos::deep_copy(YY, y_doublebuf);
5372  }
5373 
5374  // sqrt the norms for the caller's use.
5375  if (is_norm_manager_active) norm_manager.finalize();
5376  return sweep;
5377 }
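// --- Editor's illustrative sketch (not part of Ifpack2) ---------------------
// applyFusedBlockJacobi_Impl ping-pongs between the user's Y and a scratch
// buffer so each sweep reads one vector and writes the other, never both.
// A minimal sketch of the same pattern, assuming a hypothetical stand-in
// update in place of the real damped Jacobi sweep:
#if 0
#include <algorithm>
#include <cstddef>
#include <vector>

void jacobi_double_buffered(std::vector<double> &y, const int nsweeps) {
  std::vector<double> scratch(y.size());
  std::vector<double> *buf[2] = {&y, &scratch};
  int current = 0;
  for (int sweep = 0; sweep < nsweeps; ++sweep) {
    const std::vector<double> &src = *buf[current];
    std::vector<double> &dst = *buf[1 - current];
    for (std::size_t i = 0; i < src.size(); ++i)
      dst[i] = 0.5 * src[i];  // hypothetical stand-in for one damped sweep
    current = 1 - current;    // flip buffers for the next sweep
  }
  // If the final iterate landed in scratch, copy it back to the caller's y,
  // mirroring the Kokkos::deep_copy(YY, y_doublebuf) above.
  if (current == 1) std::copy(scratch.begin(), scratch.end(), y.begin());
}
#endif
// ----------------------------------------------------------------------------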
5378 
5382 template <typename MatrixType>
5383 int applyFusedBlockJacobi(
5384  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5385  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5386  const bool overlap_communication_and_computation,
5387  // tpetra interface
5388  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5389  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5390  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5391  // local object interface
5392  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5393  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5394  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5395  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5396  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5397  // preconditioner parameters
5398  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5399  /* */ bool is_y_zero,
5400  const int max_num_sweeps,
5401  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5402  const int check_tol_every) {
5403  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5404  int blocksize = btdm.d_inv.extent(1);
5405  int sweep = 0;
5406 #define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) \
5407  { \
5408  sweep = applyFusedBlockJacobi_Impl<MatrixType, B>( \
5409  tpetra_importer, async_importer, overlap_communication_and_computation, \
5410  X, Y, W, interf, btdm, amd, work, \
5411  norm_manager, damping_factor, is_y_zero, \
5412  max_num_sweeps, tol, check_tol_every); \
5413  } \
5414  break
5415  switch (blocksize) {
5416  case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(3);
5417  case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(5);
5418  case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(7);
5419  case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(9);
5420  case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
5421  case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
5422  case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
5423  case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
5424  case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
5425  default: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(0);
5426  }
5427 #undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI
5428 
5429  return sweep;
5430 }
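// --- Editor's illustrative sketch (not part of Ifpack2) ---------------------
// Both apply paths compare against tolerance = tol * tol because the norm
// manager accumulates squared norms; testing ||r||^2 <= tol^2 * ||r0||^2
// avoids a sqrt on every sweep, and the sqrt is taken once in
// norm_manager.finalize(). A minimal sketch, assuming a relative test:
#if 0
bool converged(const double r_norm2, const double r0_norm2, const double tol) {
  // Equivalent to sqrt(r_norm2) <= tol * sqrt(r0_norm2), but with no square
  // roots inside the iteration loop.
  return r_norm2 <= (tol * tol) * r0_norm2;
}
#endif
// ----------------------------------------------------------------------------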
5431 
5432 template <typename MatrixType>
5433 struct ImplObject {
5434  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5435  using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5436  using block_tridiags_type = BlockTridiags<MatrixType>;
5437  using amd_type = BlockHelperDetails::AmD<MatrixType>;
5438  using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
5439  using async_import_type = AsyncableImport<MatrixType>;
5440 
5441  // distributed objects
5445  Teuchos::RCP<async_import_type> async_importer;
5446  bool overlap_communication_and_computation;
5447 
5448  // copy of Y (mutable to penetrate const)
5449  mutable typename impl_type::tpetra_multivector_type Z;
5450  mutable typename impl_type::impl_scalar_type_1d_view W;
5451 
5452  // local objects
5453  part_interface_type part_interface;
5454  block_tridiags_type block_tridiags; // D
5455  amd_type a_minus_d; // R = A - D
5456 
5457  // whether to use fused block Jacobi path
5458  bool use_fused_jacobi;
5459 
5460  // vector workspace is used for general block tridi case
5461  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace (1D view of vector)
5462  // scalar workspace is used for fused block jacobi case
5463  mutable typename impl_type::impl_scalar_type_1d_view work_flat; // right hand side workspace (1D view of scalar)
5464  mutable norm_manager_type norm_manager;
5465 };
5466 
5467 } // namespace BlockTriDiContainerDetails
5468 
5469 } // namespace Ifpack2
5470 
5471 #endif