Ifpack2 Templated Preconditioning Package, Version 1.0
Ifpack2_BlockTriDiContainer_impl.hpp
// @HEADER
// *****************************************************************************
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
//
// Copyright 2009 NTESS and the Ifpack2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP

//#define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
//#define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF

#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#include <Kokkos_ArithTraits.hpp>
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_SetIdentity_Decl.hpp>
#include <KokkosBatched_SetIdentity_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>

#include <memory>

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"
#include "Ifpack2_BlockComputeResidualAndSolve.hpp"

//#include <KokkosBlas2_gemv.hpp>

// TODO: expose this as a CMake variable (or enable this flag only when it is necessary)
//#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
//#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif

// Note: MPI-3 support on CUDA is not 100% verified.
#if MPI_VERSION >= 3
#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
#endif

// ::: Experiments :::
// Define either pinned memory or CUDA memory for MPI buffers.
// If both macros are disabled, the Tpetra memory space is used, which is the UVM space for CUDA.
// If defined, pinned memory is used instead of a device pointer.
// By default, pinned memory is enabled.
#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
//#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI

// If defined, all views are allocated in the CUDA space instead of the CUDA UVM space.
#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE

// If defined, btdm_scalar_type is used (if impl_scalar_type is double, btdm_scalar_type is float).
#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
#endif

// If defined, multiple execution space instances are used.
#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES

namespace Ifpack2 {

namespace BlockTriDiContainerDetails {

namespace KB = KokkosBatched;

using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;

template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
                                          MemoryTraitsType::is_random_access |
                                          flag>;

template <typename ViewType>
using Unmanaged = Kokkos::View<typename ViewType::data_type,
                               typename ViewType::array_layout,
                               typename ViewType::device_type,
                               MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
template <typename ViewType>
using Atomic = Kokkos::View<typename ViewType::data_type,
                            typename ViewType::array_layout,
                            typename ViewType::device_type,
                            MemoryTraits<typename ViewType::memory_traits, Kokkos::Atomic>>;
template <typename ViewType>
using Const = Kokkos::View<typename ViewType::const_data_type,
                           typename ViewType::array_layout,
                           typename ViewType::device_type,
                           typename ViewType::memory_traits>;
template <typename ViewType>
using ConstUnmanaged = Const<Unmanaged<ViewType>>;

template <typename ViewType>
using AtomicUnmanaged = Atomic<Unmanaged<ViewType>>;

template <typename ViewType>
using Scratch = Kokkos::View<typename ViewType::data_type,
                             typename ViewType::array_layout,
                             typename ViewType::execution_space::scratch_memory_space,
                             MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;

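// A minimal usage sketch for these decorators (the names `vals_type` and
// `vals` are hypothetical, not part of this header): wrapping a managed view
// avoids reference counting and, via Const, enforces read-only access inside
// kernels.
//
//   using vals_type = Kokkos::View<double *>;
//   vals_type vals("vals", 100);
//   ConstUnmanaged<vals_type> vals_ro(vals.data(), vals.extent(0));
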
template <typename T>
struct BlockTridiagScalarType { typedef T type; };
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
template <>
struct BlockTridiagScalarType<double> { typedef float type; };
// template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
#endif
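
// Usage sketch (hedged; the result depends on the small-scalar macro above):
// the block-tridiagonal factors can be stored in reduced precision while the
// interface scalar type stays double.
//
//   using btdm_scalar_type = typename BlockTridiagScalarType<double>::type;
//   static_assert(sizeof(btdm_scalar_type) <= sizeof(double), "never grows");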

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());

#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
  { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStop()); }
#else
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
#endif

template <typename MatrixType>
Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
  using impl_type             = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type       = typename impl_type::tpetra_map_type;
  using tpetra_mv_type        = typename impl_type::tpetra_block_multivector_type;
  using tpetra_import_type    = typename impl_type::tpetra_import_type;
  using crs_matrix_type       = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

  auto A_crs  = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = !A_bcrs.is_null();

  // It is OK here to use the graph of the A_crs matrix and a block size of 1.
  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());  // tpetra crs graph object

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto src       = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
  const auto tgt       = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap(), blocksize)));
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::rcp(new tpetra_import_type(src, tgt));
}

// Partial replacement for forward-mode MultiVector::doImport.
// Permits overlapped communication and computation, but also supports a
// synchronized exchange. I'm finding that overlapped comm/comp can give quite
// poor performance on some platforms, so we can't just use it straightforwardly
// always.
template <typename MatrixType>
struct AsyncableImport {
 public:
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

 private:
#if !defined(HAVE_IFPACK2_MPI)
  typedef int MPI_Request;
  typedef int MPI_Comm;
#endif
  using scalar_type = typename impl_type::scalar_type;

  static int isend(const MPI_Comm comm, const char *buf, int count, int dest, int tag, MPI_Request *ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Isend(const_cast<char *>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int irecv(const MPI_Comm comm, char *buf, int count, int src, int tag, MPI_Request *ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int waitany(int count, MPI_Request *reqs, int *index) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
    return 0;
#endif
  }

  static int waitall(int count, MPI_Request *reqs) {
#ifdef HAVE_IFPACK2_MPI
    // MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE.
    return MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE);
#else
    return 0;
#endif
  }

 public:
  using tpetra_map_type    = typename impl_type::tpetra_map_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;

  using local_ordinal_type  = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using size_type           = typename impl_type::size_type;
  using impl_scalar_type    = typename impl_type::impl_scalar_type;

  using int_1d_view_host                = Kokkos::View<int *, Kokkos::HostSpace>;
  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type *, Kokkos::HostSpace>;

  using execution_space            = typename impl_type::execution_space;
  using memory_space               = typename impl_type::memory_space;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view          = typename impl_type::size_type_1d_view;
  using size_type_1d_view_host     = Kokkos::View<size_type *, Kokkos::HostSpace>;

#if defined(KOKKOS_ENABLE_CUDA)
  using impl_scalar_type_1d_view =
      typename std::conditional<std::is_same<execution_space, Kokkos::Cuda>::value,
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
                                Kokkos::View<impl_scalar_type *, Kokkos::CudaHostPinnedSpace>,
#elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
                                Kokkos::View<impl_scalar_type *, Kokkos::CudaSpace>,
#else  // no experimental macros are defined
                                typename impl_type::impl_scalar_type_1d_view,
#endif
                                typename impl_type::impl_scalar_type_1d_view>::type;
#else
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
#endif
  using impl_scalar_type_1d_view_host   = Kokkos::View<impl_scalar_type *, Kokkos::HostSpace>;
  using impl_scalar_type_2d_view        = typename impl_type::impl_scalar_type_2d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

#ifdef HAVE_IFPACK2_MPI
  MPI_Comm comm;
#endif

  impl_scalar_type_2d_view_tpetra remote_multivector;
  local_ordinal_type blocksize;

  template <typename T>
  struct SendRecvPair {
    T send, recv;
  };

  // (s)end and (r)eceive data:
  SendRecvPair<int_1d_view_host> pids;                      // mpi ranks
  SendRecvPair<std::vector<MPI_Request>> reqs;              // MPI_Request is a pointer type; cannot use a kokkos view
  SendRecvPair<size_type_1d_view> offset;                   // offsets into the local id list and data buffer
  SendRecvPair<size_type_1d_view_host> offset_host;         // offsets into the local id list and data buffer
  SendRecvPair<local_ordinal_type_1d_view> lids;            // local id list
  SendRecvPair<impl_scalar_type_1d_view> buffer;            // data buffer
  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host;  // data buffer

  local_ordinal_type_1d_view dm2cm;  // permutation

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  using exec_instance_1d_std_vector = std::vector<execution_space>;
  exec_instance_1d_std_vector exec_instances;
#endif

  // for cuda
 public:
  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                       const size_type_1d_view &offs) {
    // wrap lens in a kokkos view and deep copy to device
    Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(const_cast<size_t *>(lens.getRawPtr()), lens.size());
    const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);

    // exclusive scan
    const Kokkos::RangePolicy<execution_space> policy(0, offs.extent(0));
    const local_ordinal_type lens_size = lens_device.extent(0);
    Kokkos::parallel_scan(
        "AsyncableImport::RangePolicy::setOffsetValues",
        policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
          if (final)
            offs(i) = update;
          update += (i < lens_size ? lens_device[i] : 0);
        });
  }

  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
                           const size_type_1d_view_host &offs) {
    // exclusive scan, directly on the host (no device copy of lens is needed here)
    offs(0) = 0;
    for (local_ordinal_type i = 1, iend = offs.extent(0); i < iend; ++i) {
      offs(i) = offs(i - 1) + lens[i - 1];
    }
  }
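
  // Worked example for both scans above: lens = {3, 1, 4} produces
  // offs = {0, 3, 4, 8}; offs has one more entry than lens, offs(i) is the
  // start of chunk i, and the last entry is the total length.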

 private:
  void createMpiRequests(const tpetra_import_type &import) {
    Tpetra::Distributor &distributor = import.getDistributor();

    // copy pids from distributor
    const auto pids_from = distributor.getProcsFrom();
    pids.recv            = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
    memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int) * pids.recv.extent(0));

    const auto pids_to = distributor.getProcsTo();
    pids.send          = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
    memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int) * pids.send.extent(0));

    // mpi requests
    reqs.recv.resize(pids.recv.extent(0));
    memset(reqs.recv.data(), 0, reqs.recv.size() * sizeof(MPI_Request));
    reqs.send.resize(pids.send.extent(0));
    memset(reqs.send.data(), 0, reqs.send.size() * sizeof(MPI_Request));

    // construct offsets
#if 0
    const auto lengths_to = distributor.getLengthsTo();
    offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValues(lengths_to, offset.send);
    offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);

    setOffsetValues(lengths_from, offset.recv);
    offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
#else
    const auto lengths_to = distributor.getLengthsTo();
    offset_host.send      = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset_host.recv        = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValuesHost(lengths_to, offset_host.send);
    // offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);

    setOffsetValuesHost(lengths_from, offset_host.recv);
    // offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
#endif
  }

  void createSendRecvIDs(const tpetra_import_type &import) {
    // For each remote PID, the list of LIDs to receive.
    const auto remote_lids = import.getRemoteLIDs();
    const local_ordinal_type_1d_view_host
        remote_lids_view_host(const_cast<local_ordinal_type *>(remote_lids.getRawPtr()), remote_lids.size());
    lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
    Kokkos::deep_copy(lids.recv, remote_lids_view_host);

    // For each export PID, the list of LIDs to send.
    auto epids = import.getExportPIDs();
    auto elids = import.getExportLIDs();
    TEUCHOS_ASSERT(epids.size() == elids.size());
    lids.send           = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
    auto lids_send_host = Kokkos::create_mirror_view(lids.send);

    // naive search (not sure if pids or epids are sorted)
    for (local_ordinal_type cnt = 0, i = 0, iend = pids.send.extent(0); i < iend; ++i) {
      const auto pid_send_value = pids.send[i];
      for (local_ordinal_type j = 0, jend = epids.size(); j < jend; ++j)
        if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
      TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i + 1]);
    }
    Kokkos::deep_copy(lids.send, lids_send_host);
  }

  void createExecutionSpaceInstances() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
    // The following line creates 8 streams:
#if KOKKOS_VERSION >= 40699
    exec_instances =
        Kokkos::Experimental::partition_space(execution_space(), std::vector<int>(8, 1));
#else
    exec_instances =
        Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
#endif
#endif
  }
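
  // Round-robin dispatch sketch (mirrors the send/recv loops further below;
  // the modulus matches the 8 instances created above, and n_neighbors is a
  // hypothetical name):
  //
  //   for (local_ordinal_type i = 0; i < n_neighbors; ++i)
  //     copy<ToBuffer>(..., exec_instances[i % 8]);  // each neighbor on its own stream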

 public:
  // for cuda, all tag types are public
  struct ToBuffer {};
  struct ToMultiVector {};

  AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
                  const Teuchos::RCP<const tpetra_map_type> &tgt_map,
                  const local_ordinal_type blocksize_,
                  const local_ordinal_type_1d_view dm2cm_) {
    blocksize = blocksize_;
    dm2cm     = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
    comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
#endif
    const tpetra_import_type import(src_map, tgt_map);

    createMpiRequests(import);
    createSendRecvIDs(import);
    createExecutionSpaceInstances();
  }

  void createDataBuffer(const local_ordinal_type &num_vectors) {
    const size_type extent_0 = lids.recv.extent(0) * blocksize;
    const size_type extent_1 = num_vectors;
    if (remote_multivector.extent(0) == extent_0 &&
        remote_multivector.extent(1) == extent_1) {
      // skip
    } else {
      remote_multivector =
          impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);

      const auto send_buffer_size = offset_host.send[offset_host.send.extent(0) - 1] * blocksize * num_vectors;
      const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0) - 1] * blocksize * num_vectors;

      buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
      buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);

      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
        buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
      }
    }
  }
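
  // Sizing sketch: if offset_host.send ends in 10 (ten LIDs to send in total),
  // blocksize == 5, and num_vectors == 2, then buffer.send holds
  // 10 * 5 * 2 = 100 scalars; buffer.recv is sized the same way from
  // offset_host.recv.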

  void cancel() {
#ifdef HAVE_IFPACK2_MPI
    waitall(reqs.recv.size(), reqs.recv.data());
    waitall(reqs.send.size(), reqs.send.data());
#endif
  }

  // ======================================================================
  // Async version using execution space instances
  // ======================================================================

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  template <typename PackTag>
  static void copy(const local_ordinal_type_1d_view &lids_,
                   const impl_scalar_type_1d_view &buffer_,
                   const local_ordinal_type ibeg_,
                   const local_ordinal_type iend_,
                   const impl_scalar_type_2d_view_tpetra &multivector_,
                   const local_ordinal_type blocksize_,
                   const execution_space &exec_instance_) {
    const local_ordinal_type num_vectors  = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
    const local_ordinal_type idiff        = iend_ - ibeg_;
    const auto abase                      = buffer_.data() + mv_blocksize * ibeg_;

    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    local_ordinal_type vector_size(0);
    if (blocksize_ <= 4)
      vector_size = 4;
    else if (blocksize_ <= 8)
      vector_size = 8;
    else if (blocksize_ <= 16)
      vector_size = 16;
    else
      vector_size = 32;

    const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
    const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
    Kokkos::parallel_for(  //"AsyncableImport::TeamPolicy::copyViaCudaStream",
        Kokkos::Experimental::require(policy, work_item_property),
        KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
          const local_ordinal_type i = member.league_rank();
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
            auto aptr = abase + blocksize_ * (i + idiff * j);
            auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
            if (std::is_same<PackTag, ToBuffer>::value)
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                aptr[k] = bptr[k];
              });
            else
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                bptr[k] = aptr[k];
              });
          });
        });
  }
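
  // Buffer layout sketch: entry k of the block for LID index i and vector j
  // lives at abase[blocksize_ * (i + idiff * j) + k]. E.g. with idiff = 3 and
  // blocksize_ = 2, the block for i = 1, j = 2 starts at offset
  // 2 * (1 + 3 * 2) = 14.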

  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants and reallocate data buffers if necessary
    const local_ordinal_type num_vectors  = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize * num_vectors;

    // 0. post receive async
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      } else {
        irecv(comm,
              reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
    }

    execution_space().fence();

    // 1. async memcpy
    for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
      // 1.0. enqueue pack buffer
      if (i < 8) exec_instances[i % 8].fence();
      copy<ToBuffer>(lids.send, buffer.send,
                     offset_host.send(i), offset_host.send(i + 1),
                     mv, blocksize,
                     // execution_space());
                     exec_instances[i % 8]);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        // if (i<8) exec_instances[i%8].fence();
        const local_ordinal_type num_vectors  = mv.extent(1);
        const local_ordinal_type mv_blocksize = blocksize * num_vectors;

        Kokkos::deep_copy(exec_instances[i % 8],
                          Kokkos::subview(buffer_host.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                              offset_host.send(i) * mv_blocksize,
                                              offset_host.send(i + 1) * mv_blocksize)),
                          Kokkos::subview(buffer.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                              offset_host.send(i) * mv_blocksize,
                                              offset_host.send(i + 1) * mv_blocksize)));
      }
    }
    // execution_space().fence();
    for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
      // 1.1. sync the stream and isend
      if (i < 8) exec_instances[i % 8].fence();
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      } else {
        isend(comm,
              reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
    }

    // 2. poke communication
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif  // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void syncRecvVar1() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // 0. wait for receive async.
    for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.recv.extent(0)); ++i) {
      local_ordinal_type idx = i;

      // 0.0. wait any
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);

      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors  = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize * num_vectors;

        Kokkos::deep_copy(
            Kokkos::subview(buffer.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)),
            Kokkos::subview(buffer_host.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)));
      }

      // 0.1. unpack data after data is moved into a device
      copy<ToMultiVector>(lids.recv, buffer.recv,
                          offset_host.recv(idx), offset_host.recv(idx + 1),
                          remote_multivector, blocksize,
                          exec_instances[idx % 8]);
    }

    // 1. fire up all cuda events
    Kokkos::fence();

    // 2. cleanup all open comm
    waitall(reqs.send.size(), reqs.send.data());
#endif  // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
#endif  // defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)

  // ======================================================================
  // Generic version without using execution space instances
  // - the only difference between the device and host paths is the use of
  //   team versus range policies.
  // ======================================================================
  template <typename PackTag>
  static void copy(const local_ordinal_type_1d_view &lids_,
                   const impl_scalar_type_1d_view &buffer_,
                   const local_ordinal_type &ibeg_,
                   const local_ordinal_type &iend_,
                   const impl_scalar_type_2d_view_tpetra &multivector_,
                   const local_ordinal_type blocksize_) {
    const local_ordinal_type num_vectors  = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
    const local_ordinal_type idiff        = iend_ - ibeg_;
    const auto abase                      = buffer_.data() + mv_blocksize * ibeg_;
    if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      local_ordinal_type vector_size(0);
      if (blocksize_ <= 4)
        vector_size = 4;
      else if (blocksize_ <= 8)
        vector_size = 8;
      else if (blocksize_ <= 16)
        vector_size = 16;
      else
        vector_size = 32;
      const team_policy_type policy(idiff, 1, vector_size);
      Kokkos::parallel_for(
          "AsyncableImport::TeamPolicy::copy",
          policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
            const local_ordinal_type i = member.league_rank();
            Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
              auto aptr = abase + blocksize_ * (i + idiff * j);
              auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
              if (std::is_same<PackTag, ToBuffer>::value)
                Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                  aptr[k] = bptr[k];
                });
              else
                Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                  bptr[k] = aptr[k];
                });
            });
          });
    } else {
      const Kokkos::RangePolicy<execution_space> policy(0, idiff * num_vectors);
      Kokkos::parallel_for(
          "AsyncableImport::RangePolicy::copy",
          policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
            const local_ordinal_type i = ij % idiff;
            const local_ordinal_type j = ij / idiff;
            auto aptr = abase + blocksize_ * (i + idiff * j);
            auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
            auto from = std::is_same<PackTag, ToBuffer>::value ? bptr : aptr;
            auto to   = std::is_same<PackTag, ToBuffer>::value ? aptr : bptr;
            memcpy(to, from, sizeof(impl_scalar_type) * blocksize_);
          });
    }
  }

  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants and reallocate data buffers if necessary
    const local_ordinal_type num_vectors  = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize * num_vectors;

    // receive async
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      } else {
        irecv(comm,
              reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
    }

    // send async
    for (local_ordinal_type i = 0, iend = pids.send.extent(0); i < iend; ++i) {
      copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i + 1),
                     mv, blocksize);
      Kokkos::fence();
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      } else {
        Kokkos::deep_copy(
            Kokkos::subview(buffer_host.send,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.send(i) * mv_blocksize,
                                offset_host.send(i + 1) * mv_blocksize)),
            Kokkos::subview(buffer.send,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.send(i) * mv_blocksize,
                                offset_host.send(i + 1) * mv_blocksize)));
        isend(comm,
              reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
    }

    // I find that issuing an Iprobe seems to nudge some MPIs into action,
    // which helps with overlapped comm/comp performance.
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void syncRecvVar0() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // receive async.
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      local_ordinal_type idx = i;
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors  = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize * num_vectors;
        Kokkos::deep_copy(
            Kokkos::subview(buffer.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)),
            Kokkos::subview(buffer_host.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)));
      }
      copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx + 1),
                          remote_multivector, blocksize);
    }
    // wait on the sends to match all Isends with a cleanup operation.
    waitall(reqs.send.size(), reqs.send.data());
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    asyncSendRecvVar1(mv);
#else
    asyncSendRecvVar0(mv);
#endif
#else
    asyncSendRecvVar0(mv);
#endif
  }
  void syncRecv() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    syncRecvVar1();
#else
    syncRecvVar0();
#endif
#else
    syncRecvVar0();
#endif
  }

  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
    asyncSendRecv(mv);
    syncRecv();
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
};

template <typename ViewType1, typename ViewType2>
struct are_same_struct {
  ViewType1 keys1;
  ViewType2 keys2;

  are_same_struct(ViewType1 keys1_, ViewType2 keys2_)
      : keys1(keys1_)
      , keys2(keys2_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(int i, unsigned int &count) const {
    if (keys1(i) != keys2(i)) count++;
  }
};

template <typename ViewType1, typename ViewType2>
bool are_same(ViewType1 keys1, ViewType2 keys2) {
  unsigned int are_same_ = 0;

  Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
                          are_same_struct(keys1, keys2),
                          are_same_);
  return are_same_ == 0;
}

template <typename MatrixType>
Teuchos::RCP<AsyncableImport<MatrixType>>
createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
  using impl_type                        = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type                  = typename impl_type::tpetra_map_type;
  using local_ordinal_type               = typename impl_type::local_ordinal_type;
  using global_ordinal_type              = typename impl_type::global_ordinal_type;
  using local_ordinal_type_1d_view       = typename impl_type::local_ordinal_type_1d_view;
  using crs_matrix_type                  = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type            = typename impl_type::tpetra_block_crs_matrix_type;
  using global_indices_array_device_type = Kokkos::View<const global_ordinal_type *, typename tpetra_map_type::device_type>;

  auto A_crs  = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = !A_bcrs.is_null();

  // It is OK here to use the graph of the A_crs matrix and a block size of 1.
  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());  // tpetra crs graph object

  const auto blocksize  = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto domain_map = g.getDomainMap();
  const auto column_map = g.getColMap();

  std::vector<global_ordinal_type> gids;

  Kokkos::Subview<global_indices_array_device_type, std::pair<int, int>> column_map_global_iD_last;

  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
  {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);

    global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
    global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();

    if (are_same(domain_map_global_iD, column_map_global_iD)) {
      // this should be the most likely path
      separate_remotes       = true;
      need_owned_permutation = false;

      column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
                                                  std::pair<int, int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
    } else {
      // This loop is relatively expensive
      for (size_t i = 0; i < column_map->getLocalNumElements(); ++i) {
        const global_ordinal_type gid = column_map->getGlobalElement(i);
        if (!domain_map->isNodeGlobalElement(gid)) {
          found_first = true;
          gids.push_back(gid);
        } else if (found_first) {
          separate_remotes = false;
          break;
        }
        if (!found_first && !need_owned_permutation &&
            domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
          // The owned parts of the domain and column maps are different
          // orderings. We *could* do a super efficient impl of this case in the
          // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
          // really, if a caller cares about speed, they wouldn't make different
          // local permutations like this. So we punt on the best impl and go for
          // a pretty good one: the permutation is done in place in
          // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
          // is the presumably worse memory access pattern of the input vector.
          need_owned_permutation = true;
        }
      }
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }

  if (separate_remotes) {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
    const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
    const auto parsimonious_col_map =
        need_owned_permutation
            ? Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()))
            : Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
    if (parsimonious_col_map->getGlobalNumElements() > 0) {
      // make the importer only if needed.
      local_ordinal_type_1d_view dm2cm;
      if (need_owned_permutation) {
        dm2cm                 = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
        const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
        for (size_t i = 0; i < domain_map->getLocalNumElements(); ++i)
          dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
        Kokkos::deep_copy(dm2cm, dm2cm_host);
      }
      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
      return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
    }
  }
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::null;
}

template <typename local_ordinal_type>
local_ordinal_type costTRSM(const local_ordinal_type block_size) {
  return block_size * block_size;
}

template <typename local_ordinal_type>
local_ordinal_type costGEMV(const local_ordinal_type block_size) {
  return 2 * block_size * block_size;
}

template <typename local_ordinal_type>
local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
  return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length - 1) * costGEMV(block_size);
}
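
// Worked example: with block_size = 3 and subline_length = 4,
// costTriDiagSolve = 2 * 4 * costTRSM(3) + 2 * 3 * costGEMV(3)
//                  = 72 + 108 = 180 (a flop-count proxy, not exact flops).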

template <typename local_ordinal_type>
local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
                                  const local_ordinal_type num_teams,
                                  const local_ordinal_type line_length,
                                  const local_ordinal_type block_size,
                                  const local_ordinal_type n_subparts_per_part) {
  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part - 1) * 2) / n_subparts_per_part);
  if (subline_length < 1) {
    return INT_MAX;
  }

  const local_ordinal_type p_n_lines      = ceil(double(num_parts) / num_teams);
  const local_ordinal_type p_n_sublines   = ceil(double(n_subparts_per_part) * num_parts / num_teams);
  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part - 1) * num_parts / num_teams);

  const local_ordinal_type p_costApplyE    = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
  const local_ordinal_type p_costApplyS    = p_n_lines * costTriDiagSolve((n_subparts_per_part - 1) * 2, block_size);
  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length, block_size);
  const local_ordinal_type p_costApplyC    = p_n_sublines_2 * 2 * costGEMV(block_size);

  if (n_subparts_per_part == 1) {
    return p_costApplyAinv;
  }
  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
}

template <typename local_ordinal_type>
local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
                                         const local_ordinal_type num_teams,
                                         const local_ordinal_type line_length,
                                         const local_ordinal_type block_size) {
  // BMK: replaced the theoretical model with an empirical model.
  // This is a linear regression based on data from a grid search.
  // The independent terms in the regression are:
  // - "parallelism surplus" - smaller when the problem has enough lines to saturate the GPU, larger otherwise
  // - log2 of the line length
  // - block size
  double parallelismSurplus = Kokkos::sqrt((double)num_teams / num_parts);
  double logLineLength      = Kokkos::log2((double)line_length);
  (void)logLineLength;
  // Directly predict with the linear model
#if defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU)
  // MI300-specific data
  double modeled = -9.2312 + 4.6946 * parallelismSurplus + 0.4095 * block_size + 0.966 * logLineLength;
  // Do not split lines if there is plenty of parallelism
  if (parallelismSurplus < 0.3)
    modeled = 1;
#elif defined(KOKKOS_ARCH_HOPPER) || defined(KOKKOS_ARCH_BLACKWELL)
  // Based on H100 data
  double modeled = -9.6053 + 4.7477 * parallelismSurplus + 0.2338 * block_size + 1.0794 * logLineLength;
  // On H100, performance degrades rapidly if small lines are split too many times
  double maxSplit = (double)line_length / 8;
  if (modeled > maxSplit)
    modeled = maxSplit;
#elif defined(KOKKOS_ENABLE_CUDA)
  // Based on V100 data, line splitting is profitable in fewer cases
  // (only when there are few, long lines)
  double modeled = 1;
  if (parallelismSurplus > 1 && line_length > 64)
    modeled = 4;
#elif defined(KOKKOS_ENABLE_HIP)
  // Based on MI250X data
  double modeled = -8.6214 + 7.3468 * parallelismSurplus + 0.3596 * block_size + 0.6673 * logLineLength;
#else
  // GPUs other than CUDA or HIP: default to the simple model that works for V100
  double modeled = 1;
  if (parallelismSurplus > 1 && line_length > 64)
    modeled = 4;
#endif

  // Round to the nearest integer
  local_ordinal_type n_subparts_per_part = 0.5 + modeled;
  // Do not split lines if there is plenty of parallelism available
  if (parallelismSurplus < 0.3)
    n_subparts_per_part = 1;
  // Clamp the result to the valid range.
  // Criterion for a valid n_subparts_per_part (where connection_length is 2 for wide separators):
  //   line_length >= n_subparts_per_part + (n_subparts_per_part - 1) * connection_length
  // Equivalently:
  //   line_length >= n_subparts_per_part + n_subparts_per_part * 2 - 2
  //   line_length >= 3 * n_subparts_per_part - 2
  local_ordinal_type min_subparts_per_part = 1;
  local_ordinal_type max_subparts_per_part = (line_length + 2) / 3;
  // Limit memory usage from too many sublines
  if (max_subparts_per_part > 16)
    max_subparts_per_part = 16;
  if (n_subparts_per_part < min_subparts_per_part)
    n_subparts_per_part = min_subparts_per_part;
  if (n_subparts_per_part > max_subparts_per_part)
    n_subparts_per_part = max_subparts_per_part;
  return n_subparts_per_part;
}
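
// Worked example (MI250X branch): num_teams = 1000 and num_parts = 250 give
// parallelismSurplus = sqrt(1000.0 / 250) = 2; line_length = 128 gives
// logLineLength = 7; with block_size = 5,
//   modeled = -8.6214 + 7.3468 * 2 + 0.3596 * 5 + 0.6673 * 7 = 12.54 (approx.),
// which rounds to 13 and lies within the clamps (min 1, max min((128 + 2) / 3, 16) = 16).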

template <typename ArgActiveExecutionMemorySpace>
struct SolveTridiagsDefaultModeAndAlgo;

template <typename MatrixType>
BlockHelperDetails::PartInterface<MatrixType>
createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                    const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                    const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type>> &partitions,
                    const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
  IFPACK2_BLOCKHELPER_TIMER("createPartInterface", createPartInterface);
  using impl_type                  = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type         = typename impl_type::local_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
  using size_type                  = typename impl_type::size_type;

  auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);

  TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
  const local_ordinal_type blocksize   = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : bA->getBlockSize();
  constexpr int vector_length          = impl_type::vector_length;
  constexpr int internal_vector_length = impl_type::internal_vector_length;

  const auto comm = A->getRowMap()->getComm();

  BlockHelperDetails::PartInterface<MatrixType> interf;

  const bool jacobi                    = partitions.size() == 0;
  const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
  const local_ordinal_type nparts      = jacobi ? A_n_lclrows : partitions.size();

  typedef std::pair<local_ordinal_type, local_ordinal_type> size_idx_pair_type;
  std::vector<size_idx_pair_type> partsz(nparts);

  if (!jacobi) {
    for (local_ordinal_type i = 0; i < nparts; ++i)
      partsz[i] = size_idx_pair_type(partitions[i].size(), i);
    std::sort(partsz.begin(), partsz.end(),
              [](const size_idx_pair_type &x, const size_idx_pair_type &y) {
                return x.first > y.first;
              });
  }

  local_ordinal_type n_subparts_per_part;
  if (n_subparts_per_part_in == -1) {
    // If the number of subparts is set to -1, the user lets the algorithm
    // decide the value automatically.
    using execution_space = typename impl_type::execution_space;

    // Line splitting only benefits GPUs
    if constexpr (impl_type::node_type::is_gpu) {
      const int line_length = partsz[0].first;

      const local_ordinal_type team_size =
          SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
              recommended_team_size(blocksize, vector_length, internal_vector_length);

      const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
      n_subparts_per_part                = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
      printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
#endif
    } else {
      n_subparts_per_part = 1;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
      printf("Automatically chosen n_subparts_per_part = 1 for CPU backend\n");
#endif
    }
  } else {
    n_subparts_per_part = n_subparts_per_part_in;
  }

  // Total number of sub lines:
  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
  // Total number of sub lines + the Schur complement blocks.
  // For a given line, 2 sub lines imply one Schur complement, 3 sub lines imply two Schur complements, etc.
  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part - 1);
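
  // Example: nparts = 4 lines with n_subparts_per_part = 3 gives
  // n_sub_parts = 12 sub lines plus 4 * (3 - 1) = 8 Schur blocks, so
  // n_sub_parts_and_schur = 20.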

#if defined(BLOCKTRIDICONTAINER_DEBUG)
  local_ordinal_type nrows = 0;
  if (jacobi)
    nrows = nparts;
  else
    for (local_ordinal_type i = 0; i < nparts; ++i) nrows += partitions[i].size();

  TEUCHOS_TEST_FOR_EXCEPT_MSG(nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
                                                                                             << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
#endif

  // permutation vector
  std::vector<local_ordinal_type> p;
  if (jacobi) {
    interf.max_partsz          = 1;
    interf.max_subpartsz       = 0;
    interf.n_subparts_per_part = 1;
    interf.nparts              = nparts;
  } else {
    // reorder parts to maximize simd packing efficiency
    p.resize(nparts);

    for (local_ordinal_type i = 0; i < nparts; ++i)
      p[i] = partsz[i].second;

    interf.max_partsz = partsz[0].first;

    constexpr local_ordinal_type connection_length = 2;
    const local_ordinal_type sub_line_length       = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
    const local_ordinal_type last_sub_line_length  = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

    interf.max_subpartsz       = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
    interf.n_subparts_per_part = n_subparts_per_part;
    interf.nparts              = nparts;
  }
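
  // Worked example: max_partsz = 10 with n_subparts_per_part = 3 and
  // connection_length = 2 gives sub_line_length = (10 - 4) / 3 = 2 and
  // last_sub_line_length = 10 - 2 * (2 + 2) = 2, so max_subpartsz = 2.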
1204 
1205  // allocate parts
1206  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1207  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1208  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1209  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1210  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1211 
1212  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1213  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1214  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1215 
1216  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);
1217 
1218  // mirror to host and compute on host execution space
1219  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1220  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1221 
1222  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1223  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1224  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1225  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1226 
1227  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1228  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1229  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1230 
1231  // Determine parts.
1232  interf.row_contiguous = true;
1233  partptr(0) = 0;
1234  part2rowidx0(0) = 0;
1235  part2packrowidx0(0) = 0;
1236  local_ordinal_type pack_nrows = 0;
1237  local_ordinal_type pack_nrows_sub = 0;
1238  if (jacobi) {
1239  IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
1240  // Jacobi (all lines have length 1) means that A_n_lclrows == nparts,
1241  // so the mapping between parts and rows is trivial.
1242  // Note: we can leave interf.row_contiguous = true, since for all i: lclrow(i) == i
1243  for (local_ordinal_type i = 0; i <= nparts; ++i) {
1244  part2rowidx0(i) = i;
1245  partptr(i) = i;
1246  }
1247  for (local_ordinal_type i = 0; i < nparts; ++i) {
1248  rowidx2part(i) = i;
1249  lclrow(i) = i;
1250  }
1251  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1252  // assume No overlap.
1253  if (ip % vector_length == 0) pack_nrows = 1;
1254  part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1255  }
1256  part2rowidx0_sub(0) = 0;
1257  partptr_sub(0, 0) = 0;
1258 
1259  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1260  constexpr local_ordinal_type ipnrows = 1;
1261  const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1262 
1263  TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1264  "In the part " << ip);
1265 
1266  constexpr local_ordinal_type connection_length = 2;
1267 
1268  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1269  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
1270  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1271 
1272  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1273  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1274 
1275  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1276 
1277  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1278  const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1279  const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1280  if (local_sub_ip != n_subparts_per_part - 1) {
1281  if (local_sub_ip != 0) {
1282  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1283  } else if (ip != 0) {
1284  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1285  }
1286  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1287  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1288  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1289 
1290  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1291  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1292 
1293 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1294  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1295  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", schur_ip, partptr_sub(schur_ip, 0), connection_length);
1296 #endif
1297  } else {
1298  if (local_sub_ip != 0) {
1299  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1300  } else if (ip != 0) {
1301  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1302  }
1303  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1304 
1305  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1306 
1307 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1308  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1309 #endif
1310  }
1311  }
1312  }
1313 
1314 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1315  std::cout << "partptr_sub = " << std::endl;
1316  for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1317  for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1318  std::cout << partptr_sub(i, j) << " ";
1319  }
1320  std::cout << std::endl;
1321  }
1322  std::cout << "partptr_sub end" << std::endl;
1323 #endif
1324 
1325  {
1326  local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1327 
1328  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1329  for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1330  part2packrowidx0_sub(ip, 0) = 0;
1331  }
1332  for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1333  if (ipack != 0) {
1334  local_ordinal_type ip_min = ipack * vector_length;
1335  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1336  for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1337  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1338  }
1339  }
1340 
1341  for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1342  local_ordinal_type ip_min = ipack * vector_length;
1343  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1344 
1345  const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1346 
1347  constexpr local_ordinal_type connection_length = 2;
1348 
1349  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1350  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1351 
1352  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1353  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1354  if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1355 
1356  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1357 
1358  for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1359  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1360  }
1361  }
1362  }
1363 
1364  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1365  }
1366  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1367  } else {
1368  IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
1369  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1370  const auto *part = &partitions[p[ip]];
1371  const local_ordinal_type ipnrows = part->size();
1372  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip - 1]].size())));
1373  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1374  BlockHelperDetails::get_msg_prefix(comm)
1375  << "partition " << p[ip]
1376  << " is empty, which is not allowed.");
1377  // Assume no overlap.
1378  part2rowidx0(ip + 1) = part2rowidx0(ip) + ipnrows;
1379  // Since parts are ordered in decreasing size, the size of the first
1380  // part in a pack is the size for all parts in the pack.
1381  if (ip % vector_length == 0) pack_nrows = ipnrows;
1382  part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1383  const local_ordinal_type offset = partptr(ip);
1384  for (local_ordinal_type i = 0; i < ipnrows; ++i) {
1385  const auto lcl_row = (*part)[i];
1386  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1387  BlockHelperDetails::get_msg_prefix(comm)
1388  << "partitions[" << p[ip] << "]["
1389  << i << "] = " << lcl_row
1390  << " but input matrix implies limits of [0, " << A_n_lclrows - 1
1391  << "].");
1392  lclrow(offset + i) = lcl_row;
1393  rowidx2part(offset + i) = ip;
1394  if (interf.row_contiguous && offset + i > 0 && lclrow((offset + i) - 1) + 1 != lcl_row)
1395  interf.row_contiguous = false;
1396  }
1397  partptr(ip + 1) = offset + ipnrows;
1398 
1399 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1400  printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1401  printf("partptr(%d+1) = %d\n", ip, partptr(ip + 1));
1402 #endif
1403  }
1404 
1405  part2rowidx0_sub(0) = 0;
1406  partptr_sub(0, 0) = 0;
1407  // const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1408 
1409  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1410  const auto *part = &partitions[p[ip]];
1411  const local_ordinal_type ipnrows = part->size();
1412  const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1413 
1414  TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1415  "In part " << ip << ": the line length " << full_line_length << " does not match the number of rows " << ipnrows << ".");
1416 
1417  constexpr local_ordinal_type connection_length = 2;
1418 
1419  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1420  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
1421  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1422 
1423  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1424  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1425 
1426  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1427 
1428  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1429  const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1430  const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1431  if (local_sub_ip != n_subparts_per_part - 1) {
1432  if (local_sub_ip != 0) {
1433  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1434  } else if (ip != 0) {
1435  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1436  }
1437  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1438  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1439  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1440 
1441  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1442  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1443 
1444 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1445  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1446  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", schur_ip, partptr_sub(schur_ip, 0), connection_length);
1447 #endif
1448  } else {
1449  if (local_sub_ip != 0) {
1450  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1451  } else if (ip != 0) {
1452  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1453  }
1454  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1455 
1456  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1457 
1458 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1459  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1460 #endif
1461  }
1462  }
1463  }
1464 
1465  {
1466  local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1467 
1468  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1469  for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1470  part2packrowidx0_sub(ip, 0) = 0;
1471  }
1472  for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1473  if (ipack != 0) {
1474  local_ordinal_type ip_min = ipack * vector_length;
1475  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1476  for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1477  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1478  }
1479  }
1480 
1481  for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1482  local_ordinal_type ip_min = ipack * vector_length;
1483  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1484 
1485  const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1486 
1487  constexpr local_ordinal_type connection_length = 2;
1488 
1489  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1490  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1491 
1492  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1493  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1494  if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1495 
1496  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1497 
1498  for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1499  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1500  }
1501  }
1502  }
1503 
1504  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1505  }
1506  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1507  }
1508 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1509  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1510 #endif
1511  if (lclrow(0) != 0) interf.row_contiguous = false;
1512 
1513  Kokkos::deep_copy(interf.partptr, partptr);
1514  Kokkos::deep_copy(interf.lclrow, lclrow);
1515 
1516  Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1517 
1518  // Assume no overlap. Thus:
1519  interf.part2rowidx0 = interf.partptr;
1520  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1521 
1522  interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1523  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1524 
1525  { // Fill packptr.
1526  IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
1527  // Count the packs: a new pack begins wherever part2packrowidx0 advances.
1528  local_ordinal_type npacks = 0;
1529  for (local_ordinal_type ip = 1; ip <= nparts; ++ip) // n_sub_parts_and_schur
1530  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1531  ++npacks;
1532 
1533  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1534  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1535  packptr(0) = 0;
1536  for (local_ordinal_type ip = 1, k = 1; ip <= nparts; ++ip)
1537  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1538  packptr(k++) = ip;
1539 
1540  Kokkos::deep_copy(interf.packptr, packptr);
1541 
1542  local_ordinal_type npacks_per_subpart = ceil(float(nparts) / vector_length);
1543  npacks = npacks_per_subpart * (part2packrowidx0_sub.extent(1) - 1);
1544 
1545  interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart * n_subparts_per_part);
1546  interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart, n_subparts_per_part - 1);
1547 
1548  const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1549  const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1550 
1551  // Fill packindices_sub and packindices_schur
1552  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part - 1; ++local_sub_ip) {
1553  for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1554  packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1555  packindices_schur(local_pack_ip, local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1556  }
1557  }
1558 
1559  for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1560  packindices_sub((n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip;
1561  }
1562 
1563 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1564  std::cout << "packindices_sub = " << std::endl;
1565  for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1566  std::cout << packindices_sub(i) << " ";
1567  }
1568  std::cout << std::endl;
1569  std::cout << "packindices_sub end" << std::endl;
1570 
1571  std::cout << "packindices_schur = " << std::endl;
1572  for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1573  for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1574  std::cout << packindices_schur(i, j) << " ";
1575  }
1576  std::cout << std::endl;
1577  }
1578 
1579  std::cout << "packindices_schur end" << std::endl;
1580 #endif
1581 
1582  Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1583  Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1584 
1585  interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1586  const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1587  packptr_sub(0) = 0;
1588  for (local_ordinal_type k = 0; k < npacks + 1; ++k)
1589  packptr_sub(k) = packptr(k % npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1590 
1591  Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1592  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1593  }
1594  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1595 
1596  return interf;
1597 }
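 // A minimal standalone sketch (illustration only, not part of this header) of
 // the sub-part length arithmetic used above: a line of full_line_length rows is
 // cut into n_subparts_per_part sub-lines separated by Schur connections of
 // connection_length rows, with the remainder absorbed by the last sub-line.
 #if 0
 #include <cassert>
 int main() {
   const int full_line_length = 20, n_subparts_per_part = 3, connection_length = 2;
   const int sub_line_length =
       (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;  // 5
   const int last_sub_line_length =
       full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);  // 6
   // sub + connection + sub + connection + last tiles the whole line exactly:
   assert((n_subparts_per_part - 1) * (sub_line_length + connection_length) + last_sub_line_length == full_line_length);
   return 0;
 }
 #endif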
1598 
1602 template <typename MatrixType>
1603 struct BlockTridiags {
1604  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1605  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1606  using size_type_1d_view = typename impl_type::size_type_1d_view;
1607  using size_type_2d_view = typename impl_type::size_type_2d_view;
1608  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1609  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1610  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1611 
1612  // flat_td_ptr(i) is the index into flat-array values of the start of the
1613  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1614  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1615  // vector_length is the position in the pack.
1616  size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1617  // List of local column indices into A from which to grab
1618  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1619  local_ordinal_type_1d_view A_colindsub;
1620  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1621  // tridiag's pack, and i % vector_length gives the position in the pack.
1622  vector_type_3d_view values;
1623  // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1624  // Schur's pack, and i % vector_length gives the position in the pack.
1625  vector_type_3d_view values_schur;
1626  // inv(A_00)*A_01 block values.
1627  vector_type_4d_view e_values;
1628 
1629  // The following are for fused block Jacobi only.
1630  // For block row i, diag_offsets(i) ... diag_offsets(i) + bs^2
1631  // is the range of scalars for the diagonal block.
1632  size_type_1d_view diag_offsets;
1633  // For fused residual+solve block Jacobi case,
1634  // this contains the diagonal block inverses in flat, local row indexing:
1635  // d_inv(row, :, :) gives the row-major block for row.
1636  btdm_scalar_type_3d_view d_inv;
1637 
1638  bool is_diagonal_only;
1639 
1640  BlockTridiags() = default;
1641  BlockTridiags(const BlockTridiags &b) = default;
1642 
1643  // Index into row-major block of a tridiag.
1644  template <typename idx_type>
1645  static KOKKOS_FORCEINLINE_FUNCTION
1646  idx_type
1647  IndexToRow(const idx_type &ind) { return (ind + 1) / 3; }
1648  // Given a row of a row-major tridiag, return the index of the first block
1649  // in that row.
1650  template <typename idx_type>
1651  static KOKKOS_FORCEINLINE_FUNCTION
1652  idx_type
1653  RowToIndex(const idx_type &row) { return row > 0 ? 3 * row - 1 : 0; }
1654  // Number of blocks in a tridiag having a given number of rows.
1655  template <typename idx_type>
1656  static KOKKOS_FORCEINLINE_FUNCTION
1657  idx_type
1658  NumBlocks(const idx_type &nrows) { return nrows > 0 ? 3 * nrows - 2 : 0; }
1659  // Number of blocks associated to a Schur complement having a given number of rows.
1660  template <typename idx_type>
1661  static KOKKOS_FORCEINLINE_FUNCTION
1662  idx_type
1663  NumBlocksSchur(const idx_type &nrows) { return nrows > 0 ? 3 * nrows + 2 : 0; }
1664 };
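 // A minimal standalone sketch (illustration only) of the row-major tridiag
 // indexing above. For a 4-row tridiag the blocks are stored as
 // d0, u0, l1, d1, u1, l2, d2, u2, l3, d3: RowToIndex(r) gives the first block
 // of row r, IndexToRow inverts it, and NumBlocks(4) == 10.
 #if 0
 #include <cassert>
 int main() {
   auto IndexToRow = [](int ind) { return (ind + 1) / 3; };
   auto RowToIndex = [](int row) { return row > 0 ? 3 * row - 1 : 0; };
   auto NumBlocks = [](int nrows) { return nrows > 0 ? 3 * nrows - 2 : 0; };
   assert(NumBlocks(4) == 10);  // 4 diagonal + 3 upper + 3 lower blocks
   for (int r = 0; r < 4; ++r)
     assert(IndexToRow(RowToIndex(r)) == r);  // the two maps round-trip
   return 0;
 }
 #endif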
1665 
1669 template <typename MatrixType>
1670 BlockTridiags<MatrixType>
1671 createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1672  IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
1673  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1674  using execution_space = typename impl_type::execution_space;
1675  using local_ordinal_type = typename impl_type::local_ordinal_type;
1676  using size_type = typename impl_type::size_type;
1677  using size_type_2d_view = typename impl_type::size_type_2d_view;
1678 
1679  constexpr int vector_length = impl_type::vector_length;
1680 
1681  BlockTridiags<MatrixType> btdm;
1682 
1683  const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1684 
1685  { // construct the flat index pointers into the tridiag values array.
1686  btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1687  const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part);
1688  Kokkos::parallel_scan(
1689  "createBlockTridiags::RangePolicy::flat_td_ptr",
1690  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1691  const local_ordinal_type partidx = i / (2 * interf.n_subparts_per_part);
1692  const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1693 
1694  if (final) {
1695  btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1696  }
1697  if (local_subpartidx != (2 * interf.n_subparts_per_part - 1)) {
1698  const local_ordinal_type nrows = interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 1) - interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 0);
1699  if (local_subpartidx % 2 == 0)
1700  update += btdm.NumBlocks(nrows);
1701  else
1702  update += btdm.NumBlocksSchur(nrows);
1703  }
1704  });
1705 
1706  const auto nblocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts - 1, 2 * interf.n_subparts_per_part - 1));
1707  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1708  }
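 // The scan above uses the standard Kokkos exclusive-scan idiom: on the final
 // pass, `update` holds the sum of all contributions with index < i, which is
 // written out *before* this element's own contribution is added. A minimal
 // standalone sketch of the same idiom (illustration only; `counts`, `offsets`,
 // and `n` are hypothetical):
 #if 0
 Kokkos::View<int *> counts("counts", n), offsets("offsets", n);
 Kokkos::parallel_scan(
     "exclusive_offsets", n,
     KOKKOS_LAMBDA(const int i, int &update, const bool final) {
       if (final) offsets(i) = update;  // exclusive prefix sum
       update += counts(i);
     });
 #endif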
1709 
1710  // And the packed index pointers.
1711  if (vector_length == 1) {
1712  btdm.pack_td_ptr = btdm.flat_td_ptr;
1713  } else {
1714  // const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1715 
1716  local_ordinal_type npacks_per_subpart = 0;
1717  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1718  Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1719  for (local_ordinal_type ip = 1; ip <= interf.nparts; ++ip) // n_sub_parts_and_schur
1720  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1721  ++npacks_per_subpart;
1722 
1723  btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1724  const Kokkos::RangePolicy<execution_space> policy(0, npacks_per_subpart);
1725 
1726  Kokkos::parallel_for(
1727  "createBlockTridiags::RangePolicy::pack_td_ptr",
1728  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1729  for (local_ordinal_type j = 0; j < 2 * interf.n_subparts_per_part; ++j) {
1730  const local_ordinal_type pack_id = (j == 2 * interf.n_subparts_per_part - 1) ? i + (j - 1) * npacks_per_subpart : i + j * npacks_per_subpart;
1731  const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id + 1) - interf.packptr_sub(pack_id);
1732 
1733  const local_ordinal_type parti = interf.packptr_sub(pack_id);
1734  const local_ordinal_type partidx = parti % interf.nparts;
1735 
1736  for (local_ordinal_type pti = 0; pti < nparts_in_pack; ++pti) {
1737  btdm.pack_td_ptr(partidx + pti, j) = btdm.flat_td_ptr(i, j);
1738  }
1739  }
1740  });
1741  }
1742 
1743  btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1744 
1745  const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1746  constexpr local_ordinal_type connection_length = 2;
1747 
1748  host_pack_td_ptr_schur(0, 0) = 0;
1749  for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1750  if (i % vector_length == 0) {
1751  if (i != 0)
1752  host_pack_td_ptr_schur(i, 0) = host_pack_td_ptr_schur(i - 1, host_pack_td_ptr_schur.extent(1) - 1);
1753  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part - 1; ++j) {
1754  host_pack_td_ptr_schur(i, j + 1) = host_pack_td_ptr_schur(i, j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part - 2 ? 1 : 0);
1755  }
1756  } else {
1757  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1758  host_pack_td_ptr_schur(i, j) = host_pack_td_ptr_schur(i - 1, j);
1759  }
1760  }
1761  }
1762 
1763  Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1764 
1765 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1766  const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1767  std::cout << "flat_td_ptr = " << std::endl;
1768  for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1769  for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1770  std::cout << host_flat_td_ptr(i, j) << " ";
1771  }
1772  std::cout << std::endl;
1773  }
1774  std::cout << "flat_td_ptr end" << std::endl;
1775 
1776  const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1777 
1778  std::cout << "pack_td_ptr = " << std::endl;
1779  for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1780  for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1781  std::cout << host_pack_td_ptr(i, j) << " ";
1782  }
1783  std::cout << std::endl;
1784  }
1785  std::cout << "pack_td_ptr end" << std::endl;
1786 
1787  std::cout << "pack_td_ptr_schur = " << std::endl;
1788  for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1789  for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1790  std::cout << host_pack_td_ptr_schur(i, j) << " ";
1791  }
1792  std::cout << std::endl;
1793  }
1794  std::cout << "pack_td_ptr_schur end" << std::endl;
1795 #endif
1796 
1797  // values and A_colindsub are created in the symbolic phase
1798  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1799 
1800  return btdm;
1801 }
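 // Note on the vector_length == 1 branch above: Kokkos::View has shared-pointer
 // semantics, so assigning flat_td_ptr to pack_td_ptr aliases the same
 // allocation rather than copying it. A minimal sketch (illustration only):
 #if 0
 Kokkos::View<size_t **> a("a", 3, 4);
 Kokkos::View<size_t **> b = a;  // b and a now share one allocation
 a(0, 0) = 42;                   // visible through b(0, 0) as well
 #endif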
1802 
1803 // Set the tridiags to be I to the full pack block size. That way, if a
1804 // tridiag within a pack is shorter than the longest one, the extra blocks are
1805 // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1806 // in the packed multivector are 0, and the tridiag LU reflects the extra I
1807 // blocks, then the solve proceeds as though the extra blocks aren't
1808 // present. Since this extra work is part of the SIMD calls, it's not actually
1809 // extra work. Instead, it means we don't have to put checks or masks in, or
1810 // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1811 // since the numeric phase fills in only the used entries, leaving these I
1812 // blocks intact.
1813 template <typename MatrixType>
1814 void setTridiagsToIdentity(const BlockTridiags<MatrixType> &btdm,
1815  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
1816  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1817  using execution_space = typename impl_type::execution_space;
1818  using local_ordinal_type = typename impl_type::local_ordinal_type;
1819  using size_type_2d_view = typename impl_type::size_type_2d_view;
1820 
1821  const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1822  const local_ordinal_type blocksize = btdm.values.extent(1);
1823 
1824  {
1825  const int vector_length = impl_type::vector_length;
1826  const int internal_vector_length = impl_type::internal_vector_length;
1827 
1828  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1829  using internal_vector_type = typename impl_type::internal_vector_type;
1830  using internal_vector_type_4d_view =
1831  typename impl_type::internal_vector_type_4d_view;
1832 
1833  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1834  const internal_vector_type_4d_view values(reinterpret_cast<internal_vector_type *>(btdm.values.data()),
1835  btdm.values.extent(0),
1836  btdm.values.extent(1),
1837  btdm.values.extent(2),
1838  vector_length / internal_vector_length);
1839  const local_ordinal_type vector_loop_size = values.extent(3);
1840 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1841  local_ordinal_type total_team_size(0);
1842  if (blocksize <= 5)
1843  total_team_size = 32;
1844  else if (blocksize <= 9)
1845  total_team_size = 64;
1846  else if (blocksize <= 12)
1847  total_team_size = 96;
1848  else if (blocksize <= 16)
1849  total_team_size = 128;
1850  else // blocksize > 16
1851  total_team_size = 160;
1854  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1855  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1856 #elif defined(KOKKOS_ENABLE_HIP)
1857  // FIXME: HIP
1858  // These settings might be completely wrong
1859  // will have to do some experiments to decide
1860  // what makes sense on AMD GPUs
1861  local_ordinal_type total_team_size(0);
1862  if (blocksize <= 5)
1863  total_team_size = 32;
1864  else if (blocksize <= 9)
1865  total_team_size = 64;
1866  else if (blocksize <= 12)
1867  total_team_size = 96;
1868  else if (blocksize <= 16)
1869  total_team_size = 128;
1870  else // blocksize > 16
1871  total_team_size = 160;
1874  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1875  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1876 #elif defined(KOKKOS_ENABLE_SYCL)
1877  // SYCL: FIXME
1878  local_ordinal_type total_team_size(0);
1879  if (blocksize <= 5)
1880  total_team_size = 32;
1881  else if (blocksize <= 9)
1882  total_team_size = 64;
1883  else if (blocksize <= 12)
1884  total_team_size = 96;
1885  else if (blocksize <= 16)
1886  total_team_size = 128;
1887  else // blocksize > 16
1888  total_team_size = 160;
1891  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1892  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1893 #else
1894  // Host architecture: team size is always one
1895  const team_policy_type policy(packptr.extent(0) - 1, 1, 1);
1896 #endif
1897  Kokkos::parallel_for(
1898  "setTridiagsToIdentity::TeamPolicy",
1899  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1900  const local_ordinal_type k = member.league_rank();
1901  const local_ordinal_type ibeg = pack_td_ptr(packptr(k), 0);
1902  const local_ordinal_type iend = pack_td_ptr(packptr(k), pack_td_ptr.extent(1) - 1);
1903 
1904  const local_ordinal_type diff = iend - ibeg;
1905  const local_ordinal_type icount = diff / 3 + (diff % 3 > 0);
1906  const btdm_scalar_type one(1);
1907  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
1908  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, icount), [&](const local_ordinal_type &ii) {
1909  const local_ordinal_type i = ibeg + ii * 3;
1910  for (local_ordinal_type j = 0; j < blocksize; ++j) {
1911  values(i, j, j, v) = one;
1912  }
1913  });
1914  });
1915  });
1916  }
1917 }
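 // The functor above re-views btdm.values through a reinterpret_cast so that one
 // SIMD vector of width vector_length is addressed as
 // vector_length / internal_vector_length shorter lanes. A minimal sketch of
 // that repacking (illustration only; the view names and sizes are hypothetical):
 #if 0
 using vec8 = KokkosBatched::Vector<KokkosBatched::SIMD<double>, 8>;
 using vec2 = KokkosBatched::Vector<KokkosBatched::SIMD<double>, 2>;
 Kokkos::View<vec8 ***> values("values", nblocks, bs, bs);
 // Same bytes, one extra trailing dimension of 8 / 2 = 4 internal lanes:
 Kokkos::View<vec2 ****> internal(reinterpret_cast<vec2 *>(values.data()),
                                  values.extent(0), values.extent(1),
                                  values.extent(2), 8 / 2);
 #endif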
1918 
1922 template <typename MatrixType>
1923 void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1924  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1925  const BlockHelperDetails::PartInterface<MatrixType> &interf,
1926  BlockTridiags<MatrixType> &btdm,
1927  BlockHelperDetails::AmD<MatrixType> &amd,
1928  const bool overlap_communication_and_computation,
1929  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
1930  bool useSeqMethod,
1931  bool use_fused_jacobi) {
1932  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);
1933 
1934  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1935 
1936  using execution_space = typename impl_type::execution_space;
1937 
1938  using local_ordinal_type = typename impl_type::local_ordinal_type;
1939  using global_ordinal_type = typename impl_type::global_ordinal_type;
1940  using size_type = typename impl_type::size_type;
1941  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1942  using size_type_1d_view = typename impl_type::size_type_1d_view;
1943  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1944  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1945  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
1946  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1947  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1948  using lo_traits = Tpetra::Details::OrdinalTraits<local_ordinal_type>;
1949 
1950  constexpr int vector_length = impl_type::vector_length;
1951 
1952  const auto comm = A->getRowMap()->getComm();
1953 
1954  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1955  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1956 
1957  bool hasBlockCrsMatrix = !A_bcrs.is_null();
1958  TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1959  const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows();
1960 
1961  const auto partptr = interf.partptr;
1962  const auto lclrow = interf.lclrow;
1963  const auto rowidx2part = interf.rowidx2part;
1964  const auto part2rowidx0 = interf.part2rowidx0;
1965  const auto packptr = interf.packptr;
1966 
1967  // TODO: add nrows as a member of part interface?
1968  const local_ordinal_type nrows = Kokkos::create_mirror_view_and_copy(
1969  Kokkos::HostSpace(), Kokkos::subview(partptr, partptr.extent(0) - 1))();
1970 
1971  Kokkos::View<local_ordinal_type *, execution_space> col2row("col2row", A->getLocalNumCols());
1972 
1973  // Build the column-to-row map (the fill itself runs on device).
1974 
1975  Kokkos::deep_copy(execution_space(), col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1976  {
1977  TEUCHOS_ASSERT(!(g->getRowMap().is_null() || g->getColMap().is_null() || g->getDomainMap().is_null()));
1978 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1979  {
1980  // On host: check that row, col, domain maps are consistent
1981  auto rowmapHost = g->getRowMap();
1982  auto colmapHost = g->getColMap();
1983  auto dommapHost = g->getDomainMap();
1984  for (local_ordinal_type lr = 0; lr < nrows; lr++) {
1985  const global_ordinal_type gid = rowmapHost->getGlobalElement(lr);
1987  if (dommapHost->isNodeGlobalElement(gid)) {
1988  const local_ordinal_type lc = colmapHost->getLocalElement(gid);
1989  TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1990  BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
1991  << " gives an invalid local column.");
1992  }
1993  }
1994  }
1995 #endif
1996  auto rowmap = g->getRowMap()->getLocalMap();
1997  auto colmap = g->getColMap()->getLocalMap();
1998  auto dommap = g->getDomainMap()->getLocalMap();
1999 
2000  const Kokkos::RangePolicy<execution_space> policy(0, nrows);
2001  Kokkos::parallel_for(
2002  "performSymbolicPhase::RangePolicy::col2row",
2003  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2004  const global_ordinal_type gid = rowmap.getGlobalElement(lr);
2005  if (dommap.getLocalElement(gid) != lo_traits::invalid()) {
2006  const local_ordinal_type lc = colmap.getLocalElement(gid);
2007  col2row(lc) = lr;
2008  }
2009  });
2010  }
2011 
2012  // construct the D and R graphs in A = D + R.
2013  {
2014  const auto local_graph = g->getLocalGraphDevice();
2015  const auto local_graph_rowptr = local_graph.row_map;
2016  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
2017  const auto local_graph_colidx = local_graph.entries;
2018 
2019  // assume no overlap.
2020 
2021  Kokkos::View<local_ordinal_type *, execution_space> lclrow2idx("lclrow2idx", nrows);
2022  {
2023  const Kokkos::RangePolicy<execution_space> policy(0, nrows);
2024  Kokkos::parallel_for(
2025  "performSymbolicPhase::RangePolicy::lclrow2idx",
2026  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
2027  lclrow2idx(lclrow(i)) = i;
2028  });
2029  }
2030 
2031  // count (block) nnzs in D and R.
2032  size_type D_nnz, R_nnz_owned, R_nnz_remote;
2033  {
2034  const Kokkos::RangePolicy<execution_space> policy(0, nrows);
2035  Kokkos::parallel_reduce
2036  // profiling interface does not work
2037  ( //"performSymbolicPhase::RangePolicy::count_nnz",
2038  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, size_type &update_D_nnz, size_type &update_R_nnz_owned, size_type &update_R_nnz_remote) {
2039  // LID -> index.
2040  const local_ordinal_type ri0 = lclrow2idx(lr);
2041  const local_ordinal_type pi0 = rowidx2part(ri0);
2042  for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) {
2043  const local_ordinal_type lc = local_graph_colidx(j);
2044  const local_ordinal_type lc2r = col2row(lc);
2045  bool incr_R = false;
2046  do { // breakable
2047  if (lc2r == (local_ordinal_type)-1) {
2048  incr_R = true;
2049  break;
2050  }
2051  const local_ordinal_type ri = lclrow2idx(lc2r);
2052  const local_ordinal_type pi = rowidx2part(ri);
2053  if (pi != pi0) {
2054  incr_R = true;
2055  break;
2056  }
2057  // Test for being in the tridiag. This is done in index space. In
2058  // LID space, tridiag LIDs in a row are not necessarily related by
2059  // {-1, 0, 1}.
2060  if (ri0 + 1 >= ri && ri0 <= ri + 1)
2061  ++update_D_nnz;
2062  else
2063  incr_R = true;
2064  } while (0);
2065  if (incr_R) {
2066  if (lc < nrows)
2067  ++update_R_nnz_owned;
2068  else
2069  ++update_R_nnz_remote;
2070  }
2071  }
2072  },
2073  D_nnz, R_nnz_owned, R_nnz_remote);
2074  }
2075 
2076  if (!overlap_communication_and_computation) {
2077  R_nnz_owned += R_nnz_remote;
2078  R_nnz_remote = 0;
2079  }
2080 
2081  // construct the D_00 graph.
2082  {
2083  const auto flat_td_ptr = btdm.flat_td_ptr;
2084 
2085  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
2086  const auto D_A_colindsub = btdm.A_colindsub;
2087 
2088 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2089  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2090 #endif
2091 
2092  const local_ordinal_type nparts = partptr.extent(0) - 1;
2093 
2094  {
2095  const Kokkos::RangePolicy<execution_space> policy(0, nparts);
2096  Kokkos::parallel_for(
2097  "performSymbolicPhase::RangePolicy<execution_space>::D_graph",
2098  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
2099  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2100  local_ordinal_type offset = 0;
2101  for (local_ordinal_type ri0 = partptr(pi0); ri0 < partptr(pi0 + 1); ++ri0) {
2102  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2103  offset = 1;
2104  const local_ordinal_type lr0 = lclrow(ri0);
2105  const size_type j0 = local_graph_rowptr(lr0);
2106  for (size_type j = j0; j < local_graph_rowptr(lr0 + 1); ++j) {
2107  const local_ordinal_type lc = local_graph_colidx(j);
2108  const local_ordinal_type lc2r = col2row[lc];
2109  if (lc2r == (local_ordinal_type)-1) continue;
2110  const local_ordinal_type ri = lclrow2idx[lc2r];
2111  const local_ordinal_type pi = rowidx2part(ri);
2112  if (pi != pi0) continue;
2113  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
2114  const local_ordinal_type row_entry = j - j0;
2115  D_A_colindsub(flat_td_ptr(pi0, 0) + ((td_row_os + ri) - ri0)) = row_entry;
2116  }
2117  }
2118  });
2119  }
2120 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2121  {
2122  auto D_A_colindsub_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), D_A_colindsub);
2123  for (size_t i = 0; i < D_A_colindsub_host.extent(0); ++i)
2124  TEUCHOS_ASSERT(D_A_colindsub_host(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2125  }
2126 #endif
2127 
2128  // Allocate values.
2129  {
2130  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0) - 1, btdm.pack_td_ptr.extent(1) - 1);
2131  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2132  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2133 
2134  if (interf.n_subparts_per_part > 1) {
2135  const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0) - 1, btdm.pack_td_ptr_schur.extent(1) - 1);
2136  const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2137  btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2138  }
2139 
2140  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2141  }
2142  }
2143 
2144  // Construct the R graph.
2145  {
2146  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2147  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2148 
2149  const auto R_rowptr = amd.rowptr;
2150  const auto R_A_colindsub = amd.A_colindsub;
2151 
2152  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2153  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2154 
2155  const auto R_rowptr_remote = amd.rowptr_remote;
2156  const auto R_A_colindsub_remote = amd.A_colindsub_remote;
2157 
2158  {
2159  const Kokkos::RangePolicy<execution_space> policy(0, nrows);
2160  Kokkos::parallel_for(
2161  "performSymbolicPhase::RangePolicy<execution_space>::R_graph_count",
2162  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2163  const local_ordinal_type ri0 = lclrow2idx[lr];
2164  const local_ordinal_type pi0 = rowidx2part(ri0);
2165  const size_type j0 = local_graph_rowptr(lr);
2166  for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2167  const local_ordinal_type lc = local_graph_colidx(j);
2168  const local_ordinal_type lc2r = col2row[lc];
2169  if (lc2r != (local_ordinal_type)-1) {
2170  const local_ordinal_type ri = lclrow2idx[lc2r];
2171  const local_ordinal_type pi = rowidx2part(ri);
2172  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2173  continue;
2174  }
2175  }
2176  // exclusive scan will be performed later
2177  if (!overlap_communication_and_computation || lc < nrows) {
2178  ++R_rowptr(lr);
2179  } else {
2180  ++R_rowptr_remote(lr);
2181  }
2182  }
2183  });
2184  }
2185 
2186  // exclusive scan
2187  typedef BlockHelperDetails::ArrayValueType<size_type, 2> update_type;
2188  {
2189  Kokkos::RangePolicy<execution_space> policy(0, nrows + 1);
2190  Kokkos::parallel_scan(
2191  "performSymbolicPhase::RangePolicy<execution_space>::R_graph_fill",
2192  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, update_type &update, const bool &final) {
2193  update_type val;
2194  val.v[0] = R_rowptr(lr);
2195  if (overlap_communication_and_computation)
2196  val.v[1] = R_rowptr_remote(lr);
2197 
2198  if (final) {
2199  R_rowptr(lr) = update.v[0];
2200  if (overlap_communication_and_computation)
2201  R_rowptr_remote(lr) = update.v[1];
2202 
2203  if (lr < nrows) {
2204  const local_ordinal_type ri0 = lclrow2idx[lr];
2205  const local_ordinal_type pi0 = rowidx2part(ri0);
2206 
2207  size_type cnt_rowptr = R_rowptr(lr);
2208  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
2209 
2210  const size_type j0 = local_graph_rowptr(lr);
2211  for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2212  const local_ordinal_type lc = local_graph_colidx(j);
2213  const local_ordinal_type lc2r = col2row[lc];
2214  if (lc2r != (local_ordinal_type)-1) {
2215  const local_ordinal_type ri = lclrow2idx[lc2r];
2216  const local_ordinal_type pi = rowidx2part(ri);
2217  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2218  continue;
2219  }
2220  const local_ordinal_type row_entry = j - j0;
2221  if (!overlap_communication_and_computation || lc < nrows)
2222  R_A_colindsub(cnt_rowptr++) = row_entry;
2223  else
2224  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2225  }
2226  }
2227  }
2228  update += val;
2229  });
2230  }
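 // The owned and remote counters above are scanned in one pass by giving
 // Kokkos a two-component value type (the update_type in the lambda). A
 // minimal sketch of such a type (illustration only; the real ArrayValueType
 // lives in the block helper details):
 #if 0
 struct Sum2 {
   long v[2];
   KOKKOS_INLINE_FUNCTION Sum2() { v[0] = v[1] = 0; }
   KOKKOS_INLINE_FUNCTION Sum2 &operator+=(const Sum2 &o) {
     v[0] += o.v[0];
     v[1] += o.v[1];
     return *this;
   }
 };
 #endif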
2231  {
2232  // Check that the last elements of R_rowptr (aka amd.rowptr)
2233  // and R_rowptr_remote (aka amd.rowptr_remote) match the expected entry counts
2234  auto r_rowptr_end = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(R_rowptr, nrows));
2235  TEUCHOS_ASSERT(r_rowptr_end() == R_nnz_owned);
2236  if (overlap_communication_and_computation) {
2237  auto r_rowptr_remote_end = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(R_rowptr_remote, nrows));
2238  TEUCHOS_ASSERT(r_rowptr_remote_end() == R_nnz_remote);
2239  }
2240  }
2241 
2242  // Allocate or view values.
2243  if (hasBlockCrsMatrix)
2244  amd.tpetra_values = (const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst());
2245  else {
2246  amd.tpetra_values = (const_cast<crs_matrix_type *>(A_crs.get()))->getLocalValuesDevice(Tpetra::Access::ReadWrite);
2247  }
2248  }
2249 
2250  // Allocate view for E and initialize the values with B:
2251 
2252  if (interf.n_subparts_per_part > 1)
2253  btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2254  }
2255  // Precompute offsets of each A and x entry to speed up residual.
2256  // Applies if all of these are true:
2257  // - hasBlockCrsMatrix
2258  // - execution_space is a GPU
2259  // - !useSeqMethod (since this uses a different scheme for indexing A,x)
2260  //
2261  // Reading A and x takes up to 4 and 6 levels of indirection, respectively,
2262  // but precomputing the offsets reduces it to 2 for both (get index, then value)
2263  if (BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix) {
2264  bool is_async_importer_active = !async_importer.is_null();
2265  local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2266  bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2267  BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2268  }
2269 
2270  // If using fused block Jacobi path, allocate diagonal inverses here (d_inv) and find diagonal offsets.
2271  if (use_fused_jacobi) {
2272  btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize);
2273  auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2274  auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2275  btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2276  }
2277  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2278 }
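 // Sketch of how diag_offsets would be consumed (illustration only; `i`, `r`,
 // `c`, and `A_values` are hypothetical, and blocks are assumed stored
 // row-major, as d_inv is): the (r, c) entry of the diagonal block of block
 // row i sits at a fixed offset into the flat value array.
 #if 0
 const auto entry = A_values(btdm.diag_offsets(i) + r * blocksize + c);
 #endif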
2279 
2283 template <typename ArgActiveExecutionMemorySpace>
2284 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
2285 
2286 template <>
2287 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2288  typedef KB::Mode::Serial mode_type;
2289 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2290  typedef KB::Algo::Level3::CompactMKL algo_type;
2291 #else
2292  typedef KB::Algo::Level3::Blocked algo_type;
2293 #endif
2294  static int recommended_team_size(const int /* blksize */,
2295  const int /* vector_length */,
2296  const int /* internal_vector_length */) {
2297  return 1;
2298  }
2299 };
2300 
2301 #if defined(KOKKOS_ENABLE_CUDA)
2302 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
2303  const int vector_length,
2304  const int internal_vector_length) {
2305  const int vector_size = vector_length / internal_vector_length;
2306  int total_team_size(0);
2307  if (blksize <= 5)
2308  total_team_size = 32;
2309  else if (blksize <= 9)
2310  total_team_size = 32; // 64
2311  else if (blksize <= 12)
2312  total_team_size = 96;
2313  else if (blksize <= 16)
2314  total_team_size = 128;
2315  else // blksize > 16
2316  total_team_size = 160;
2319  return 2 * total_team_size / vector_size;
2320 }
2321 template <>
2322 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2323  typedef KB::Mode::Team mode_type;
2324  typedef KB::Algo::Level3::Unblocked algo_type;
2325  static int recommended_team_size(const int blksize,
2326  const int vector_length,
2327  const int internal_vector_length) {
2328  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2329  }
2330 };
2331 template <>
2332 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2333  typedef KB::Mode::Team mode_type;
2334  typedef KB::Algo::Level3::Unblocked algo_type;
2335  static int recommended_team_size(const int blksize,
2336  const int vector_length,
2337  const int internal_vector_length) {
2338  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2339  }
2340 };
2341 #endif
2342 
2343 #if defined(KOKKOS_ENABLE_HIP)
2344 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
2345  const int vector_length,
2346  const int internal_vector_length) {
2347  const int vector_size = vector_length / internal_vector_length;
2348  int total_team_size(0);
2349  if (blksize <= 5)
2350  total_team_size = 32;
2351  else if (blksize <= 9)
2352  total_team_size = 32; // 64
2353  else if (blksize <= 12)
2354  total_team_size = 96;
2355  else if (blksize <= 16)
2356  total_team_size = 128;
2357  else // blksize > 16
2358  total_team_size = 160;
2361  return 2 * total_team_size / vector_size;
2362 }
2363 template <>
2364 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2365  typedef KB::Mode::Team mode_type;
2366  typedef KB::Algo::Level3::Unblocked algo_type;
2367  static int recommended_team_size(const int blksize,
2368  const int vector_length,
2369  const int internal_vector_length) {
2370  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2371  }
2372 };
2373 template <>
2374 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2375  typedef KB::Mode::Team mode_type;
2376  typedef KB::Algo::Level3::Unblocked algo_type;
2377  static int recommended_team_size(const int blksize,
2378  const int vector_length,
2379  const int internal_vector_length) {
2380  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2381  }
2382 };
2383 #endif
2384 
2385 #if defined(KOKKOS_ENABLE_SYCL)
2386 static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
2387  const int vector_length,
2388  const int internal_vector_length) {
2389  const int vector_size = vector_length / internal_vector_length;
2390  int total_team_size(0);
2391  if (blksize <= 5)
2392  total_team_size = 32;
2393  else if (blksize <= 9)
2394  total_team_size = 32; // 64
2395  else if (blksize <= 12)
2396  total_team_size = 96;
2397  else if (blksize <= 16)
2398  total_team_size = 128;
2399  else // blksize > 16
2400  total_team_size = 160;
2403  return 2 * total_team_size / vector_size;
2404 }
2405 template <>
2406 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2407  typedef KB::Mode::Team mode_type;
2408  typedef KB::Algo::Level3::Unblocked algo_type;
2409  static int recommended_team_size(const int blksize,
2410  const int vector_length,
2411  const int internal_vector_length) {
2412  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2413  }
2414 };
2415 template <>
2416 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2417  typedef KB::Mode::Team mode_type;
2418  typedef KB::Algo::Level3::Unblocked algo_type;
2419  static int recommended_team_size(const int blksize,
2420  const int vector_length,
2421  const int internal_vector_length) {
2422  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2423  }
2424 };
2425 #endif
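 // Hypothetical usage of the trait specializations above: the extract/factorize
 // kernels can select the batched mode, algorithm, and team size for the
 // current memory space at compile time (sketch only; `execution_space`,
 // `blocksize`, `vector_length`, and `internal_vector_length` are illustrative):
 #if 0
 using mode_and_algo = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<
     typename execution_space::memory_space>;
 using mode_type = typename mode_and_algo::mode_type;
 using algo_type = typename mode_and_algo::algo_type;
 const int team_size = mode_and_algo::recommended_team_size(
     blocksize, vector_length, internal_vector_length);
 #endif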
2426 
2427 template <typename impl_type, typename WWViewType>
2428 KOKKOS_INLINE_FUNCTION void
2429 solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2430  const typename impl_type::local_ordinal_type & /* blocksize */,
2431  const typename impl_type::local_ordinal_type &i0,
2432  const typename impl_type::local_ordinal_type &r0,
2433  const typename impl_type::local_ordinal_type &nrows,
2434  const typename impl_type::local_ordinal_type &v,
2435  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2436  const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2437  const WWViewType &WW,
2438  const bool skip_first_pass = false) {
2439  using execution_space = typename impl_type::execution_space;
2440  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2441  using member_type = typename team_policy_type::member_type;
2442  using local_ordinal_type = typename impl_type::local_ordinal_type;
2443 
2444  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2445 
2446  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2447  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2448 
2449  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2450 
2451  // constant
2452  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2453  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2454 
2455  // subview pattern
2456  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2457  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2458  auto X2 = X1;
2459 
2460  local_ordinal_type i = i0, r = r0;
2461 
2462  if (nrows > 1) {
2463  // solve Lx = x
2464  if (skip_first_pass) {
2465  i += (nrows - 2) * 3;
2466  r += (nrows - 2);
2467  // The Gemm of the standard first pass is skipped; point A directly at the Trsm block.
2468  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2469  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2470  KB::Trsm<member_type,
2471  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2472  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2473  X1.assign_data(X2.data());
2474  i += 3;
2475  } else {
2476  KB::Trsm<member_type,
2477  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2478  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2479  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
2480  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2481  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2482  member.team_barrier();
2483  KB::Gemm<member_type,
2484  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2485  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2486  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2487  KB::Trsm<member_type,
2488  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2489  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2490  X1.assign_data(X2.data());
2491  }
2492  }
2493 
2494  // solve Ux = x
2495  KB::Trsm<member_type,
2496  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2497  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2498  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2499  i -= 3;
2500  A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
2501  X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
2502  member.team_barrier();
2503  KB::Gemm<member_type,
2504  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2505  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2506 
2507  A.assign_data(&D_internal_vector_values(i, 0, 0, v));
2508  KB::Trsm<member_type,
2509  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2510  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2511  X1.assign_data(X2.data());
2512  }
2513  } else {
2514  // matrix is already inverted
2515  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2516  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
2517  member.team_barrier();
2518  KB::Gemm<member_type,
2519  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2520  default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
2521  }
2522 }
2523 
2524 template <typename impl_type, typename WWViewType, typename XViewType>
2525 KOKKOS_INLINE_FUNCTION void
2526 solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2527  const typename impl_type::local_ordinal_type &blocksize,
2528  const typename impl_type::local_ordinal_type &i0,
2529  const typename impl_type::local_ordinal_type &r0,
2530  const typename impl_type::local_ordinal_type &nrows,
2531  const typename impl_type::local_ordinal_type &v,
2532  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2533  const XViewType &X_internal_vector_values, // Unmanaged<typename impl_type::internal_vector_type_4d_view>
2534  const WWViewType &WW) {
2535  using execution_space = typename impl_type::execution_space;
2536  // using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2537  // using member_type = typename team_policy_type::member_type;
2538  using local_ordinal_type = typename impl_type::local_ordinal_type;
2539 
2540  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2541 
2542  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2543  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2544 
2545  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2546 
2547  // base pointers
2548  auto A = D_internal_vector_values.data();
2549  auto X = X_internal_vector_values.data();
2550 
2551  // constants
2552  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2553  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2554  // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2555 
2556  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2557  const local_ordinal_type astep = D_internal_vector_values.stride(0);
2558  const local_ordinal_type as0 = D_internal_vector_values.stride(1); // blocksize*vector_length;
2559  const local_ordinal_type as1 = D_internal_vector_values.stride(2); // vector_length;
2560  const local_ordinal_type xstep = X_internal_vector_values.stride(0);
2561  const local_ordinal_type xs0 = X_internal_vector_values.stride(1); // vector_length;
2562 
2563  // move to starting point
2564  A += i0 * astep + v;
2565  X += r0 * xstep + v;
2566 
2567  // for (local_ordinal_type col=0;col<num_vectors;++col)
2568  if (nrows > 1) {
2569  // solve Lx = x
2570  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2571  member,
2572  KB::Diag::Unit,
2573  blocksize, blocksize,
2574  one,
2575  A, as0, as1,
2576  X, xs0);
2577 
2578  for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
2579  member.team_barrier();
2580  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2581  member,
2582  blocksize, blocksize,
2583  -one,
2584  A + 2 * astep, as0, as1,
2585  X, xs0,
2586  one,
2587  X + 1 * xstep, xs0);
2588  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2589  member,
2590  KB::Diag::Unit,
2591  blocksize, blocksize,
2592  one,
2593  A + 3 * astep, as0, as1,
2594  X + 1 * xstep, xs0);
2595 
2596  A += 3 * astep;
2597  X += 1 * xstep;
2598  }
2599 
2600  // solve Ux = x
2601  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2602  member,
2603  KB::Diag::NonUnit,
2604  blocksize, blocksize,
2605  one,
2606  A, as0, as1,
2607  X, xs0);
2608 
2609  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2610  A -= 3 * astep;
2611  member.team_barrier();
2612  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2613  member,
2614  blocksize, blocksize,
2615  -one,
2616  A + 1 * astep, as0, as1,
2617  X, xs0,
2618  one,
2619  X - 1 * xstep, xs0);
2620  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2621  member,
2622  KB::Diag::NonUnit,
2623  blocksize, blocksize,
2624  one,
2625  A, as0, as1,
2626  X - 1 * xstep, xs0);
2627  X -= 1 * xstep;
2628  }
2629  // for multiple rhs
2630  // X += xs1;
2631  } else {
2632  const local_ordinal_type ws0 = WW.stride(0);
2633  auto W = WW.data() + v;
2634  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
2635  member, blocksize, X, xs0, W, ws0);
2636  member.team_barrier();
2637  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2638  member,
2639  blocksize, blocksize,
2640  one,
2641  A, as0, as1,
2642  W, ws0,
2643  zero,
2644  X, xs0);
2645  }
2646 }
2647 
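// NOTE (editorial): the three writers below are debug dumps, active only when
// IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM is defined. writeBTDValuesToFile
// assumes the packed block tridiagonal layout used throughout this file:
// blocks of a part are stored in triples, where index 3*k is the k-th diagonal
// block, 3*k+1 the super-diagonal block of row k, and 3*k+2 the sub-diagonal
// block of row k+1 (e.g. for a 3-row part: D0, B0, C1, D1, B1, C2, D2); the
// last view dimension indexes the SIMD lane, i.e. the part within a pack.
// The 4D/5D variants dump packed multivectors.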
2648 template <typename local_ordinal_type, typename ViewType>
2649 void writeBTDValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2650 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2651  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2652  std::ofstream myfile;
2653  myfile.open(fileName);
2654 
2655  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2656  local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2657  const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2658  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2659 
2660  const local_ordinal_type block_size = scalar_values.extent(1);
2661 
2662  const local_ordinal_type n_rows_per_part = (n_blocks_per_part + 2) / 3 * block_size;
2663  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2664 
2665  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2666 
2667  myfile << "%%MatrixMarket matrix coordinate real general" << std::endl;
2668  myfile << "%%nnz = " << nnz;
2669  myfile << " block size = " << block_size;
2670  myfile << " number of blocks = " << n_blocks;
2671  myfile << " number of parts = " << n_parts;
2672  myfile << " number of blocks per part = " << n_blocks_per_part;
2673  myfile << " number of rows = " << n_rows;
2674  myfile << " number of cols = " << n_rows;
2675  myfile << " number of packs = " << n_packs << std::endl;
2676 
2677  myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2678 
2679  local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2680  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2681  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2682  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2683  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2684  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2685  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2686  continue;
2687  if (i_block_in_part % 3 == 0) {
2688  current_row_offset = i_block_in_part / 3 * block_size;
2689  current_col_offset = i_block_in_part / 3 * block_size;
2690  } else if (i_block_in_part % 3 == 1) {
2691  current_row_offset = (i_block_in_part - 1) / 3 * block_size;
2692  current_col_offset = ((i_block_in_part - 1) / 3 + 1) * block_size;
2693  } else if (i_block_in_part % 3 == 2) {
2694  current_row_offset = ((i_block_in_part - 2) / 3 + 1) * block_size;
2695  current_col_offset = (i_block_in_part - 2) / 3 * block_size;
2696  }
2697  current_row_offset += current_part_idx * n_rows_per_part;
2698  current_col_offset += current_part_idx * n_rows_per_part;
2699  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2700  for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2701  current_row = current_row_offset + i_in_block + 1;
2702  current_col = current_col_offset + j_in_block + 1;
2703  myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2704  }
2705  }
2706  }
2707  }
2708  }
2709 
2710  myfile.close();
2711 #endif
2712 }
2713 
2714 template <typename local_ordinal_type, typename ViewType>
2715 void write4DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2716 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2717  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2718  std::ofstream myfile;
2719  myfile.open(fileName);
2720 
2721  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2722  const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2723  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2724 
2725  const local_ordinal_type block_size = scalar_values.extent(1);
2726  const local_ordinal_type n_cols = scalar_values.extent(2);
2727 
2728  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2729  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2730 
2731  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2732 
2733  myfile << "%%MatrixMarket matrix array real general" << std::endl;
2734  myfile << "%%block size = " << block_size;
2735  myfile << " number of blocks = " << n_blocks;
2736  myfile << " number of parts = " << n_parts;
2737  myfile << " number of blocks per part = " << n_blocks_per_part;
2738  myfile << " number of rows = " << n_rows;
2739  myfile << " number of cols = " << n_cols;
2740  myfile << " number of packs = " << n_packs << std::endl;
2741 
2742  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2743 
2744  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2745  (void)current_row_offset;
2746  (void)current_part_idx;
2747  for (local_ordinal_type j_in_block = 0; j_in_block < n_cols; ++j_in_block) {
2748  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2749  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2750  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2751  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2752  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2753 
2754  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2755  continue;
2756  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2757  myfile << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2758  }
2759  }
2760  }
2761  }
2762  }
2763  myfile.close();
2764 #endif
2765 }
2766 
2767 template <typename local_ordinal_type, typename ViewType>
2768 void write5DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2769 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2770  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2771  std::ofstream myfile;
2772  myfile.open(fileName);
2773 
2774  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2775  const local_ordinal_type n_blocks = scalar_values.extent(1) * n_parts_per_pack;
2776  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2777 
2778  const local_ordinal_type block_size = scalar_values.extent(2);
2779  const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2780  const local_ordinal_type n_cols = n_blocks_cols * block_size;
2781 
2782  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2783  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2784 
2785  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2786 
2787  myfile << "%%MatrixMarket matrix array real general" << std::endl;
2788  myfile << "%%block size = " << block_size;
2789  myfile << " number of blocks = " << n_blocks;
2790  myfile << " number of parts = " << n_parts;
2791  myfile << " number of blocks per part = " << n_blocks_per_part;
2792  myfile << " number of rows = " << n_rows;
2793  myfile << " number of cols = " << n_cols;
2794  myfile << " number of packs = " << n_packs << std::endl;
2795 
2796  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2797 
2798  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2799  (void)current_row_offset;
2800  (void)current_part_idx;
2801  for (local_ordinal_type i_block_col = 0; i_block_col < n_blocks_cols; ++i_block_col) {
2802  for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2803  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2804  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2805  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2806  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2807  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2808 
2809  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(1))
2810  continue;
2811  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2812  myfile << scalar_values(i_block_col, current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2813  }
2814  }
2815  }
2816  }
2817  }
2818  }
2819  myfile.close();
2820 #endif
2821 }
2822 
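// NOTE (editorial): copy3DView is a team-level deep copy of one 3D (sub)view
// into another, used to move blocks between the tridiagonal and the
// Schur-complement storage. It delegates to
// Kokkos::Experimental::local_deep_copy; the commented-out TeamVectorMDRange
// loop kept in the body is an equivalent hand-rolled alternative.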
2823 template <typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2824 KOKKOS_INLINE_FUNCTION void
2825 copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2826  /*
2827  // Kokkos::Experimental::local_deep_copy
2828  auto teamVectorRange =
2829  Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2830  member, view1.extent(0), view1.extent(1), view1.extent(2));
2831 
2832  Kokkos::parallel_for
2833  (teamVectorRange,
2834  [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2835  view1(i,j,k) = view2(i,j,k);
2836  });
2837  */
2838  Kokkos::Experimental::local_deep_copy(member, view1, view2);
2839 }
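// NOTE (editorial): ExtractAndFactorizeTridiags is the numeric-phase functor.
// It (1) extracts the block tridiagonal of each part directly from the Tpetra
// matrix (point CRS or block CRS), (2) LU-factorizes the sub-lines, and, when
// parts are split into sub-parts, (3) forms and factorizes the interface Schur
// complement via the ExtractBCDTag / ComputeETag / ComputeSchurTag /
// FactorizeSchurTag phases. The ScratchLevel template parameter selects Kokkos
// team-scratch level 0 or 1; performNumericPhase picks it from the per-team
// shared-memory requirement.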
2840 template <typename MatrixType, int ScratchLevel>
2841 struct ExtractAndFactorizeTridiags {
2842  public:
2843  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2844  // a functor cannot declare both device_type and execution_space; doing so causes a specialization error in Kokkos
2845  using execution_space = typename impl_type::execution_space;
2846  using memory_space = typename impl_type::memory_space;
2848  using local_ordinal_type = typename impl_type::local_ordinal_type;
2849  using size_type = typename impl_type::size_type;
2850  using impl_scalar_type = typename impl_type::impl_scalar_type;
2851  using magnitude_type = typename impl_type::magnitude_type;
2853  using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
2854  using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
2856  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2857  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2858  using size_type_1d_view = typename impl_type::size_type_1d_view;
2859  using size_type_2d_view = typename impl_type::size_type_2d_view;
2860  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2862  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2863  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2864  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2865  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2866  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2867  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2868  using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view;
2869  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
2870  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2871  using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2872  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2873  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2874  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2875  using local_crs_graph_type = typename impl_type::local_crs_graph_type;
2876  using colinds_view = typename local_crs_graph_type::entries_type;
2877 
2878  using internal_vector_type = typename impl_type::internal_vector_type;
2879  static constexpr int vector_length = impl_type::vector_length;
2880  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2881  static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2882  static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2883  // half_vector_length is used for block Jacobi factorization.
2884  // Shared memory requirement is twice as large (per vector lane) as for general tridi factorization, so
2885  // reducing vector length (if possible) keeps the shared requirement constant. This avoids the performance
2886  // cliff of switching from level 0 to level 1 scratch.
2887  static constexpr int half_vector_length = impl_type::half_vector_length;
2888 
2890  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2891  using member_type = typename team_policy_type::member_type;
2892 
2893  private:
2894  // part interface
2895  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2896  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2897  const local_ordinal_type max_partsz;
2898  // block crs matrix row pointers (the entry type could be Kokkos::UVMSpace::size_type, which is int)
2899  using size_type_1d_view_tpetra = Kokkos::View<size_t *, typename impl_type::node_device_type>;
2900  ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2901  ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2902  ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2903  // block tridiags
2904  const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2905  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2906  const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2907  const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2908  const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2909  const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2910  const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2911  const Unmanaged<size_type_1d_view> diag_offsets;
2912  // shared information
2913  const local_ordinal_type blocksize, blocksize_square;
2914  // diagonal safety
2915  const magnitude_type tiny;
2916  const local_ordinal_type vector_loop_size;
2917 
2918  bool hasBlockCrsMatrix;
2919 
2920  public:
2921  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2922  const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2923  const Teuchos::RCP<const row_matrix_type> &A_,
2924  const Teuchos::RCP<const crs_graph_type> &G_,
2925  const magnitude_type &tiny_)
2926  : // interface
2927  partptr(interf_.partptr)
2928  , lclrow(interf_.lclrow)
2929  , packptr(interf_.packptr)
2930  , packindices_sub(interf_.packindices_sub)
2931  , packptr_sub(interf_.packptr_sub)
2932  , partptr_sub(interf_.partptr_sub)
2933  , part2packrowidx0_sub(interf_.part2packrowidx0_sub)
2934  , packindices_schur(interf_.packindices_schur)
2935  , max_partsz(interf_.max_partsz)
2936  ,
2937  // block tridiags
2938  pack_td_ptr(btdm_.pack_td_ptr)
2939  , flat_td_ptr(btdm_.flat_td_ptr)
2940  , pack_td_ptr_schur(btdm_.pack_td_ptr_schur)
2941  , A_colindsub(btdm_.A_colindsub)
2942  , internal_vector_values((internal_vector_type *)btdm_.values.data(),
2943  btdm_.values.extent(0),
2944  btdm_.values.extent(1),
2945  btdm_.values.extent(2),
2946  vector_length / internal_vector_length)
2947  , internal_vector_values_schur((internal_vector_type *)btdm_.values_schur.data(),
2948  btdm_.values_schur.extent(0),
2949  btdm_.values_schur.extent(1),
2950  btdm_.values_schur.extent(2),
2951  vector_length / internal_vector_length)
2952  , e_internal_vector_values((internal_vector_type *)btdm_.e_values.data(),
2953  btdm_.e_values.extent(0),
2954  btdm_.e_values.extent(1),
2955  btdm_.e_values.extent(2),
2956  btdm_.e_values.extent(3),
2957  vector_length / internal_vector_length)
2958  , scalar_values((btdm_scalar_type *)btdm_.values.data(),
2959  btdm_.values.extent(0),
2960  btdm_.values.extent(1),
2961  btdm_.values.extent(2),
2962  vector_length)
2963  , scalar_values_schur((btdm_scalar_type *)btdm_.values_schur.data(),
2964  btdm_.values_schur.extent(0),
2965  btdm_.values_schur.extent(1),
2966  btdm_.values_schur.extent(2),
2967  vector_length)
2968  , e_scalar_values((btdm_scalar_type *)btdm_.e_values.data(),
2969  btdm_.e_values.extent(0),
2970  btdm_.e_values.extent(1),
2971  btdm_.e_values.extent(2),
2972  btdm_.e_values.extent(3),
2973  vector_length)
2974  , d_inv(btdm_.d_inv)
2975  , diag_offsets(btdm_.diag_offsets)
2976  , blocksize(btdm_.values.extent(1))
2977  , blocksize_square(blocksize * blocksize)
2978  ,
2979  // diagonal weight to avoid zero pivots
2980  tiny(tiny_)
2981  , vector_loop_size(vector_length / internal_vector_length) {
2982  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
2983  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
2984 
2985  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
2986  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
2987 
2988  hasBlockCrsMatrix = !A_bcrs.is_null();
2989 
2990  A_block_rowptr = G_->getLocalGraphDevice().row_map;
2991  if (hasBlockCrsMatrix) {
2992  A_values = const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst();
2993  } else {
2994  A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
2995  A_values = A_crs->getLocalValuesDevice(Tpetra::Access::ReadOnly);
2996  }
2997  }
2998 
2999  private:
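  // NOTE (editorial): this extract() overload is the per-thread variant used
  // when vector_loop_size == 1: it gathers the tridiagonal blocks of npacks
  // consecutive parts into the packed SIMD storage, one part per vector lane.
  // The npacks-shrinking logic at the bottom of the row loop retires lanes
  // whose (shorter) parts have run out of rows.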
3000  KOKKOS_INLINE_FUNCTION
3001  void
3002  extract(local_ordinal_type partidx,
3003  local_ordinal_type local_subpartidx,
3004  local_ordinal_type npacks) const {
3005 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3006  printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
3007 #endif
3008  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3009  const size_type kps = pack_td_ptr(partidx, local_subpartidx);
3010  local_ordinal_type kfs[vector_length] = {};
3011  local_ordinal_type ri0[vector_length] = {};
3012  local_ordinal_type nrows[vector_length] = {};
3013 
3014  for (local_ordinal_type vi = 0; vi < npacks; ++vi, ++partidx) {
3015  kfs[vi] = flat_td_ptr(partidx, local_subpartidx);
3016  ri0[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 0);
3017  nrows[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 1) - ri0[vi];
3018 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3019  printf("kfs[%d] = %d;\n", vi, kfs[vi]);
3020  printf("ri0[%d] = %d;\n", vi, ri0[vi]);
3021  printf("nrows[%d] = %d;\n", vi, nrows[vi]);
3022 #endif
3023  }
3024  local_ordinal_type tr_min = 0;
3025  local_ordinal_type tr_max = nrows[0];
3026  if (local_subpartidx % 2 == 1) {
3027  tr_min -= 1;
3028  tr_max += 1;
3029  }
3030 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3031  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3032 #endif
3033  for (local_ordinal_type tr = tr_min, j = 0; tr < tr_max; ++tr) {
3034  for (local_ordinal_type e = 0; e < 3; ++e) {
3035  if (hasBlockCrsMatrix) {
3036  const impl_scalar_type *block[vector_length] = {};
3037  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3038  const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
3039 
3040  block[vi] = &A_values(Aj * blocksize_square);
3041  }
3042  const size_type pi = kps + j;
3043 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3044  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
3045 #endif
3046  ++j;
3047  for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3048  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3049  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
3050  auto &v = internal_vector_values(pi, ii, jj, 0);
3051  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3052  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
3053  }
3054  }
3055  }
3056  } else {
3057  const size_type pi = kps + j;
3058 
3059  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3060  const size_type Aj_c = A_colindsub(kfs[vi] + j);
3061 
3062  for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3063  auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr) * blocksize + ii);
3064 
3065  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3066  scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c * blocksize + jj);
3067  }
3068  }
3069  }
3070  ++j;
3071  }
3072  if (nrows[0] == 1) break;
3073  if (local_subpartidx % 2 == 0) {
3074  if (e == 1 && (tr == 0 || tr + 1 == nrows[0])) break;
3075  for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3076  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr + 1 == nrows[vi])) {
3077  npacks = vi;
3078  break;
3079  }
3080  }
3081  } else {
3082  if (e == 0 && (tr == -1 || tr == nrows[0])) break;
3083  for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3084  if (e == 0 && (nrows[vi] == 1 || tr == nrows[vi])) {
3085  npacks = vi;
3086  break;
3087  }
3088  }
3089  }
3090  }
3091  }
3092  }
3093 
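  // NOTE (editorial): team-level variant of extract(): each ThreadVectorRange
  // lane (v) owns one part of the pack, and TeamThreadRange parallelizes over
  // the rows of each little block. lbeg/lend select which of the
  // (diag, super, sub) entries exist for the first and last block row of the
  // sub-part.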
3094  KOKKOS_INLINE_FUNCTION
3095  void
3096  extract(const member_type &member,
3097  const local_ordinal_type &partidxbeg,
3098  local_ordinal_type local_subpartidx,
3099  const local_ordinal_type &npacks,
3100  const local_ordinal_type &vbeg) const {
3101 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3102  printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3103 #endif
3104  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3105  local_ordinal_type kfs_vals[internal_vector_length] = {};
3106  local_ordinal_type ri0_vals[internal_vector_length] = {};
3107  local_ordinal_type nrows_vals[internal_vector_length] = {};
3108 
3109  const size_type kps = pack_td_ptr(partidxbeg, local_subpartidx);
3110  for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3111  kfs_vals[vi] = flat_td_ptr(partidxbeg + vi, local_subpartidx);
3112  ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 0);
3113  nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 1) - ri0_vals[vi];
3114 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3115  printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3116  printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3117  printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3118 #endif
3119  }
3120 
3121  local_ordinal_type j_vals[internal_vector_length] = {};
3122 
3123  local_ordinal_type tr_min = 0;
3124  local_ordinal_type tr_max = nrows_vals[0];
3125  if (local_subpartidx % 2 == 1) {
3126  tr_min -= 1;
3127  tr_max += 1;
3128  }
3129 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3130  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3131 #endif
3132  for (local_ordinal_type tr = tr_min; tr < tr_max; ++tr) {
3133  for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3134  const local_ordinal_type nrows = nrows_vals[vi]; // (same for even and odd sub-parts)
3135  if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows + 1)) {
3136  auto &j = j_vals[vi];
3137  const local_ordinal_type kfs = kfs_vals[vi];
3138  const local_ordinal_type ri0 = ri0_vals[vi];
3139  local_ordinal_type lbeg, lend;
3140  if (local_subpartidx % 2 == 0) {
3141  lbeg = (tr == tr_min ? 1 : 0);
3142  lend = (tr == nrows - 1 ? 2 : 3);
3143  } else {
3144  lbeg = 0;
3145  lend = 3;
3146  if (tr == tr_min) {
3147  lbeg = 1;
3148  lend = 2;
3149  } else if (tr == nrows) {
3150  lbeg = 0;
3151  lend = 1;
3152  }
3153  }
3154  if (hasBlockCrsMatrix) {
3155  for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3156  const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3157  const impl_scalar_type *block = &A_values(Aj * blocksize_square);
3158  const size_type pi = kps + j;
3159 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3160  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3161 #endif
3162  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3163  [&](const local_ordinal_type &ii) {
3164  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3165  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii, jj, blocksize)]);
3166  }
3167  });
3168  }
3169  } else {
3170  for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3171  const size_type Aj_c = A_colindsub(kfs + j);
3172  const size_type pi = kps + j;
3173  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3174  [&](const local_ordinal_type &ii) {
3175  auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr) * blocksize + ii);
3176  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3177  scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c * blocksize + jj);
3178  }
3179  });
3180  }
3181  }
3182  }
3183  }
3184  }
3185  }
3186 
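  // NOTE (editorial): factorize_subline performs the in-place block LU of one
  // tridiagonal sub-line. With the (D, B, C) triple storage described above,
  // each loop iteration computes, as an illustrative summary:
  //   D_k = L_k U_k                  (LU, diagonal perturbed by 'tiny')
  //   B_k <- L_k^{-1} B_k            (Trsm: left / lower / unit)
  //   C_k <- C_k U_k^{-1}            (Trsm: right / upper / non-unit)
  //   D_{k+1} <- D_{k+1} - C_k B_k   (Gemm)
  // For a single-row part (block Jacobi) the block is inverted explicitly.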
3187  template <typename AAViewType,
3188  typename WWViewType>
3189  KOKKOS_INLINE_FUNCTION void
3190  factorize_subline(const member_type &member,
3191  const local_ordinal_type &i0,
3192  const local_ordinal_type &nrows,
3193  const local_ordinal_type &v,
3194  const AAViewType &AA,
3195  const WWViewType &WW) const {
3196  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3197 
3198  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3199  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3200 
3201  // constant
3202  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3203 
3204 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3205  printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3206 #endif
3207 
3208  // subview pattern
3209  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3210  KB::LU<member_type,
3211  default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3212 
3213  if (nrows > 1) {
3214  auto B = A;
3215  auto C = A;
3216  local_ordinal_type i = i0;
3217  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
3218 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3219  printf("tr = %d, i = %d;\n", tr, i);
3220 #endif
3221  B.assign_data(&AA(i + 1, 0, 0, v));
3222  KB::Trsm<member_type,
3223  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3224  default_mode_type, default_algo_type>::invoke(member, one, A, B);
3225  C.assign_data(&AA(i + 2, 0, 0, v));
3226  KB::Trsm<member_type,
3227  KB::Side::Right, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3228  default_mode_type, default_algo_type>::invoke(member, one, A, C);
3229  A.assign_data(&AA(i + 3, 0, 0, v));
3230 
3231  member.team_barrier();
3232  KB::Gemm<member_type,
3233  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3234  default_mode_type, default_algo_type>::invoke(member, -one, C, B, one, A);
3235  KB::LU<member_type,
3236  default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3237  }
3238  } else {
3239  // for block Jacobi, invert the diagonal block here
3240  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3241  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, A, W);
3242  KB::SetIdentity<member_type, default_mode_type>::invoke(member, A);
3243  member.team_barrier();
3244  KB::Trsm<member_type,
3245  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3246  default_mode_type, default_algo_type>::invoke(member, one, W, A);
3247  KB::Trsm<member_type,
3248  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3249  default_mode_type, default_algo_type>::invoke(member, one, W, A);
3250  }
3251  }
3252 
3253  public:
3254  struct ExtractAndFactorizeSubLineTag {};
3255  struct ExtractAndFactorizeFusedJacobiTag {};
3256  struct ExtractBCDTag {};
3257  struct ComputeETag {};
3258  struct ComputeSchurTag {};
3259  struct FactorizeSchurTag {};
3260 
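  // NOTE (editorial): index decoding shared by the tagged operators below:
  // league_rank selects a pack; packptr_sub maps it to a flat sub-part id
  // that splits as subpartidx = local_subpartidx * n_parts + partidx. i0 is
  // the first packed block of the sub-line and r0 its first packed row.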
3261  KOKKOS_INLINE_FUNCTION
3262  void
3263  operator()(const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3264  // btdm is packed, with parts sorted from largest to smallest
3265  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3266 
3267  const local_ordinal_type subpartidx = packptr_sub(packidx);
3268  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3269  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3270  const local_ordinal_type partidx = subpartidx % n_parts;
3271 
3272  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3273  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3274  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3275 
3276  internal_vector_scratch_type_3d_view
3277  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3278 
3279 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3280  printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3281  printf("vector_loop_size = %d\n", vector_loop_size);
3282 #endif
3283 
3284  if (vector_loop_size == 1) {
3285  extract(partidx, local_subpartidx, npacks);
3286  factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3287  } else {
3288  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3289  [&](const local_ordinal_type &v) {
3290  const local_ordinal_type vbeg = v * internal_vector_length;
3291 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3292  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3293 #endif
3294  if (vbeg < npacks)
3295  extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3296  // this is not safe if the vector loop size differs from the vector size of
3297  // the team policy; we always ensure they match when constructing the team policy
3298  member.team_barrier();
3299  factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3300  });
3301  }
3302  }
3303 
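  // NOTE (editorial): fused block Jacobi path. Each vector lane pulls one
  // diagonal block of A into scratch (W1), LU-factorizes it, and applies two
  // triangular solves to the identity (W2), so that W2 = U^{-1} L^{-1} I and
  // d_inv receives the explicit block inverse. Lanes past the last row
  // factorize an identity block instead, since KB::LU uses team barriers and
  // every lane must participate.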
3304  KOKKOS_INLINE_FUNCTION
3305  void
3306  operator()(const ExtractAndFactorizeFusedJacobiTag &, const member_type &member) const {
3307  using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3308  using default_mode_type = typename default_mode_and_algo_type::mode_type;
3309  using default_algo_type = typename default_mode_and_algo_type::algo_type;
3310  // When fused block Jacobi can be used, the mapping between local rows and parts is trivial (i <-> i)
3311  // We can simply pull the diagonal entry from A into d_inv
3312  btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3313  btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3314  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3315  const local_ordinal_type nrows = lclrow.extent(0);
3316  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, half_vector_length),
3317  [&](const local_ordinal_type &v) {
3318  local_ordinal_type row = member.league_rank() * half_vector_length + v;
3319  // diag_offsets(row) gives the offset of this row's diagonal block within A_values
3320  auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3321  auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3322  if (row < nrows) {
3323  // View the diagonal block of A in row as 2D row-major
3324  const impl_scalar_type *A_diag = A_values.data() + diag_offsets(row);
3325  // Copy the diag into scratch slice W1
3326  // (copying elements directly is better than KokkosBatched copy)
3327  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3328  [&](int i) {
3329  W1.data()[i] = A_diag[i];
3330  });
3331  // and set W2 to identity in preparation to invert with 2 x Trsm
3332  KB::SetIdentity<member_type, default_mode_type>::invoke(member, W2);
3333  } else {
3334  // if this vector lane has no block to invert, then set W1 to identity
3335  // so that LU still has a matrix to work on. LU uses team barriers so
3336  // having some lanes run it and some not will deadlock.
3337  KB::SetIdentity<member_type, default_mode_type>::invoke(member, W1);
3338  }
3339  member.team_barrier();
3340  // LU factorize in-place
3341  KB::LU<member_type, default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, W1, tiny);
3342  member.team_barrier();
3343  KB::Trsm<member_type,
3344  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3345  default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3346  KB::Trsm<member_type,
3347  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3348  default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3349  member.team_barrier();
3350  if (row < nrows) {
3351  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3352  [&](int i) {
3353  auto d_inv_block = &d_inv(row, 0, 0);
3354  d_inv_block[i] = W2.data()[i];
3355  });
3356  }
3357  });
3358  }
3359 
3360  KOKKOS_INLINE_FUNCTION
3361  void
3362  operator()(const ExtractBCDTag &, const member_type &member) const {
3363  // btdm is packed, with parts sorted from largest to smallest
3364  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3365  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3366  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3367 
3368  const local_ordinal_type subpartidx = packptr_sub(packidx);
3369  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3370  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3371  const local_ordinal_type partidx = subpartidx % n_parts;
3372 
3373  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3374  // const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3375  // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3376 
3377  if (vector_loop_size == 1) {
3378  extract(partidx, local_subpartidx, npacks);
3379  } else {
3380  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3381  [&](const local_ordinal_type &v) {
3382  const local_ordinal_type vbeg = v * internal_vector_length;
3383 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3384  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3385  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3386 #endif
3387  if (vbeg < npacks)
3388  extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3389  });
3390  }
3391 
3392  member.team_barrier();
3393 
3394  const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3395  const size_type kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 1;
3396 
3397  const local_ordinal_type r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3398  const local_ordinal_type r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3399 
3400 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3401  printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3402 #endif
3403 
3404  // Need to copy D to e_internal_vector_values.
3405  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3406  Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3407 
3408  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3409  Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3410  }
3411 
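  // NOTE (editorial): ComputeETag solves the factored sub-line against the
  // coupling blocks copied out by ExtractBCDTag, producing the E blocks of
  // the Schur complement (num_vectors = blocksize right-hand sides). The
  // first and last sub-parts couple to only one interface, hence the single
  // solve in those two branches.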
3412  KOKKOS_INLINE_FUNCTION
3413  void
3414  operator()(const ComputeETag &, const member_type &member) const {
3415  // btdm is packed, with parts sorted from largest to smallest
3416  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3417 
3418  const local_ordinal_type subpartidx = packptr_sub(packidx);
3419  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3420  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3421  const local_ordinal_type partidx = subpartidx % n_parts;
3422 
3423  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3424  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3425  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
3426  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3427  const local_ordinal_type num_vectors = blocksize;
3428 
3429  (void)npacks;
3430 
3431  internal_vector_scratch_type_3d_view
3432  WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3433  if (local_subpartidx == 0) {
3434  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3435  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3436  });
3437  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
3438  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3439  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3440  });
3441  } else {
3442  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3443  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3444  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3445  });
3446  }
3447  }
3448 
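  // NOTE (editorial): ComputeSchurTag forms the interface Schur complement
  // S = D - C E. The D blocks are first copied into
  // internal_vector_values_schur; C comes from the tridiagonal and E from the
  // ComputeETag solve above. The case tables over i / e_r / e_c / c_kps below
  // pair each Schur block with the matching C and packed-E entries.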
3449  KOKKOS_INLINE_FUNCTION
3450  void
3451  operator()(const ComputeSchurTag &, const member_type &member) const {
3452  // btdm is packed, with parts sorted from largest to smallest
3453  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3454  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3455  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3456 
3457  const local_ordinal_type subpartidx = packptr_sub(packidx);
3458  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3459  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3460  const local_ordinal_type partidx = subpartidx % n_parts;
3461 
3462  // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3463  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3464  // const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3465  // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3466 
3467  // Compute S = D - C E
3468 
3469  const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
3470  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
3471  const local_ordinal_type i0_offset = i0 + 2; // same offset whether or not local_subpartidx_schur == 0
3472 
3473  for (local_ordinal_type i = 0; i < 4; ++i) { // pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
3474  copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3475  Kokkos::subview(internal_vector_values, i0_offset + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3476  }
3477 
3478  member.team_barrier();
3479 
3480  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3481 
3482  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx) + 1;
3483  const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 2;
3484 
3485  const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3486  const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3487 
3488  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3489 
3490  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3491  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3492 
3493  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3494  for (size_type i = 0; i < pack_td_ptr_schur(partidx, local_subpartidx_schur + 1) - pack_td_ptr_schur(partidx, local_subpartidx_schur); ++i) {
3495  local_ordinal_type e_r, e_c, c_kps;
3496 
3497  if (local_subpartidx_schur == 0) {
3498  if (i == 0) {
3499  e_r = e_r1;
3500  e_c = 0;
3501  c_kps = c_kps1;
3502  } else if (i == 3) {
3503  e_r = e_r2;
3504  e_c = 1;
3505  c_kps = c_kps2;
3506  } else if (i == 4) {
3507  e_r = e_r2;
3508  e_c = 0;
3509  c_kps = c_kps2;
3510  } else {
3511  continue;
3512  }
3513  } else {
3514  if (i == 0) {
3515  e_r = e_r1;
3516  e_c = 1;
3517  c_kps = c_kps1;
3518  } else if (i == 1) {
3519  e_r = e_r1;
3520  e_c = 0;
3521  c_kps = c_kps1;
3522  } else if (i == 4) {
3523  e_r = e_r2;
3524  e_c = 1;
3525  c_kps = c_kps2;
3526  } else if (i == 5) {
3527  e_r = e_r2;
3528  e_c = 0;
3529  c_kps = c_kps2;
3530  } else {
3531  continue;
3532  }
3533  }
3534 
3535  auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx, local_subpartidx_schur) + i, Kokkos::ALL(), Kokkos::ALL(), v);
3536  auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3537  auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3538  KB::Gemm<member_type,
3539  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3540  default_mode_type, default_algo_type>::invoke(member, -one, C, E, one, S);
3541  }
3542  });
3543  }
3544 
3545  KOKKOS_INLINE_FUNCTION
3546  void
3547  operator()(const FactorizeSchurTag &, const member_type &member) const {
3548  const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3549 
3550  const local_ordinal_type subpartidx = packptr_sub(packidx);
3551 
3552  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3553  const local_ordinal_type partidx = subpartidx % n_parts;
3554 
3555  const local_ordinal_type i0 = pack_td_ptr_schur(partidx, 0);
3556  const local_ordinal_type nrows = 2 * (pack_td_ptr_schur.extent(1) - 1);
3557 
3558  internal_vector_scratch_type_3d_view
3559  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3560 
3561 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3562  printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3563 #endif
3564 
3565  if (vector_loop_size == 1) {
3566  factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3567  } else {
3568  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3569  [&](const local_ordinal_type &v) {
3570  factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3571  });
3572  }
3573  }
3574 
3575  void run() {
3576  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3577  const local_ordinal_type team_size =
3578  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3579  recommended_team_size(blocksize, vector_length, internal_vector_length);
3580  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3581  shmem_size(blocksize, blocksize, vector_loop_size);
3582 
3583  {
3584 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3585  printf("Start ExtractAndFactorizeSubLineTag\n");
3586 #endif
3587  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3588  Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeSubLineTag>
3589  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3590 
3591  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3592  writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3593 
3594  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3595  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3596  policy, *this);
3597  execution_space().fence();
3598 
3599  writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3600 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3601  printf("End ExtractAndFactorizeSubLineTag\n");
3602 #endif
3603  }
3604 
3605  if (packindices_schur.extent(1) > 0) {
3606  {
3607 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3608  printf("Start ExtractBCDTag\n");
3609 #endif
3610  Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3611  Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3612 
3613  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3614 
3615  {
3616  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3617  Kokkos::TeamPolicy<execution_space, ExtractBCDTag>
3618  policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3619 
3620  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3621  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3622  policy, *this);
3623  execution_space().fence();
3624  }
3625 
3626 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3627  printf("End ExtractBCDTag\n");
3628 #endif
3629  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3630 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3631  printf("Start ComputeETag\n");
3632 #endif
3633  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3634  {
3635  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3636  Kokkos::TeamPolicy<execution_space, ComputeETag>
3637  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3638 
3639  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3640  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3641  policy, *this);
3642  execution_space().fence();
3643  }
3644  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3645 
3646 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3647  printf("End ComputeETag\n");
3648 #endif
3649  }
3650 
3651  {
3652 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3653  printf("Start ComputeSchurTag\n");
3654 #endif
3655  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3656  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3657  Kokkos::TeamPolicy<execution_space, ComputeSchurTag>
3658  policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3659 
3660  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3661  policy, *this);
3662  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3663  execution_space().fence();
3664 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3665  printf("End ComputeSchurTag\n");
3666 #endif
3667  }
3668 
3669  {
3670 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3671  printf("Start FactorizeSchurTag\n");
3672 #endif
3673  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3674  Kokkos::TeamPolicy<execution_space, FactorizeSchurTag>
3675  policy(packindices_schur.extent(0), team_size, vector_loop_size);
3676  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3677  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3678  policy, *this);
3679  execution_space().fence();
3680  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3681 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3682  printf("End FactorizeSchurTag\n");
3683 #endif
3684  }
3685  }
3686 
3687  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3688  }
3689 
3690  void run_fused_jacobi() {
3691  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3692  const local_ordinal_type team_size =
3693  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3694  recommended_team_size(blocksize, half_vector_length, 1);
3695  const local_ordinal_type per_team_scratch =
3696  btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3697  {
3698  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3699  Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3700  policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3701 
3702  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3703  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3704  policy, *this);
3705  }
3706  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3707  }
3708 };
3709 
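3710 // performNumericPhase: entry point of the numeric phase. It sizes the
3711 // per-team scratch requirement and runs ExtractAndFactorizeTridiags at
3712 // scratch level 0 when it fits, falling back to level 1 otherwise.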
3713 template <typename MatrixType>
3714 void performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3715  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3716  const BlockHelperDetails::PartInterface<MatrixType> &interf,
3717  BlockTridiags<MatrixType> &btdm,
3718  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3719  bool use_fused_jacobi) {
3720  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3721  using execution_space = typename impl_type::execution_space;
3722  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3723  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3724  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3725 
3726  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3727 
3728  int blocksize = btdm.values.extent(1);
3729  // Both the Kokkos policy vector length and the SIMD type vector length are hardcoded in KokkosBatched.
3730  // For large block sizes, we have to fall back to level 1 scratch.
3731  int scratch_required;
3732  if (!use_fused_jacobi) {
3733  // General path scratch requirement
3734  scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3735  } else {
3736  // Block Jacobi scratch requirement: measured in scalars, and uses twice as much (in bytes) per vector lane as the general path.
3737  scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3738  }
3739 
3740  int max_scratch = team_policy_type::scratch_size_max(0);
3741 
3742  if (scratch_required < max_scratch) {
3743  // Can use level 0 scratch
3744  ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3745  if (!use_fused_jacobi)
3746  function.run();
3747  else
3748  function.run_fused_jacobi();
3749  } else {
3750  // Not enough level 0 scratch, so fall back to level 1
3751  ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3752  if (!use_fused_jacobi)
3753  function.run();
3754  else
3755  function.run_fused_jacobi();
3756  }
3757  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3758 }
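// --- Editorial sketch (not part of the original source): the level-0 vs.
// level-1 scratch decision made by performNumericPhase() above, reduced to a
// minimal self-contained Kokkos example. The functor and the name
// launch_with_scratch are hypothetical; the Kokkos calls (shmem_size,
// scratch_size_max, set_scratch_size) are the same ones used above.
#if 0
#include <Kokkos_Core.hpp>

template <typename Functor>
void launch_with_scratch(int league_size, int team_size, int blocksize,
                         const Functor &f) {
  using exec_space   = Kokkos::DefaultExecutionSpace;
  using policy_type  = Kokkos::TeamPolicy<exec_space>;
  using scratch_view = Kokkos::View<double **,
                                    typename exec_space::scratch_memory_space,
                                    Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
  // Per-team workspace: one blocksize x blocksize block, as in the solver.
  const size_t per_team = scratch_view::shmem_size(blocksize, blocksize);
  // Level 0 is the fast on-chip scratch but has a hard size limit; fall back
  // to level 1 (global-memory scratch) when the per-team request does not fit.
  const int level =
      per_team < size_t(policy_type::scratch_size_max(0)) ? 0 : 1;
  policy_type policy(league_size, team_size);
  policy.set_scratch_size(level, Kokkos::PerTeam(per_team));
  Kokkos::parallel_for("launch_with_scratch", policy, f);
}
#endif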
3759 
3763 template <typename MatrixType>
3764 class MultiVectorConverter {
3765  public:
3766  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3767  using execution_space = typename impl_type::execution_space;
3768  using memory_space = typename impl_type::memory_space;
3769 
3770  using local_ordinal_type = typename impl_type::local_ordinal_type;
3771  using impl_scalar_type = typename impl_type::impl_scalar_type;
3772  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3773  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3774  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3775  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3776  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3777  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3778  static constexpr int vector_length = impl_type::vector_length;
3779 
3780  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3781 
3782  private:
3783  // part interface
3784  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3785  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3786  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3787  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3788  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3789  const local_ordinal_type blocksize;
3790  const local_ordinal_type num_vectors;
3791 
3792  // packed multivector output (or input)
3793  vector_type_3d_view packed_multivector;
3794  const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3795 
3796  template <typename TagType>
3797  KOKKOS_INLINE_FUNCTION void copy_multivectors(const local_ordinal_type &j,
3798  const local_ordinal_type &vi,
3799  const local_ordinal_type &pri,
3800  const local_ordinal_type &ri0) const {
3801  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3802  for (local_ordinal_type i = 0; i < blocksize; ++i)
3803  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3804  }
3805 
3806  public:
3807  MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3808  const vector_type_3d_view &pmv)
3809  : partptr(interf.partptr)
3810  , packptr(interf.packptr)
3811  , part2packrowidx0(interf.part2packrowidx0)
3812  , part2rowidx0(interf.part2rowidx0)
3813  , lclrow(interf.lclrow)
3814  , blocksize(pmv.extent(1))
3815  , num_vectors(pmv.extent(2))
3816  , packed_multivector(pmv) {}
3817 
3818  // TODO: modify this routine similarly to the team-level functions
3819  KOKKOS_INLINE_FUNCTION
3820  void
3821  operator()(const local_ordinal_type &packidx) const {
3822  local_ordinal_type partidx = packptr(packidx);
3823  local_ordinal_type npacks = packptr(packidx + 1) - partidx;
3824  const local_ordinal_type pri0 = part2packrowidx0(partidx);
3825 
3826  local_ordinal_type ri0[vector_length] = {};
3827  local_ordinal_type nrows[vector_length] = {};
3828  for (local_ordinal_type v = 0; v < npacks; ++v, ++partidx) {
3829  ri0[v] = part2rowidx0(partidx);
3830  nrows[v] = part2rowidx0(partidx + 1) - ri0[v];
3831  }
3832  for (local_ordinal_type j = 0; j < nrows[0]; ++j) {
3833  local_ordinal_type cnt = 1;
3834  for (; cnt < npacks && j != nrows[cnt]; ++cnt)
3835  ;
3836  npacks = cnt;
3837  const local_ordinal_type pri = pri0 + j;
3838  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3839  for (local_ordinal_type i = 0; i < blocksize; ++i)
3840  for (local_ordinal_type v = 0; v < npacks; ++v)
3841  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0[v] + j) + i, col));
3842  }
3843  }
3844 
3845  KOKKOS_INLINE_FUNCTION
3846  void
3847  operator()(const member_type &member) const {
3848  const local_ordinal_type packidx = member.league_rank();
3849  const local_ordinal_type partidx_begin = packptr(packidx);
3850  const local_ordinal_type npacks = packptr(packidx + 1) - partidx_begin;
3851  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3852  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3853  const local_ordinal_type partidx = partidx_begin + v;
3854  const local_ordinal_type ri0 = part2rowidx0(partidx);
3855  const local_ordinal_type nrows = part2rowidx0(partidx + 1) - ri0;
3856 
3857  if (nrows == 1) {
3858  const local_ordinal_type pri = pri0;
3859  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
3860  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3861  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0) + i, col));
3862  });
3863  }
3864  } else {
3865  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3866  const local_ordinal_type pri = pri0 + j;
3867  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3868  for (local_ordinal_type i = 0; i < blocksize; ++i)
3869  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3870  });
3871  }
3872  });
3873  }
3874 
3875  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3876  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3877  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3878 
3879  scalar_multivector = scalar_multivector_;
3880  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3881  const local_ordinal_type vl = vector_length;
3882  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3883  Kokkos::parallel_for("MultiVectorConverter::TeamPolicy", policy, *this);
3884  } else {
3885  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3886  Kokkos::parallel_for("MultiVectorConverter::RangePolicy", policy, *this);
3887  }
3888  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3889  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3890  }
3891 };
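// --- Editorial sketch (not part of the original source): the interleaved
// pack layout MultiVectorConverter produces. Sizes and names here are
// hypothetical: vector_length = 2 lanes, blocksize = 2, one column, and two
// parts whose first block rows are ri0 = {0, 3}; lclrow is the identity.
#if 0
#include <array>
#include <cstdio>

int main() {
  const int blocksize = 2, nrows = 3;
  double scalar_mv[12];  // flat layout: (block row * blocksize + i)
  for (int k = 0; k < 12; ++k) scalar_mv[k] = k;
  std::array<double, 2> packed[3][2];  // packed[pri][i][lane]
  const int ri0[2] = {0, 3};           // first block row of each part in the pack
  for (int j = 0; j < nrows; ++j)
    for (int i = 0; i < blocksize; ++i)
      for (int v = 0; v < 2; ++v)  // lane v holds part v of the pack
        packed[j][i][v] = scalar_mv[blocksize * (ri0[v] + j) + i];
  std::printf("%g %g\n", packed[1][0][0], packed[1][0][1]);  // prints: 2 8
  return 0;
}
#endif
// When the parts in a pack have different lengths, the scalar operator()
// above shrinks npacks as soon as a part runs out of rows, so shorter parts
// simply stop contributing lanes.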
3892 
3896 
3897 template <>
3898 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3899  typedef KB::Mode::Serial mode_type;
3900  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3901 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3902  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3903 #else
3904  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3905 #endif
3906  static int recommended_team_size(const int /* blksize */,
3907  const int /* vector_length */,
3908  const int /* internal_vector_length */) {
3909  return 1;
3910  }
3911 };
3912 
3913 #if defined(KOKKOS_ENABLE_CUDA)
3914 static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
3915  const int vector_length,
3916  const int internal_vector_length) {
3917  const int vector_size = vector_length / internal_vector_length;
3918  int total_team_size(0);
3919  if (blksize <= 5)
3920  total_team_size = 32;
3921  else if (blksize <= 9)
3922  total_team_size = 32; // 64
3923  else if (blksize <= 12)
3924  total_team_size = 96;
3925  else if (blksize <= 16)
3926  total_team_size = 128;
3929  else
3930  total_team_size = 160;
3931  return total_team_size / vector_size;
3932 }
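// Editorial worked example (hypothetical sizes): with vector_length = 8 and
// internal_vector_length = 4, vector_size = 8 / 4 = 2. A blocksize of 12
// selects total_team_size = 96, so the recommended team size is 96 / 2 = 48;
// team size times vector size then stays a multiple of the 32-thread warp.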
3933 
3934 template <>
3935 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3936  typedef KB::Mode::Team mode_type;
3937  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3938  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3939  static int recommended_team_size(const int blksize,
3940  const int vector_length,
3941  const int internal_vector_length) {
3942  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3943  }
3944 };
3945 template <>
3946 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3947  typedef KB::Mode::Team mode_type;
3948  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3949  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3950  static int recommended_team_size(const int blksize,
3951  const int vector_length,
3952  const int internal_vector_length) {
3953  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3954  }
3955 };
3956 #endif
3957 
3958 #if defined(KOKKOS_ENABLE_HIP)
3959 static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
3960  const int vector_length,
3961  const int internal_vector_length) {
3962  const int vector_size = vector_length / internal_vector_length;
3963  int total_team_size(0);
3964  if (blksize <= 5)
3965  total_team_size = 32;
3966  else if (blksize <= 9)
3967  total_team_size = 32; // 64
3968  else if (blksize <= 12)
3969  total_team_size = 96;
3970  else if (blksize <= 16)
3971  total_team_size = 128;
3974  else
3975  total_team_size = 160;
3976  return total_team_size / vector_size;
3977 }
3978 
3979 template <>
3980 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3981  typedef KB::Mode::Team mode_type;
3982  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3983  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3984  static int recommended_team_size(const int blksize,
3985  const int vector_length,
3986  const int internal_vector_length) {
3987  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3988  }
3989 };
3990 template <>
3991 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3992  typedef KB::Mode::Team mode_type;
3993  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3994  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3995  static int recommended_team_size(const int blksize,
3996  const int vector_length,
3997  const int internal_vector_length) {
3998  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3999  }
4000 };
4001 #endif
4002 
4003 #if defined(KOKKOS_ENABLE_SYCL)
4004 static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
4005  const int vector_length,
4006  const int internal_vector_length) {
4007  const int vector_size = vector_length / internal_vector_length;
4008  int total_team_size(0);
4009  if (blksize <= 5)
4010  total_team_size = 32;
4011  else if (blksize <= 9)
4012  total_team_size = 32; // 64
4013  else if (blksize <= 12)
4014  total_team_size = 96;
4015  else if (blksize <= 16)
4016  total_team_size = 128;
4019  else
4020  total_team_size = 160;
4021  return total_team_size / vector_size;
4022 }
4023 
4024 template <>
4025 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
4026  typedef KB::Mode::Team mode_type;
4027  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4028  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4029  static int recommended_team_size(const int blksize,
4030  const int vector_length,
4031  const int internal_vector_length) {
4032  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4033  }
4034 };
4035 template <>
4036 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
4037  typedef KB::Mode::Team mode_type;
4038  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4039  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4040  static int recommended_team_size(const int blksize,
4041  const int vector_length,
4042  const int internal_vector_length) {
4043  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4044  }
4045 };
4046 #endif
4047 
4048 template <typename MatrixType>
4049 struct SolveTridiags {
4050  public:
4051  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4052  using execution_space = typename impl_type::execution_space;
4053 
4054  using local_ordinal_type = typename impl_type::local_ordinal_type;
4055  using size_type = typename impl_type::size_type;
4056  using impl_scalar_type = typename impl_type::impl_scalar_type;
4057  using magnitude_type = typename impl_type::magnitude_type;
4058  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
4059  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
4061  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4062  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
4063  using size_type_2d_view = typename impl_type::size_type_2d_view;
4065  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4066  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
4067  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
4068  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
4069 
4070  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4071 
4072  using internal_vector_type = typename impl_type::internal_vector_type;
4073  static constexpr int vector_length = impl_type::vector_length;
4074  static constexpr int internal_vector_length = impl_type::internal_vector_length;
4075 
4077  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4078  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
4079 
4081  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4082  using member_type = typename team_policy_type::member_type;
4083 
4084  private:
4085  // part interface
4086  local_ordinal_type n_subparts_per_part;
4087  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4088  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4089  const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4090  const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4091  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4092  const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4093  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4094  const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4095 
4096  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4097  const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4098 
4099  // block tridiags
4100  const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4101 
4102  // block tridiags values
4103  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4104  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4105  const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4106 
4107  internal_vector_type_4d_view X_internal_vector_values_schur;
4108 
4109  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4110  const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4111 
4112  const local_ordinal_type vector_loop_size;
4113 
4114  // copy to multivectors : damping factor and Y_scalar_multivector
4115  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
4116 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4117  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4118 #else
4119  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4120 #endif
4121  const impl_scalar_type df;
4122  const bool compute_diff;
4123 
4124  public:
4125  SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
4126  const BlockTridiags<MatrixType> &btdm,
4127  const vector_type_3d_view &pmv,
4128  const impl_scalar_type damping_factor,
4129  const bool is_norm_manager_active)
4130  : // interface
4131  n_subparts_per_part(interf.n_subparts_per_part)
4132  , partptr(interf.partptr)
4133  , packptr(interf.packptr)
4134  , packindices_sub(interf.packindices_sub)
4135  , packindices_schur(interf.packindices_schur)
4136  , part2packrowidx0(interf.part2packrowidx0)
4137  , part2packrowidx0_sub(interf.part2packrowidx0_sub)
4138  , lclrow(interf.lclrow)
4139  , packptr_sub(interf.packptr_sub)
4140  , partptr_sub(interf.partptr_sub)
4141  , pack_td_ptr_schur(btdm.pack_td_ptr_schur)
4142  ,
4143  // block tridiags and multivector
4144  pack_td_ptr(btdm.pack_td_ptr)
4145  , D_internal_vector_values((internal_vector_type *)btdm.values.data(),
4146  btdm.values.extent(0),
4147  btdm.values.extent(1),
4148  btdm.values.extent(2),
4149  vector_length / internal_vector_length)
4150  , X_internal_vector_values((internal_vector_type *)pmv.data(),
4151  pmv.extent(0),
4152  pmv.extent(1),
4153  pmv.extent(2),
4154  vector_length / internal_vector_length)
4155  , X_internal_scalar_values((btdm_scalar_type *)pmv.data(),
4156  pmv.extent(0),
4157  pmv.extent(1),
4158  pmv.extent(2),
4159  vector_length)
4160  , X_internal_vector_values_schur(do_not_initialize_tag("X_internal_vector_values_schur"),
4161  2 * (n_subparts_per_part - 1) * part2packrowidx0_sub.extent(0),
4162  pmv.extent(1),
4163  pmv.extent(2),
4164  vector_length / internal_vector_length)
4165  , D_internal_vector_values_schur((internal_vector_type *)btdm.values_schur.data(),
4166  btdm.values_schur.extent(0),
4167  btdm.values_schur.extent(1),
4168  btdm.values_schur.extent(2),
4169  vector_length / internal_vector_length)
4170  , e_internal_vector_values((internal_vector_type *)btdm.e_values.data(),
4171  btdm.e_values.extent(0),
4172  btdm.e_values.extent(1),
4173  btdm.e_values.extent(2),
4174  btdm.e_values.extent(3),
4175  vector_length / internal_vector_length)
4176  , vector_loop_size(vector_length / internal_vector_length)
4177  , Y_scalar_multivector()
4178  , Z_scalar_vector()
4179  , df(damping_factor)
4180  , compute_diff(is_norm_manager_active) {}
4181 
4182  public:
4184  KOKKOS_INLINE_FUNCTION
4185  void
4186  copyToFlatMultiVector(const member_type &member,
4187  const local_ordinal_type partidxbeg, // partidx for v = 0
4188  const local_ordinal_type npacks,
4189  const local_ordinal_type pri0,
4190  const local_ordinal_type v, // index with a loop of vector_loop_size
4191  const local_ordinal_type blocksize,
4192  const local_ordinal_type num_vectors) const {
4193  const local_ordinal_type vbeg = v * internal_vector_length;
4194  if (vbeg < npacks) {
4195  local_ordinal_type ri0_vals[internal_vector_length] = {};
4196  local_ordinal_type nrows_vals[internal_vector_length] = {};
4197  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4198  const local_ordinal_type partidx = partidxbeg + vv;
4199  ri0_vals[vi] = partptr(partidx);
4200  nrows_vals[vi] = partptr(partidx + 1) - ri0_vals[vi];
4201  }
4202 
4203  impl_scalar_type z_partial_sum(0);
4204  if (nrows_vals[0] == 1) {
4205  const local_ordinal_type j = 0, pri = pri0;
4206  {
4207  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4208  const local_ordinal_type ri0 = ri0_vals[vi];
4209  const local_ordinal_type nrows = nrows_vals[vi];
4210  if (j < nrows) {
4211  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
4212  [&](const local_ordinal_type &i) {
4213  const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4214  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4215  impl_scalar_type &y = Y_scalar_multivector(row, col);
4216  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4217  y += df * yd;
4218 
4219  { // if (compute_diff) {
4220  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4221  z_partial_sum += yd_abs * yd_abs;
4222  }
4223  }
4224  });
4225  }
4226  }
4227  }
4228  } else {
4229  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows_vals[0]),
4230  [&](const local_ordinal_type &j) {
4231  const local_ordinal_type pri = pri0 + j;
4232  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4233  const local_ordinal_type ri0 = ri0_vals[vi];
4234  const local_ordinal_type nrows = nrows_vals[vi];
4235  if (j < nrows) {
4236  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4237  for (local_ordinal_type i = 0; i < blocksize; ++i) {
4238  const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4239  impl_scalar_type &y = Y_scalar_multivector(row, col);
4240  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4241  y += df * yd;
4242 
4243  { // if (compute_diff) {
4244  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4245  z_partial_sum += yd_abs * yd_abs;
4246  }
4247  }
4248  }
4249  }
4250  }
4251  });
4252  }
4253  // if (compute_diff)
4254  Z_scalar_vector(member.league_rank()) += z_partial_sum;
4255  }
4256  }
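// Editorial note: the update applied above is the damped (Jacobi/Richardson)
// correction
//   y := y + df * (x_packed - y),
// and each team accumulates the squared correction norm
//   z(league_rank) += sum_{rows,cols} |x_packed - y|^2
// so that the norm manager can test sweep-to-sweep convergence without a
// second pass over the data.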
4257 
4261  template <typename WWViewType>
4262  KOKKOS_INLINE_FUNCTION void
4263  solveSingleVector(const member_type &member,
4264  const local_ordinal_type &blocksize,
4265  const local_ordinal_type &i0,
4266  const local_ordinal_type &r0,
4267  const local_ordinal_type &nrows,
4268  const local_ordinal_type &v,
4269  const WWViewType &WW) const {
4270  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4271 
4272  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4273  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4274 
4275  // base pointers
4276  auto A = D_internal_vector_values.data();
4277  auto X = X_internal_vector_values.data();
4278 
4279  // constant
4280  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4281  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4282  // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4283 
4284  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4285  const local_ordinal_type astep = D_internal_vector_values.stride(0);
4286  const local_ordinal_type as0 = D_internal_vector_values.stride(1); // blocksize*vector_length;
4287  const local_ordinal_type as1 = D_internal_vector_values.stride(2); // vector_length;
4288  const local_ordinal_type xstep = X_internal_vector_values.stride(0);
4289  const local_ordinal_type xs0 = X_internal_vector_values.stride(1); // vector_length;
4290 
4291  // move to starting point
4292  A += i0 * astep + v;
4293  X += r0 * xstep + v;
4294 
4295  // for (local_ordinal_type col=0;col<num_vectors;++col)
4296  if (nrows > 1) {
4297  // solve Lx = x
4298  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4299  member,
4300  KB::Diag::Unit,
4301  blocksize, blocksize,
4302  one,
4303  A, as0, as1,
4304  X, xs0);
4305 
4306  for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
4307  member.team_barrier();
4308  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4309  member,
4310  blocksize, blocksize,
4311  -one,
4312  A + 2 * astep, as0, as1,
4313  X, xs0,
4314  one,
4315  X + 1 * xstep, xs0);
4316  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4317  member,
4318  KB::Diag::Unit,
4319  blocksize, blocksize,
4320  one,
4321  A + 3 * astep, as0, as1,
4322  X + 1 * xstep, xs0);
4323 
4324  A += 3 * astep;
4325  X += 1 * xstep;
4326  }
4327 
4328  // solve Ux = x
4329  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4330  member,
4331  KB::Diag::NonUnit,
4332  blocksize, blocksize,
4333  one,
4334  A, as0, as1,
4335  X, xs0);
4336 
4337  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4338  A -= 3 * astep;
4339  member.team_barrier();
4340  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4341  member,
4342  blocksize, blocksize,
4343  -one,
4344  A + 1 * astep, as0, as1,
4345  X, xs0,
4346  one,
4347  X - 1 * xstep, xs0);
4348  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4349  member,
4350  KB::Diag::NonUnit,
4351  blocksize, blocksize,
4352  one,
4353  A, as0, as1,
4354  X - 1 * xstep, xs0);
4355  X -= 1 * xstep;
4356  }
4357  // for multiple rhs
4358  // X += xs1;
4359  } else {
4360  const local_ordinal_type ws0 = WW.stride(0);
4361  auto W = WW.data() + v;
4362  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
4363  member, blocksize, X, xs0, W, ws0);
4364  member.team_barrier();
4365  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4366  member,
4367  blocksize, blocksize,
4368  one,
4369  A, as0, as1,
4370  W, xs0,
4371  zero,
4372  X, xs0);
4373  }
4374  }
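// --- Editorial sketch (not part of the original source): the two sweeps that
// solveSingleVector() performs blockwise, written out for a 1x1 "blocksize"
// so the packed layout [d0, u0, l1, d1, u1, l2, d2, ...] and the recurrences
// are easy to see. Names are hypothetical; A is assumed to hold the already
// LU-factored pivots d_k and the elimination multipliers l_k, as the blocks
// above do after the factorization phase.
#if 0
void solve_tridiag_scalar(int n, const double *A /* 3n-2 packed entries */,
                          double *x /* rhs in, solution out */) {
  // forward sweep (unit lower factor): x_k -= l_k * x_{k-1}
  for (int k = 1; k < n; ++k) x[k] -= A[3 * k - 1] * x[k - 1];
  // backward sweep (upper factor): x_k = (x_k - u_k * x_{k+1}) / d_k
  x[n - 1] /= A[3 * (n - 1)];
  for (int k = n - 2; k >= 0; --k)
    x[k] = (x[k] - A[3 * k + 1] * x[k + 1]) / A[3 * k];
}
#endif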
4375 
4376  template <typename WWViewType>
4377  KOKKOS_INLINE_FUNCTION void
4378  solveMultiVector(const member_type &member,
4379  const local_ordinal_type & /* blocksize */,
4380  const local_ordinal_type &i0,
4381  const local_ordinal_type &r0,
4382  const local_ordinal_type &nrows,
4383  const local_ordinal_type &v,
4384  const WWViewType &WW) const {
4385  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4386 
4387  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4388  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4389 
4390  // constant
4391  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4392  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4393 
4394  // subview pattern
4395  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4396  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4397  auto X2 = X1;
4398 
4399  local_ordinal_type i = i0, r = r0;
4400 
4401  if (nrows > 1) {
4402  // solve Lx = x
4403  KB::Trsm<member_type,
4404  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4405  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4406  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
4407  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
4408  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
4409  member.team_barrier();
4410  KB::Gemm<member_type,
4411  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4412  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4413  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
4414  KB::Trsm<member_type,
4415  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4416  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4417  X1.assign_data(X2.data());
4418  }
4419 
4420  // solve Ux = x
4421  KB::Trsm<member_type,
4422  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4423  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4424  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4425  i -= 3;
4426  A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
4427  X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
4428  member.team_barrier();
4429  KB::Gemm<member_type,
4430  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4431  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4432 
4433  A.assign_data(&D_internal_vector_values(i, 0, 0, v));
4434  KB::Trsm<member_type,
4435  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4436  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4437  X1.assign_data(X2.data());
4438  }
4439  } else {
4440  // matrix is already inverted
4441  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4442  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
4443  member.team_barrier();
4444  KB::Gemm<member_type,
4445  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4446  default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
4447  }
4448  }
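// Editorial design note: solveSingleVector() above drives level-2 kernels
// (Trsv/Gemv) on raw pointers with explicit strides, while solveMultiVector()
// switches to level-3 kernels (Trsm/Gemm) on subviews: with several
// right-hand sides each triangular solve touches a blocksize x num_vectors
// panel at once, as in the call
//   KB::Trsm<member_type, KB::Side::Left, KB::Uplo::Lower,
//            KB::Trans::NoTranspose, KB::Diag::Unit,
//            default_mode_type, default_algo_type>::invoke(member, one, A, X1);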
4449 
4450  template <int B>
4451  struct SingleVectorTag {};
4452  template <int B>
4453  struct MultiVectorTag {};
4454 
4455  template <int B>
4456  struct SingleVectorSubLineTag {};
4457  template <int B>
4458  struct MultiVectorSubLineTag {};
4459  template <int B>
4460  struct SingleVectorApplyCTag {};
4461  template <int B>
4462  struct MultiVectorApplyCTag {};
4463  template <int B>
4464  struct SingleVectorSchurTag {};
4465  template <int B>
4466  struct MultiVectorSchurTag {};
4467  template <int B>
4468  struct SingleVectorApplyETag {};
4469  template <int B>
4470  struct MultiVectorApplyETag {};
4471  template <int B>
4472  struct SingleVectorCopyToFlatTag {};
4473  template <int B>
4474  struct SingleZeroingTag {};
4475 
4476  template <int B>
4477  KOKKOS_INLINE_FUNCTION void
4478  operator()(const SingleVectorTag<B> &, const member_type &member) const {
4479  const local_ordinal_type packidx = member.league_rank();
4480  const local_ordinal_type partidx = packptr(packidx);
4481  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4482  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4483  const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4484  const local_ordinal_type r0 = part2packrowidx0(partidx);
4485  const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4486  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4487  const local_ordinal_type num_vectors = 1;
4488  internal_vector_scratch_type_3d_view
4489  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4490  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4491  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4492  });
4493  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4494  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4495  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4496  });
4497  }
4498 
4499  template <int B>
4500  KOKKOS_INLINE_FUNCTION void
4501  operator()(const MultiVectorTag<B> &, const member_type &member) const {
4502  const local_ordinal_type packidx = member.league_rank();
4503  const local_ordinal_type partidx = packptr(packidx);
4504  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4505  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4506  const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4507  const local_ordinal_type r0 = part2packrowidx0(partidx);
4508  const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4509  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4510  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4511 
4512  internal_vector_scratch_type_3d_view
4513  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4514  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4515  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4516  });
4517  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4518  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4519  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4520  });
4521  }
4522 
4523  template <int B>
4524  KOKKOS_INLINE_FUNCTION void
4525  operator()(const SingleVectorSubLineTag<B> &, const member_type &member) const {
4526  // btdm is packed and sorted from the largest part
4527  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4528 
4529  const local_ordinal_type subpartidx = packptr_sub(packidx);
4530  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4531  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4532  const local_ordinal_type partidx = subpartidx % n_parts;
4533 
4534  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
4535  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4536  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4537  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4538  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4539 
4540  //(void) i0;
4541  //(void) nrows;
4542  (void)npacks;
4543 
4544  internal_vector_scratch_type_3d_view
4545  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4546 
4547  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4548  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4549  });
4550  }
4551 
4552  template <int B>
4553  KOKKOS_INLINE_FUNCTION void
4554  operator()(const SingleVectorApplyCTag<B> &, const member_type &member) const {
4555  // btdm is packed and sorted from the largest part
4556  // const local_ordinal_type packidx = packindices_schur(member.league_rank());
4557  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4558 
4559  const local_ordinal_type subpartidx = packptr_sub(packidx);
4560  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4561  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4562  const local_ordinal_type partidx = subpartidx % n_parts;
4563  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4564 
4565  // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4566  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4567  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4568  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4569 
4570  internal_vector_scratch_type_3d_view
4571  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4572 
4573  // Compute v_2 = v_2 - C v_1
4574 
4575  const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
4576  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
4577  const local_ordinal_type i0_offset = i0 + 2;  // both branches of the original conditional were identical
4578 
4579  (void)i0_schur;
4580  (void)i0_offset;
4581 
4582  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4583 
4584  const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx) - 2 : 0;
4585  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx + 1) + 1;
4586 
4587  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4588 
4589  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4590  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4591 
4592  if (local_subpartidx == 0) {
4593  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4594  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), 0, v);
4595  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4596  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4597 
4598  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4599  member,
4600  blocksize, blocksize,
4601  -one,
4602  C.data(), C.stride(0), C.stride(1),
4603  v_1.data(), v_1.stride(0),
4604  one,
4605  v_2.data(), v_2.stride(0));
4606  });
4607  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4608  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4609  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4610  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4611  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4612 
4613  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4614  member,
4615  blocksize, blocksize,
4616  -one,
4617  C.data(), C.stride(0), C.stride(1),
4618  v_1.data(), v_1.stride(0),
4619  one,
4620  v_2.data(), v_2.stride(0));
4621  });
4622  } else {
4623  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4624  {
4625  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), 0, v);
4626  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4627  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4628 
4629  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4630  member,
4631  blocksize, blocksize,
4632  -one,
4633  C.data(), C.stride(0), C.stride(1),
4634  v_1.data(), v_1.stride(0),
4635  one,
4636  v_2.data(), v_2.stride(0));
4637  }
4638  {
4639  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4640  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4641  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4642 
4643  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4644  member,
4645  blocksize, blocksize,
4646  -one,
4647  C.data(), C.stride(0), C.stride(1),
4648  v_1.data(), v_1.stride(0),
4649  one,
4650  v_2.data(), v_2.stride(0));
4651  }
4652  });
4653  }
4654  }
4655 
4656  template <int B>
4657  KOKKOS_INLINE_FUNCTION void
4658  operator()(const SingleVectorSchurTag<B> &, const member_type &member) const {
4659  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4660 
4661  const local_ordinal_type partidx = packptr_sub(packidx);
4662 
4663  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4664 
4665  const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx, 0);
4666  const local_ordinal_type nrows = 2 * (n_subparts_per_part - 1);
4667 
4668  const local_ordinal_type r0_schur = nrows * member.league_rank();
4669 
4670  internal_vector_scratch_type_3d_view
4671  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4672 
4673  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4674  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4675  for (local_ordinal_type i = 0; i < 2; ++i) {
4676  copy3DView<local_ordinal_type>(member,
4677  Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4678  Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4679  }
4680  }
4681 
4682  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4683  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4684  });
4685 
4686  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4687  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4688  for (local_ordinal_type i = 0; i < 2; ++i) {
4689  copy3DView<local_ordinal_type>(member,
4690  Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4691  Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4692  }
4693  }
4694  }
4695 
4696  template <int B>
4697  KOKKOS_INLINE_FUNCTION void
4698  operator()(const SingleVectorApplyETag<B> &, const member_type &member) const {
4699  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4700 
4701  const local_ordinal_type subpartidx = packptr_sub(packidx);
4702  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4703  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4704  const local_ordinal_type partidx = subpartidx % n_parts;
4705  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4706 
4707  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4708  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4709 
4710  internal_vector_scratch_type_3d_view
4711  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4712 
4713  // Compute v_2 = v_2 - C v_1
4714 
4715  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4716 
4717  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4718 
4719  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4720  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4721 
4722  if (local_subpartidx == 0) {
4723  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4724  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4725 
4726  for (local_ordinal_type row = 0; row < nrows; ++row) {
4727  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4728  auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4729 
4730  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4731  member,
4732  blocksize, blocksize,
4733  -one,
4734  E.data(), E.stride(0), E.stride(1),
4735  v_2.data(), v_2.stride(0),
4736  one,
4737  v_1.data(), v_1.stride(0));
4738  }
4739  });
4740  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4741  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4742  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4743 
4744  for (local_ordinal_type row = 0; row < nrows; ++row) {
4745  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4746  auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4747 
4748  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4749  member,
4750  blocksize, blocksize,
4751  -one,
4752  E.data(), E.stride(0), E.stride(1),
4753  v_2.data(), v_2.stride(0),
4754  one,
4755  v_1.data(), v_1.stride(0));
4756  }
4757  });
4758  } else {
4759  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4760  {
4761  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4762 
4763  for (local_ordinal_type row = 0; row < nrows; ++row) {
4764  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4765  auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4766 
4767  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4768  member,
4769  blocksize, blocksize,
4770  -one,
4771  E.data(), E.stride(0), E.stride(1),
4772  v_2.data(), v_2.stride(0),
4773  one,
4774  v_1.data(), v_1.stride(0));
4775  }
4776  }
4777  {
4778  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4779 
4780  for (local_ordinal_type row = 0; row < nrows; ++row) {
4781  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4782  auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4783 
4784  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4785  member,
4786  blocksize, blocksize,
4787  -one,
4788  E.data(), E.stride(0), E.stride(1),
4789  v_2.data(), v_2.stride(0),
4790  one,
4791  v_1.data(), v_1.stride(0));
4792  }
4793  }
4794  });
4795  }
4796  }
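// Editorial summary of the four single-vector tags above, under the usual
// Schur-complement reading of this line-splitting scheme. Write one packed
// line, with interior unknowns x1 and interface unknowns x2, as
//   [ A1  B ] [x1]   [b1]
//   [ C  A2 ] [x2] = [b2].
// Eliminating x1 gives (A2 - C A1^{-1} B) x2 = b2 - C A1^{-1} b1, and the
// kernels map onto that algebra roughly as
//   SingleVectorSubLineTag : y1 = A1^{-1} b1 on every subline,
//   SingleVectorApplyCTag  : b2 := b2 - C y1 (the GEMVs above),
//   SingleVectorSchurTag   : solve the reduced Schur tridiagonal for x2,
//   SingleVectorApplyETag  : x1 = y1 - E x2, with E = A1^{-1} B held in
//                            e_internal_vector_values.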
4797 
4798  template <int B>
4799  KOKKOS_INLINE_FUNCTION void
4800  operator()(const SingleVectorCopyToFlatTag<B> &, const member_type &member) const {
4801  const local_ordinal_type packidx = member.league_rank();
4802  const local_ordinal_type partidx = packptr(packidx);
4803  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4804  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4805  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4806  const local_ordinal_type num_vectors = 1;
4807 
4808  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4809  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4810  });
4811  }
4812 
4813  template <int B>
4814  KOKKOS_INLINE_FUNCTION void
4815  operator()(const SingleZeroingTag<B> &, const member_type &member) const {
4816  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4817  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4818  });
4819  }
4820 
4821  void run(const impl_scalar_type_2d_view_tpetra &Y,
4822  const impl_scalar_type_1d_view &Z) {
4823  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4824  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4825 
4827  this->Y_scalar_multivector = Y;
4828  this->Z_scalar_vector = Z;
4829 
4830  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4831  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4832 
4833  const local_ordinal_type team_size =
4834  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4835  recommended_team_size(blocksize, vector_length, internal_vector_length);
4836  const int per_team_scratch = internal_vector_scratch_type_3d_view::shmem_size(blocksize, num_vectors, vector_loop_size);
4837 
4838 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4839 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4840  if (num_vectors == 1) { \
4841  const Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4842  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4843  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4844  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)), *this); \
4845  } else { \
4846  const Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4847  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4848  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4849  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)), *this); \
4850  } \
4851  break
4852 #else
4853 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4854  if (num_vectors == 1) { \
4855  if (packindices_schur.extent(1) <= 0) { \
4856  Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4857  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4858  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4859  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4860  policy, *this); \
4861  } else { \
4862  { \
4863  Kokkos::TeamPolicy<execution_space, SingleZeroingTag<B>> \
4864  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4865  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4866  policy, *this); \
4867  } \
4868  { \
4869  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4870  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4871  Kokkos::TeamPolicy<execution_space, SingleVectorSubLineTag<B>> \
4872  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4873  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4874  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4875  policy, *this); \
4876  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4877  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4878  } \
4879  { \
4880  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4881  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4882  Kokkos::TeamPolicy<execution_space, SingleVectorApplyCTag<B>> \
4883  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4884  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4885  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4886  policy, *this); \
4887  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4888  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4889  } \
4890  { \
4891  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4892  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4893  Kokkos::TeamPolicy<execution_space, SingleVectorSchurTag<B>> \
4894  policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4895  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4896  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4897  policy, *this); \
4898  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4899  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4900  } \
4901  { \
4902  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4903  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4904  Kokkos::TeamPolicy<execution_space, SingleVectorApplyETag<B>> \
4905  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4906  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4907  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4908  policy, *this); \
4909  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4910  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4911  } \
4912  { \
4913  Kokkos::TeamPolicy<execution_space, SingleVectorCopyToFlatTag<B>> \
4914  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4915  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4916  policy, *this); \
4917  } \
4918  } \
4919  } else { \
4920  Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4921  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4922  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4923  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4924  policy, *this); \
4925  } \
4926  break
4927 #endif
4928  switch (blocksize) {
4929  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(3);
4930  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(5);
4931  case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(6);
4932  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(7);
4933  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4934  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4935  case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4936  case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4937  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4938  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4939  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4940  case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4941  default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(0);
4942  }
4943 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
4944 
4945  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4946  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4947  }
4948 };
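// --- Editorial sketch (not part of the original source): the
// switch-to-template dispatch that run() uses above. Promoting the runtime
// blocksize to a compile-time constant B lets the kernels unroll their
// blocksize loops; B = 0 is the generic fallback that keeps the runtime
// extent. Names here are hypothetical.
#if 0
#include <cstdio>

template <int B>
void kernel(int runtime_blocksize) {
  const int blocksize = (B == 0 ? runtime_blocksize : B);
  std::printf("running with blocksize %d (compile-time B = %d)\n",
              blocksize, B);
}

inline void dispatch(int blocksize) {
  switch (blocksize) {
    case 3: kernel<3>(blocksize); break;
    case 5: kernel<5>(blocksize); break;
    // ... one case per specialized size, exactly as in run() above ...
    default: kernel<0>(blocksize); break;  // generic path
  }
}
#endif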
4949 
4953 template <typename MatrixType>
4954 int applyInverseJacobi( // importer
4955  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
4956  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
4957  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
4958  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
4959  const bool overlap_communication_and_computation,
4960  // tpetra interface
4961  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
4962  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
4963  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
4964  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
4965  // local object interface
4966  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
4967  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
4968  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
4969  /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
4970  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
4971  // preconditioner parameters
4972  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
4973  /* */ bool is_y_zero,
4974  const int max_num_sweeps,
4975  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
4976  const int check_tol_every) {
4977  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
4978 
4979  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4980  using node_memory_space = typename impl_type::node_memory_space;
4981  using local_ordinal_type = typename impl_type::local_ordinal_type;
4982  using size_type = typename impl_type::size_type;
4983  using impl_scalar_type = typename impl_type::impl_scalar_type;
4984  using magnitude_type = typename impl_type::magnitude_type;
4985  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4986  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
4987  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4988  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
4989 
4990  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4991 
4992  // at most one of the Tpetra importer and the async importer may be active
4993  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
4994  "Both the Tpetra importer and the async importer are non-null; only one may be active.");
4995  // the maximum number of sweeps must be positive
4996  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
4997  "Maximum number of sweeps must be >= 1.");
4998 
4999  // const parameters
5000  const bool is_seq_method_requested = !tpetra_importer.is_null();
5001  const bool is_async_importer_active = !async_importer.is_null();
5002  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5003  const magnitude_type tolerance = tol * tol;
5004  const local_ordinal_type blocksize = btdm.values.extent(1);
5005  const local_ordinal_type num_vectors = Y.getNumVectors();
5006  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
5007 
5008  const impl_scalar_type zero(0.0);
5009 
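 // Editorial note (hedged): the convergence test works with squared norms,
 // which is why the user-facing tol is squared once above; the square root is
 // only taken at the very end in norm_manager.finalize(). A sketch of the
 // criterion presumably applied by checkDone():
 //
 //   // converged once the squared residual norm has dropped by tol^2,
 //   // i.e. ||r_sweep||^2 <= (tol * tol) * ||r_0||^2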
5010  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
5011  "The seq method for applyInverseJacobi, "
5012  << "which in any case is for developer use only, "
5013  << "does not support norm-based termination.");
5014  const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
5015  Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
5016  TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
5017  std::invalid_argument,
5018  "The seq method for applyInverseJacobi, "
5019  << "which in any case is for developer use only, "
5020  << "only supports memory spaces accessible from host.");
5021 
5022  // if more workspace is needed, resize it
5023  const size_type work_span_required = num_blockrows * num_vectors * blocksize;
5024  if (work.span() < work_span_required)
5025  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
5026 
5027  // construct W
5028  const local_ordinal_type W_size = interf.packptr.extent(0) - 1;
5029  if (local_ordinal_type(W.extent(0)) < W_size)
5030  W = impl_scalar_type_1d_view("W", W_size);
5031 
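 // Editorial note (hedged): W has one entry per pack of parts (packptr has
 // npacks + 1 entries, hence extent(0) - 1). The tridiagonal solve kernel
 // accumulates per-pack norm contributions into W, and reduceVector() later
 // collapses W into the norm manager's buffer.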
5032  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5033  {
5034  if (is_seq_method_requested) {
5035  if (Z.getNumVectors() != Y.getNumVectors())
5036  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
5037  } else {
5038  if (is_async_importer_active) {
5039  // create comm data buffer and keep it here
5040  async_importer->createDataBuffer(num_vectors);
5041  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5042  }
5043  }
5044  }
5045 
5046  // wrap the workspace in a 3d view
5047  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
5048  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5049  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5050  const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
5051  if (is_y_zero) Kokkos::deep_copy(YY, zero);
5052 
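 // Editorial note (hedged): the pmv constructor above takes a raw pointer, so
 // it builds an unmanaged Kokkos view that aliases the flat workspace rather
 // than allocating; conceptually:
 //
 //   // pmv(block_row, point_in_block, rhs_vector) indexes into work.data()
 //   // with extents (num_blockrows, blocksize, num_vectors)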
5053  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
5054  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
5055  damping_factor, is_norm_manager_active);
5056 
5057  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
5058 
5059  auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
5060  auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
5061 
5062  bool hasBlockCrsMatrix = !A_bcrs.is_null();
5063 
5064  // it is OK here to use the graph of the A_crs matrix together with a block size of 1
5065  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
5066 
5067  BlockHelperDetails::ComputeResidualVector<MatrixType>
5068  compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5069  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5070  hasBlockCrsMatrix);
5071 
5072  // norm manager workspace resize
5073  if (is_norm_manager_active)
5074  norm_manager.setCheckFrequency(check_tol_every);
5075 
5076  // iterate
5077  int sweep = 0;
5078  for (; sweep < max_num_sweeps; ++sweep) {
5079  {
5080  if (is_y_zero) {
5081  // pmv := x(lclrow)
5082  multivector_converter.run(XX);
5083  } else {
5084  if (is_seq_method_requested) {
5085  // SEQ METHOD IS TESTING ONLY
5086 
5087  // y := x - R y
5088  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5089  compute_residual_vector.run(YY, XX, ZZ);
5090 
5091  // pmv := y(lclrow).
5092  multivector_converter.run(YY);
5093  } else {
5094  // fused y := x - R y and pmv := y(lclrow);
5095  // real use cases do not overlap computation and communication
5096  if (overlap_communication_and_computation || !is_async_importer_active) {
5097  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5098  // OverlapTag, compute_owned = true
5099  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
5100  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5101  if (is_async_importer_active) async_importer->cancel();
5102  break;
5103  }
5104  if (is_async_importer_active) {
5105  async_importer->syncRecv();
5106  // OverlapTag, compute_owned = false
5107  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
5108  }
5109  } else {
5110  if (is_async_importer_active)
5111  async_importer->syncExchange(YY);
5112  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5113  // AsyncTag
5114  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5115  }
5116  }
5117  }
5118  }
5119 
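 // Editorial note (hedged): the overlapped branch above follows the usual
 // communication-hiding pattern; as a pseudocode sketch:
 //
 //   asyncSendRecv(y);         // post the halo exchange
 //   residual(owned part);     // compute while messages are in flight
 //   if (converged) { cancel(); break; }
 //   syncRecv();               // wait for the halo values
 //   residual(remote part);    // finish contributions that need the halo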
5120  // pmv := inv(D) pmv.
5121  {
5122  solve_tridiags.run(YY, W);
5123  }
5124  {
5125  if (is_norm_manager_active) {
5126  // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
5127  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5128  if (sweep + 1 == max_num_sweeps) {
5129  norm_manager.ireduce(sweep, true);
5130  norm_manager.checkDone(sweep + 1, tolerance, true);
5131  } else {
5132  norm_manager.ireduce(sweep);
5133  }
5134  }
5135  }
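 // Editorial note (hedged): ireduce() appears to start the global reduction
 // of the freshly computed local norms without blocking; the result is then
 // inspected by checkDone() at the top of the next sweep, so the reduction
 // can overlap with that sweep's residual computation. On the last sweep both
 // the reduction and the final check are forced (the 'true' arguments).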
5136  is_y_zero = false;
5137  }
5138 
5139  // sqrt the norms for the caller's use.
5140  if (is_norm_manager_active) norm_manager.finalize();
5141 
5142  return sweep;
5143 }
5144 
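// Editorial note (hedged): both applyInverseJacobi above and the fused
// variant below implement damped block Jacobi. With D the block tridiagonal
// part, R = A - D, and damping factor a, one sweep computes
//
//   y_{k+1} = (1 - a) * y_k + a * D^{-1} (x - R * y_k)
//
// which for y_0 = 0 reduces to the first-sweep shortcut y_1 = a * D^{-1} x.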
5145 // Implementation of fused block Jacobi for a specific block size,
5146 // or (if B == 0) for a general block size.
5147 template <typename MatrixType, int B>
5148 int applyFusedBlockJacobi_Impl(
5149  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5150  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5151  const bool overlap_communication_and_computation,
5152  // tpetra interface
5153  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5154  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5155  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5156  // local object interface
5157  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5158  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5159  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5160  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5161  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5162  // preconditioner parameters
5163  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5164  /* */ bool is_y_zero,
5165  const int max_num_sweeps,
5166  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5167  const int check_tol_every) {
5168  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5169  using local_ordinal_type = typename impl_type::local_ordinal_type;
5170  using size_type = typename impl_type::size_type;
5171  using magnitude_type = typename impl_type::magnitude_type;
5172  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5173  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
5174 
5175  // the tpetra importer and async importer can't both be active
5176  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5177  "The Tpetra importer and the async importer cannot both be non-null.");
5178  // the maximum number of sweeps must be a positive number
5179  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5180  "Maximum number of sweeps must be >= 1.");
5181 
5182  // const parameters
5183  const bool is_async_importer_active = !async_importer.is_null();
5184  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5185  const magnitude_type tolerance = tol * tol;
5186  const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5187  const local_ordinal_type num_vectors = Y.getNumVectors();
5188  const local_ordinal_type num_blockrows = interf.nparts;
5189 
5190  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5191  {
5192  if (is_async_importer_active) {
5193  // create comm data buffer and keep it here
5194  async_importer->createDataBuffer(num_vectors);
5195  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5196  }
5197  }
5198 
5199  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5200  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5201 
5202  const bool two_pass_residual =
5203  overlap_communication_and_computation && is_async_importer_active;
5204 
5205  // Calculate the required work size and reallocate it if not already big enough.
5206  // Check that our assumptions about YY dimension are correct.
5207  TEUCHOS_TEST_FOR_EXCEPT_MSG(
5208  size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5209  "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) << " = " << YY.extent(0) * YY.extent(1) << ",\n"
5210  << "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors << " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n');
5211  size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5212  if (work.extent(0) < work_required) {
5213  work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required);
5214  }
5215 
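 // Editorial note: do_not_initialize_tag is an alias for
 // Kokkos::ViewAllocateWithoutInitializing, so the workspace allocation above
 // skips the default zero-fill; that is safe here because every entry of the
 // workspace is written before it is read.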
5216  Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5217 
5218  // construct W
5219  if (W.extent(0) != size_t(num_blockrows))
5220  W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows);
5221 
5222  // Create the required functors upfront (this is inexpensive - all shallow copies)
5223  BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
5224  functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
5225  BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
5226  functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5227  BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
5228  functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5229 
5230  // norm manager workspace resize
5231  if (is_norm_manager_active)
5232  norm_manager.setCheckFrequency(check_tol_every);
5233 
5234  // For double-buffering.
5235  // y_buffers[current_y] has the current iterate of y.
5236  // y_buffers[1-current_y] has the next iterate of y.
5237  Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5238  int current_y = 0;
5239 
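 // Editorial note (hedged): each sweep reads y_buffers[current_y] and writes
 // y_buffers[1 - current_y], after which the roles are swapped; this avoids a
 // read/write hazard on Y inside a single fused kernel. For example:
 //
 //   sweep 0: read YY           -> write y_doublebuf
 //   sweep 1: read y_doublebuf  -> write YY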
5240  // iterate
5241  int sweep = 0;
5242  for (; sweep < max_num_sweeps; ++sweep) {
5243  if (is_y_zero) {
5244  // If y is initially zero, then we are just computing y := damping_factor * Dinv * x
5245  functor_solve_only.run(XX, y_buffers[1 - current_y]);
5246  } else {
5247  // real use cases do not overlap computation and communication
5248  if (overlap_communication_and_computation || !is_async_importer_active) {
5249  if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5250  if (two_pass_residual) {
5251  // Pass 1 computes owned residual and stores into new y buffer,
5252  // but doesn't apply Dinv or produce a norm yet
5253  functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1 - current_y]);
5254  } else {
5255  // This case happens when running with a single rank.
5256  // There are no remote columns, so residual and solve can happen in one step.
5257  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5258  }
5259  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5260  if (is_async_importer_active) async_importer->cancel();
5261  break;
5262  }
5263  if (is_async_importer_active) {
5264  async_importer->syncRecv();
5265  // Pass 2 finishes computing the residual, then applies Dinv and computes the norm.
5266  functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5267  }
5268  } else {
5269  if (is_async_importer_active)
5270  async_importer->syncExchange(y_buffers[current_y]);
5271  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5272  // Full residual, Dinv apply, and norm in one kernel
5273  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5274  }
5275  }
5276 
5277  // Compute global norm.
5278  if (is_norm_manager_active) {
5279  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5280  if (sweep + 1 == max_num_sweeps) {
5281  norm_manager.ireduce(sweep, true);
5282  norm_manager.checkDone(sweep + 1, tolerance, true);
5283  } else {
5284  norm_manager.ireduce(sweep);
5285  }
5286  }
5287  is_y_zero = false;
5288  // flip the y buffers for the next iteration (or for termination, if we reached max_num_sweeps).
5289  current_y = 1 - current_y;
5290  }
5291  if (current_y == 1) {
5292  // We finished iterating with y in the double buffer, so copy it to the user's vector.
5293  Kokkos::deep_copy(YY, y_doublebuf);
5294  }
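 // Editorial note (hedged): current_y flips once per completed sweep, so the
 // copy above is needed exactly when the last accepted iterate ended up in
 // the scratch buffer (y_doublebuf) rather than in the caller's Y.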
5295 
5296  // sqrt the norms for the caller's use.
5297  if (is_norm_manager_active) norm_manager.finalize();
5298  return sweep;
5299 }
5300 
5304 template <typename MatrixType>
5305 int applyFusedBlockJacobi(
5306  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5307  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5308  const bool overlap_communication_and_computation,
5309  // tpetra interface
5310  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5311  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5312  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5313  // local object interface
5314  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5315  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5316  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5317  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5318  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5319  // preconditioner parameters
5320  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5321  /* */ bool is_y_zero,
5322  const int max_num_sweeps,
5323  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5324  const int check_tol_every) {
5325  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5326  int blocksize = btdm.d_inv.extent(1);
5327  int sweep = 0;
5328 #define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) \
5329  { \
5330  sweep = applyFusedBlockJacobi_Impl<MatrixType, B>( \
5331  tpetra_importer, async_importer, overlap_communication_and_computation, \
5332  X, Y, W, interf, btdm, amd, work, \
5333  norm_manager, damping_factor, is_y_zero, \
5334  max_num_sweeps, tol, check_tol_every); \
5335  } \
5336  break
5337  switch (blocksize) {
5338  case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(3);
5339  case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(5);
5340  case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(7);
5341  case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(9);
5342  case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
5343  case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
5344  case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
5345  case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
5346  case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
5347  default: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(0);
5348  }
5349 #undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI
5350 
5351  return sweep;
5352 }
5353 
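// Editorial note (hedged sketch): the switch above maps the runtime block
// size onto the compile-time template parameter B so the batched kernels can
// be fully unrolled for common sizes, with B == 0 selecting the general
// runtime-blocksize fallback. A minimal standalone illustration of the same
// pattern (kernel/dispatch are hypothetical names, not part of Ifpack2):
//
//   template <int B> int kernel(int blocksize) {
//     // a real kernel would use B (if nonzero) as a compile-time extent
//     return B == 0 ? blocksize : B;
//   }
//   int dispatch(int blocksize) {
//     switch (blocksize) {
//       case 3: return kernel<3>(blocksize);
//       case 5: return kernel<5>(blocksize);
//       default: return kernel<0>(blocksize);  // general path
//     }
//   }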
5354 template <typename MatrixType>
5355 struct ImplObject {
5356  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5357  using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5358  using block_tridiags_type = BlockTridiags<MatrixType>;
5359  using amd_type = BlockHelperDetails::AmD<MatrixType>;
5360  using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
5361  using async_import_type = AsyncableImport<MatrixType>;
5362 
5363  // distributed objects
5367  Teuchos::RCP<async_import_type> async_importer;
5368  bool overlap_communication_and_computation;
5369 
5370  // copy of Y (mutable to penetrate const)
5371  mutable typename impl_type::tpetra_multivector_type Z;
5372  mutable typename impl_type::impl_scalar_type_1d_view W;
5373 
5374  // local objects
5375  part_interface_type part_interface;
5376  block_tridiags_type block_tridiags; // D
5377  amd_type a_minus_d; // R = A - D
5378 
5379  // whether to use fused block Jacobi path
5380  bool use_fused_jacobi;
5381 
5382  // vector workspace, used for the general block tridiagonal case
5383  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace (1D view of vector)
5384  // scalar workspace, used for the fused block Jacobi case
5385  mutable typename impl_type::impl_scalar_type_1d_view work_flat; // right hand side workspace (1D view of scalar)
5386  mutable norm_manager_type norm_manager;
5387 };
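// Editorial note (hedged): ImplObject appears to bundle the state that
// persists across the preconditioner's phases: the importers and
// part_interface come from setup, block_tridiags and a_minus_d are filled by
// the numeric phase, and the workspaces plus norm_manager are reused by every
// apply, so the setup cost is paid once.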
5388 
5389 } // namespace BlockTriDiContainerDetails
5390 
5391 } // namespace Ifpack2
5392 
5393 #endif