Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_BlockTriDiContainer_impl.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
11 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
12 
13 //#define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
14 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
15 
17 
18 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
19 #include <Tpetra_Distributor.hpp>
20 #include <Tpetra_BlockMultiVector.hpp>
21 
22 #include <Kokkos_ArithTraits.hpp>
23 #include <KokkosBatched_Util.hpp>
24 #include <KokkosBatched_Vector.hpp>
25 #include <KokkosBatched_Copy_Decl.hpp>
26 #include <KokkosBatched_Copy_Impl.hpp>
27 #include <KokkosBatched_AddRadial_Decl.hpp>
28 #include <KokkosBatched_AddRadial_Impl.hpp>
29 #include <KokkosBatched_SetIdentity_Decl.hpp>
30 #include <KokkosBatched_SetIdentity_Impl.hpp>
31 #include <KokkosBatched_Gemm_Decl.hpp>
32 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
33 #include <KokkosBatched_Gemm_Team_Impl.hpp>
34 #include <KokkosBatched_Gemv_Decl.hpp>
35 #include <KokkosBatched_Gemv_Team_Impl.hpp>
36 #include <KokkosBatched_Trsm_Decl.hpp>
37 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
38 #include <KokkosBatched_Trsm_Team_Impl.hpp>
39 #include <KokkosBatched_Trsv_Decl.hpp>
40 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
41 #include <KokkosBatched_Trsv_Team_Impl.hpp>
42 #include <KokkosBatched_LU_Decl.hpp>
43 #include <KokkosBatched_LU_Serial_Impl.hpp>
44 #include <KokkosBatched_LU_Team_Impl.hpp>
45 
46 #include <KokkosBlas1_nrm1.hpp>
47 #include <KokkosBlas1_nrm2.hpp>
48 
49 #include <memory>
50 
51 #include "Ifpack2_BlockHelper.hpp"
52 #include "Ifpack2_BlockComputeResidualVector.hpp"
53 #include "Ifpack2_BlockComputeResidualAndSolve.hpp"
54 
55 //#include <KokkosBlas2_gemv.hpp>
56 
57 // need to interface this into cmake variable (or only use this flag when it is necessary)
58 //#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
59 //#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
60 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
61 #include "cuda_profiler_api.h"
62 #endif
63 
64 // I am not 100% sure about the mpi 3 on cuda
65 #if MPI_VERSION >= 3
66 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
67 #endif
68 
69 // ::: Experiments :::
70 // define either pinned memory or cudamemory for mpi
71 // if both macros are disabled, it will use tpetra memory space which is uvm space for cuda
72 // if defined, this uses pinned memory instead of a device pointer
73 // by default, we enable pinned memory
74 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
75 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
76 
77 // if defined, all views are allocated on cuda space instead of cuda uvm space
78 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
79 
80 // if defined, btdm_scalar_type is used (if impl_scalar_type is double, btdm_scalar_type is float)
81 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
82 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
83 #endif
84 
85 // if defined, it uses multiple execution spaces
86 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
87 
88 namespace Ifpack2 {
89 
90  namespace BlockTriDiContainerDetails {
91 
92  namespace KB = KokkosBatched;
93 
97  using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
98 
99  template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
100  using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
101  MemoryTraitsType::is_random_access |
102  flag>;
103 
104  template <typename ViewType>
105  using Unmanaged = Kokkos::View<typename ViewType::data_type,
106  typename ViewType::array_layout,
107  typename ViewType::device_type,
108  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
109  template <typename ViewType>
110  using Atomic = Kokkos::View<typename ViewType::data_type,
111  typename ViewType::array_layout,
112  typename ViewType::device_type,
113  MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
114  template <typename ViewType>
115  using Const = Kokkos::View<typename ViewType::const_data_type,
116  typename ViewType::array_layout,
117  typename ViewType::device_type,
118  typename ViewType::memory_traits>;
119  template <typename ViewType>
120  using ConstUnmanaged = Const<Unmanaged<ViewType> >;
121 
122  template <typename ViewType>
123  using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
124 
125  template <typename ViewType>
126  using Unmanaged = Kokkos::View<typename ViewType::data_type,
127  typename ViewType::array_layout,
128  typename ViewType::device_type,
129  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
130 
131 
132  template <typename ViewType>
133  using Scratch = Kokkos::View<typename ViewType::data_type,
134  typename ViewType::array_layout,
135  typename ViewType::execution_space::scratch_memory_space,
136  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
137 
// Selects the scalar type used to store the block tridiagonal matrices.
// The primary template is the identity mapping; when
// IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG is defined, double maps to float.
template<typename T>
struct BlockTridiagScalarType {
  using type = T;
};
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
template<>
struct BlockTridiagScalarType<double> {
  using type = float;
};
//template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
#endif
146 
// Profiler region markers.
// With both KOKKOS_ENABLE_CUDA and IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE defined they expand to
// cudaProfilerStart() / cudaProfilerStop() wrapped in KOKKOS_IMPL_CUDA_SAFE_CALL; otherwise they
// expand to nothing.
// Note: the BEGIN form already ends in a semicolon while the END form is a braced block.
147 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
148 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
149  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
150 
151 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
152  { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
153 #else
// Profiling disabled: both markers are empty.
154 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
156 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
157 #endif
158 
// Builds a Tpetra import object from the point domain map to the point column map of A's graph.
//
// A is dynamically cast to both the CRS and the block-CRS matrix type. When the block-CRS cast
// succeeds, its graph and block size are used; otherwise the CRS matrix's graph is used with a
// block size of 1. The maps are expanded to point maps with makePointMap before the import
// object is created.
//
// Returns an RCP wrapping a newly allocated tpetra_import_type(src, tgt).
//
// NOTE(review): the return-type line and the `impl_type` alias used below are not visible in
// this listing (the listing numbers skip 163 and 166).
// NOTE(review): when the block-CRS cast fails, A_crs is dereferenced without a null check, so A
// is presumably guaranteed to be one of the two matrix types -- confirm against callers.
162  template<typename MatrixType>
164  createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
165  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
167  using tpetra_map_type = typename impl_type::tpetra_map_type;
168  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
169  using tpetra_import_type = typename impl_type::tpetra_import_type;
170  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
171  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
172 
// At most one of these casts is expected to be non-null.
173  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
174  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
175 
176  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
177 
178  // This is OK here to use the graph of the A_crs matrix and a block size of 1
179  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
180 
181  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
// Source = point version of the domain map, target = point version of the column map.
182  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
183  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
184  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
185  return Teuchos::rcp(new tpetra_import_type(src, tgt));
186  }
187 
188  // Partial replacement for forward-mode MultiVector::doImport.
189  // Permits overlapped communication and computation, but also supports sync'ed.
190  // I'm finding that overlapped comm/comp can give quite poor performance on some
191  // platforms, so we can't just use it straightforwardly always.
192 
193  template<typename MatrixType>
194  struct AsyncableImport {
195  public:
197 
198  private:
202 #if !defined(HAVE_IFPACK2_MPI)
203  typedef int MPI_Request;
204  typedef int MPI_Comm;
205 #endif
206  using scalar_type = typename impl_type::scalar_type;
209 
210  static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
211 #ifdef HAVE_IFPACK2_MPI
212  MPI_Request ureq;
213  int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
214  if (ireq == NULL) MPI_Request_free(&ureq);
215  return ret;
216 #else
217  return 0;
218 #endif
219  }
220 
221  static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
222 #ifdef HAVE_IFPACK2_MPI
223  MPI_Request ureq;
224  int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
225  if (ireq == NULL) MPI_Request_free(&ureq);
226  return ret;
227 #else
228  return 0;
229 #endif
230  }
231 
232  static int waitany(int count, MPI_Request* reqs, int* index) {
233 #ifdef HAVE_IFPACK2_MPI
234  return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
235 #else
236  return 0;
237 #endif
238  }
239 
240  static int waitall(int count, MPI_Request* reqs) {
241 #ifdef HAVE_IFPACK2_MPI
242  return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
243 #else
244  return 0;
245 #endif
246  }
247 
248  public:
// Public type aliases; almost all are forwarded from impl_type.
249  using tpetra_map_type = typename impl_type::tpetra_map_type;
250  using tpetra_import_type = typename impl_type::tpetra_import_type;
251 
252  using local_ordinal_type = typename impl_type::local_ordinal_type;
253  using global_ordinal_type = typename impl_type::global_ordinal_type;
254  using size_type = typename impl_type::size_type;
255  using impl_scalar_type = typename impl_type::impl_scalar_type;
256 
// Host-space views of rank ids and local ids.
257  using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
258  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
259 
260  using execution_space = typename impl_type::execution_space;
261  using memory_space = typename impl_type::memory_space;
262  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
263  using size_type_1d_view = typename impl_type::size_type_1d_view;
264  using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
265 
// 1D scalar view type used for the pack/unpack buffers.
// In CUDA builds where execution_space is Kokkos::Cuda, the memory space is chosen by the
// experiment macros: CudaHostPinnedSpace (PINNED_MEMORY), CudaSpace (CUDA_MEMORY), or the
// impl_type default when neither macro is defined. In every other case it is the impl_type default.
266 #if defined(KOKKOS_ENABLE_CUDA)
267  using impl_scalar_type_1d_view =
268  typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
269 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
270  Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
271 # elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
272  Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
273 # else // no experimental macros are defined
274  typename impl_type::impl_scalar_type_1d_view,
275 # endif
276  typename impl_type::impl_scalar_type_1d_view>::type;
277 #else
278  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
279 #endif
// Host-space staging buffer type (used when MPI is not assumed to be GPU-aware).
280  using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type*,Kokkos::HostSpace>;
281  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
282  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
283 
// MPI communicator; set in the constructor from the target map's communicator.
284 #ifdef HAVE_IFPACK2_MPI
285  MPI_Comm comm;
286 #endif
287 
// Destination of the unpacked receive data (see syncRecvVar0 / syncRecvVar1); sized in
// createDataBuffer as (number of receive lids * blocksize) x number of vectors.
288  impl_scalar_type_2d_view_tpetra remote_multivector;
289  local_ordinal_type blocksize;
290 
// Holds the send-side and receive-side instance of the same kind of data.
291  template<typename T>
292  struct SendRecvPair {
293  T send, recv;
294  };
295 
296  // (s)end and (r)eceive data:
297  SendRecvPair<int_1d_view_host> pids; // mpi ranks
298  SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is pointer, cannot use kokkos view
299  SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
300  SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
301  SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
302  SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
303  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host; // data buffer
304 
// Stored from the constructor argument dm2cm_; not used elsewhere in the visible part of the file.
305  local_ordinal_type_1d_view dm2cm; // permutation
306 
// Execution space instances used by the stream-based variant (asyncSendRecvVar1 / syncRecvVar1);
// filled by createExecutionSpaceInstances.
307 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
308  using exec_instance_1d_std_vector = std::vector<execution_space>;
309  exec_instance_1d_std_vector exec_instances;
310 #endif
311 
312  // for cuda
313  public:
314  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
315  const size_type_1d_view &offs) {
316  // wrap lens to kokkos view and deep copy to device
317  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
318  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
319 
320  // exclusive scan
321  const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
322  const local_ordinal_type lens_size = lens_device.extent(0);
323  Kokkos::parallel_scan
324  ("AsyncableImport::RangePolicy::setOffsetValues",
325  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
326  if (final)
327  offs(i) = update;
328  update += (i < lens_size ? lens_device[i] : 0);
329  });
330  }
331 
332  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
333  const size_type_1d_view_host &offs) {
334  // wrap lens to kokkos view and deep copy to device
335  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
336  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
337 
338  // exclusive scan
339  offs(0) = 0;
340  for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
341  offs(i) = offs(i-1) + lens[i-1];
342  }
343  }
344 
345  private:
346  void createMpiRequests(const tpetra_import_type &import) {
347  Tpetra::Distributor &distributor = import.getDistributor();
348 
349  // copy pids from distributor
350  const auto pids_from = distributor.getProcsFrom();
351  pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
352  memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
353 
354  const auto pids_to = distributor.getProcsTo();
355  pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
356  memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
357 
358  // mpi requests
359  reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
360  reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
361 
362  // construct offsets
363 #if 0
364  const auto lengths_to = distributor.getLengthsTo();
365  offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
366 
367  const auto lengths_from = distributor.getLengthsFrom();
368  offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
369 
370  setOffsetValues(lengths_to, offset.send);
371  offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
372 
373  setOffsetValues(lengths_from, offset.recv);
374  offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
375 #else
376  const auto lengths_to = distributor.getLengthsTo();
377  offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
378 
379  const auto lengths_from = distributor.getLengthsFrom();
380  offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
381 
382  setOffsetValuesHost(lengths_to, offset_host.send);
383  //offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
384 
385  setOffsetValuesHost(lengths_from, offset_host.recv);
386  //offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
387 #endif
388  }
389 
390  void createSendRecvIDs(const tpetra_import_type &import) {
391  // For each remote PID, the list of LIDs to receive.
392  const auto remote_lids = import.getRemoteLIDs();
393  const local_ordinal_type_1d_view_host
394  remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
395  lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
396  Kokkos::deep_copy(lids.recv, remote_lids_view_host);
397 
398  // For each export PID, the list of LIDs to send.
399  auto epids = import.getExportPIDs();
400  auto elids = import.getExportLIDs();
401  TEUCHOS_ASSERT(epids.size() == elids.size());
402  lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
403  auto lids_send_host = Kokkos::create_mirror_view(lids.send);
404 
405  // naive search (not sure if pids or epids are sorted)
406  for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
407  const auto pid_send_value = pids.send[i];
408  for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
409  if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
410  TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
411  }
412  Kokkos::deep_copy(lids.send, lids_send_host);
413  }
414 
// Fills exec_instances with 8 equally weighted partitions of the default execution space
// instance. Does nothing unless Kokkos was built with CUDA, HIP or SYCL.
void createExecutionSpaceInstances() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
# if KOKKOS_VERSION >= 40699
  // Newer Kokkos: the weights are passed as a vector (8 entries, each 1).
  const std::vector<int> equal_weights(8, 1);
  exec_instances = Kokkos::Experimental::partition_space(execution_space(), equal_weights);
# else
  // Older Kokkos: the weights are passed as individual arguments.
  exec_instances =
    Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
# endif
#endif
}
427 
428  public:
429  // for cuda, all tag types are public
430  struct ToBuffer {};
431  struct ToMultiVector {};
432 
433  AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
435  const local_ordinal_type blocksize_,
436  const local_ordinal_type_1d_view dm2cm_) {
437  blocksize = blocksize_;
438  dm2cm = dm2cm_;
439 
440 #ifdef HAVE_IFPACK2_MPI
441  comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
442 #endif
443  const tpetra_import_type import(src_map, tgt_map);
444 
445  createMpiRequests(import);
446  createSendRecvIDs(import);
447  createExecutionSpaceInstances();
448  }
449 
450  void createDataBuffer(const local_ordinal_type &num_vectors) {
451  const size_type extent_0 = lids.recv.extent(0)*blocksize;
452  const size_type extent_1 = num_vectors;
453  if (remote_multivector.extent(0) == extent_0 &&
454  remote_multivector.extent(1) == extent_1) {
455  // skip
456  } else {
457  remote_multivector =
458  impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
459 
460  const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
461  const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
462 
463  buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
464  buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
465 
466  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
467  buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
468  buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
469  }
470  }
471  }
472 
// Waits for all receive requests and then all send requests to complete (MPI builds only).
// Despite the name, the requests are waited on rather than cancelled.
void cancel () {
#ifdef HAVE_IFPACK2_MPI
  auto &recv_reqs = reqs.recv;
  auto &send_reqs = reqs.send;
  waitall(recv_reqs.size(), recv_reqs.data());
  waitall(send_reqs.size(), send_reqs.data());
#endif
}
479 
480  // ======================================================================
481  // Async version using execution space instances
482  // ======================================================================
483 
484 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
485  template<typename PackTag>
486  static
487  void copy(const local_ordinal_type_1d_view &lids_,
488  const impl_scalar_type_1d_view &buffer_,
489  const local_ordinal_type ibeg_,
490  const local_ordinal_type iend_,
491  const impl_scalar_type_2d_view_tpetra &multivector_,
492  const local_ordinal_type blocksize_,
493  const execution_space &exec_instance_) {
494  const local_ordinal_type num_vectors = multivector_.extent(1);
495  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
496  const local_ordinal_type idiff = iend_ - ibeg_;
497  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
498 
499  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
500  local_ordinal_type vector_size(0);
501  if (blocksize_ <= 4) vector_size = 4;
502  else if (blocksize_ <= 8) vector_size = 8;
503  else if (blocksize_ <= 16) vector_size = 16;
504  else vector_size = 32;
505 
506  const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
507  const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
508  Kokkos::parallel_for
509  (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
510  Kokkos::Experimental::require(policy, work_item_property),
511  KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
512  const local_ordinal_type i = member.league_rank();
513  Kokkos::parallel_for
514  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
515  auto aptr = abase + blocksize_*(i + idiff*j);
516  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
517  if (std::is_same<PackTag,ToBuffer>::value)
518  Kokkos::parallel_for
519  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
520  aptr[k] = bptr[k];
521  });
522  else
523  Kokkos::parallel_for
524  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
525  bptr[k] = aptr[k];
526  });
527  });
528  });
529  }
530 
// Stream-based variant of the nonblocking exchange.
// Posts one nonblocking receive per source rank, packs the send data of `mv` into buffer.send
// using the execution space instances (instance i%8 for destination i), then posts one
// nonblocking send per destination rank. All messages use tag 42. Receives are completed later
// by syncRecvVar1. Without MPI only the timer macros remain.
// The buffers used here are the ones allocated by createDataBuffer.
// NOTE(review): assumes createDataBuffer was called beforehand with mv.extent(1) columns -- confirm.
531  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
532  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
533 
534 #ifdef HAVE_IFPACK2_MPI
535  // constants and reallocate data buffers if necessary
536  const local_ordinal_type num_vectors = mv.extent(1);
537  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
538 
539  // 0. post receive async
// Receives go straight into the device buffer when MPI is assumed GPU-aware, otherwise into
// the host staging buffer. Byte counts are passed because irecv/isend use MPI_CHAR.
540  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
541  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
542  irecv(comm,
543  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
544  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
545  pids.recv[i],
546  42,
547  &reqs.recv[i]);
548  }
549  else {
550  irecv(comm,
551  reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
552  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
553  pids.recv[i],
554  42,
555  &reqs.recv[i]);
556  }
557  }
558 
// Fence a default-constructed execution space instance before packing on the partitioned
// instances (presumably so that mv is completely written before it is read -- confirm).
560  execution_space().fence();
561 
562  // 1. async memcpy
563  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
564  // 1.0. enqueue pack buffer
// Each of the 8 instances is fenced once, on its first use in this loop.
565  if (i<8) exec_instances[i%8].fence();
566  copy<ToBuffer>(lids.send, buffer.send,
567  offset_host.send(i), offset_host.send(i+1),
568  mv, blocksize,
569  //execution_space());
570  exec_instances[i%8]);
// Not GPU-aware: also enqueue, on the same instance, the copy of the packed range to the host
// staging buffer.
571  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
572  //if (i<8) exec_instances[i%8].fence();
// These two locals shadow the identical values computed at the top of the function.
573  const local_ordinal_type num_vectors = mv.extent(1);
574  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
575 
576  Kokkos::deep_copy(exec_instances[i%8],
577  Kokkos::subview(buffer_host.send,
578  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
579  offset_host.send(i)*mv_blocksize,
580  offset_host.send(i+1)*mv_blocksize)),
581  Kokkos::subview(buffer.send,
582  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
583  offset_host.send(i)*mv_blocksize,
584  offset_host.send(i+1)*mv_blocksize)));
585  }
586  }
588  //execution_space().fence();
589  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
590  // 1.1. sync the stream and isend
// Fencing instance i for i<8 is sufficient: destination i was packed on instance i%8, and by
// the time i >= 8 is reached all 8 instances have already been fenced in this loop.
591  if (i<8) exec_instances[i%8].fence();
592  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
593  isend(comm,
594  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
595  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
596  pids.send[i],
597  42,
598  &reqs.send[i]);
599  }
600  else {
601  isend(comm,
602  reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
603  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
604  pids.send[i],
605  42,
606  &reqs.send[i]);
607  }
608  }
609 
610  // 2. poke communication
// The probe results (flag, stat) are intentionally unused.
611  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
612  int flag;
613  MPI_Status stat;
614  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
615  }
616 #endif // HAVE_IFPACK2_MPI
617  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
618  }
619 
// Stream-based completion of the exchange started by asyncSendRecvVar1.
// For each posted receive: waits for any one of them to finish, copies its range from the host
// staging buffer to the device buffer when MPI is not assumed GPU-aware, and enqueues the unpack
// into remote_multivector on exec_instances[idx%8]. Afterwards it fences globally and waits for
// all sends. Without MPI only the timer macros remain.
620  void syncRecvVar1() {
621  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
622 #ifdef HAVE_IFPACK2_MPI
623  // 0. wait for receive async.
624  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
// idx is passed to waitany as the output slot for the position of the completed request;
// the loop counter only controls how many completions are awaited.
625  local_ordinal_type idx = i;
626 
627  // 0.0. wait any
628  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
629 
630  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
631  const local_ordinal_type num_vectors = remote_multivector.extent(1);
632  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
633 
// This deep_copy takes no execution space instance argument, unlike the one in asyncSendRecvVar1.
634  Kokkos::deep_copy(
635  Kokkos::subview(buffer.recv,
636  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
637  offset_host.recv(idx)*mv_blocksize,
638  offset_host.recv(idx+1)*mv_blocksize)),
639  Kokkos::subview(buffer_host.recv,
640  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
641  offset_host.recv(idx)*mv_blocksize,
642  offset_host.recv(idx+1)*mv_blocksize)));
643  }
644 
645  // 0.1. unpack data after data is moved into a device
646  copy<ToMultiVector>(lids.recv, buffer.recv,
647  offset_host.recv(idx), offset_host.recv(idx+1),
648  remote_multivector, blocksize,
649  exec_instances[idx%8]);
650  }
651 
652  // 1. fire up all cuda events
// Global fence: the unpack kernels enqueued above are complete after this point.
653  Kokkos::fence();
654 
655  // 2. cleanup all open comm
656  waitall(reqs.send.size(), reqs.send.data());
657 #endif // HAVE_IFPACK2_MPI
658  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
659  }
660 #endif //defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
661 
662  // ======================================================================
663  // Generic version without using execution space instances
664  // - only difference between device and host architecture is on using team
665  // or range policies.
666  // ======================================================================
667  template<typename PackTag>
668  static
669  void copy(const local_ordinal_type_1d_view &lids_,
670  const impl_scalar_type_1d_view &buffer_,
671  const local_ordinal_type &ibeg_,
672  const local_ordinal_type &iend_,
673  const impl_scalar_type_2d_view_tpetra &multivector_,
674  const local_ordinal_type blocksize_) {
675  const local_ordinal_type num_vectors = multivector_.extent(1);
676  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
677  const local_ordinal_type idiff = iend_ - ibeg_;
678  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
679  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
680  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
681  local_ordinal_type vector_size(0);
682  if (blocksize_ <= 4) vector_size = 4;
683  else if (blocksize_ <= 8) vector_size = 8;
684  else if (blocksize_ <= 16) vector_size = 16;
685  else vector_size = 32;
686  const team_policy_type policy(idiff, 1, vector_size);
687  Kokkos::parallel_for
688  ("AsyncableImport::TeamPolicy::copy",
689  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
690  const local_ordinal_type i = member.league_rank();
691  Kokkos::parallel_for
692  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
693  auto aptr = abase + blocksize_*(i + idiff*j);
694  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
695  if (std::is_same<PackTag,ToBuffer>::value)
696  Kokkos::parallel_for
697  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
698  aptr[k] = bptr[k];
699  });
700  else
701  Kokkos::parallel_for
702  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
703  bptr[k] = aptr[k];
704  });
705  });
706  });
707  } else {
708  const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
709  Kokkos::parallel_for
710  ("AsyncableImport::RangePolicy::copy",
711  policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
712  const local_ordinal_type i = ij%idiff;
713  const local_ordinal_type j = ij/idiff;
714  auto aptr = abase + blocksize_*(i + idiff*j);
715  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
716  auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
717  auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
718  memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
719  });
720  }
721  }
722 
723 
727  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
728  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
729 
730 #ifdef HAVE_IFPACK2_MPI
731  // constants and reallocate data buffers if necessary
732  const local_ordinal_type num_vectors = mv.extent(1);
733  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
734 
735  // receive async
736  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
737  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
738  irecv(comm,
739  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
740  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
741  pids.recv[i],
742  42,
743  &reqs.recv[i]);
744  }
745  else {
746  irecv(comm,
747  reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
748  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
749  pids.recv[i],
750  42,
751  &reqs.recv[i]);
752  }
753  }
754 
755  // send async
756  for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
757  copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
758  mv, blocksize);
759  Kokkos::fence();
760  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
761  isend(comm,
762  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
763  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
764  pids.send[i],
765  42,
766  &reqs.send[i]);
767  }
768  else {
769  Kokkos::deep_copy(
770  Kokkos::subview(buffer_host.send,
771  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
772  offset_host.send(i)*mv_blocksize,
773  offset_host.send(i+1)*mv_blocksize)),
774  Kokkos::subview(buffer.send,
775  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
776  offset_host.send(i)*mv_blocksize,
777  offset_host.send(i+1)*mv_blocksize)));
778  isend(comm,
779  reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
780  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
781  pids.send[i],
782  42,
783  &reqs.send[i]);
784  }
785  }
786 
787  // I find that issuing an Iprobe seems to nudge some MPIs into action,
788  // which helps with overlapped comm/comp performance.
789  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
790  int flag;
791  MPI_Status stat;
792  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
793  }
794 #endif
795  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
796  }
797 
798  void syncRecvVar0() {
799  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
800 #ifdef HAVE_IFPACK2_MPI
801  // receive async.
802  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
803  local_ordinal_type idx = i;
804  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
805  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
806  const local_ordinal_type num_vectors = remote_multivector.extent(1);
807  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
808  Kokkos::deep_copy(
809  Kokkos::subview(buffer.recv,
810  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
811  offset_host.recv(idx)*mv_blocksize,
812  offset_host.recv(idx+1)*mv_blocksize)),
813  Kokkos::subview(buffer_host.recv,
814  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
815  offset_host.recv(idx)*mv_blocksize,
816  offset_host.recv(idx+1)*mv_blocksize)));
817  }
818  copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
819  remote_multivector, blocksize);
820  }
821  // wait on the sends to match all Isends with a cleanup operation.
822  waitall(reqs.send.size(), reqs.send.data());
823 #endif
824  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
825  }
826 
830  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
831 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
832 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
833  asyncSendRecvVar1(mv);
834 #else
835  asyncSendRecvVar0(mv);
836 #endif
837 #else
838  asyncSendRecvVar0(mv);
839 #endif
840  }
841  void syncRecv() {
842 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
843 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
844  syncRecvVar1();
845 #else
846  syncRecvVar0();
847 #endif
848 #else
849  syncRecvVar0();
850 #endif
851  }
852 
853  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
854  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
855  asyncSendRecv(mv);
856  syncRecv();
857  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
858  }
859 
860  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
861  };
862 
863  template <typename ViewType1, typename ViewType2>
864  struct are_same_struct {
865  ViewType1 keys1;
866  ViewType2 keys2;
867 
868  are_same_struct(ViewType1 keys1_, ViewType2 keys2_) : keys1(keys1_), keys2(keys2_) {}
869  KOKKOS_INLINE_FUNCTION
870  void operator()(int i, unsigned int& count) const {
871  if (keys1(i) != keys2(i)) count++;
872  }
873  };
874 
875  template <typename ViewType1, typename ViewType2>
876  bool are_same (ViewType1 keys1, ViewType2 keys2) {
877  unsigned int are_same_ = 0;
878 
879  Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
880  are_same_struct(keys1, keys2),
881  are_same_);
882  return are_same_==0;
883  }
884 
// Builds the AsyncableImport used to fetch remote (off-process) column entries for A.
//
// A is dynamically cast to both the CRS and the block-CRS matrix type; the block-CRS graph
// and block size are used when that cast succeeds, otherwise the CRS graph with block size 1.
// An importer is returned only if the column map's remote GIDs can be separated from the
// owned ones (separate_remotes) and the resulting remote-only map has a nonzero global
// size; in every other case Teuchos::null is returned.
//
// NOTE(review): the line carrying the return type, the `impl_type` alias and the
// declaration of `invalid` are not visible in this listing although all three are used
// below -- confirm against the full file.
888  template<typename MatrixType>
890  createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
891  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
893  using tpetra_map_type = typename impl_type::tpetra_map_type;
894  using local_ordinal_type = typename impl_type::local_ordinal_type;
895  using global_ordinal_type = typename impl_type::global_ordinal_type;
896  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
897  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
898  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
899  using global_indices_array_device_type = Kokkos::View<const global_ordinal_type*, typename tpetra_map_type::device_type>;
900 
// Try both concrete matrix types; A_bcrs being non-null selects the block path.
901  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
902  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
903 
904  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
905 
906  // This is OK here to use the graph of the A_crs matrix and a block size of 1
// NOTE(review): A_crs is dereferenced without a null check when A is not a block matrix --
// presumably callers guarantee one of the two casts succeeds; verify.
907  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
908 
909  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
910  const auto domain_map = g.getDomainMap();
911  const auto column_map = g.getColMap();
912 
// Remote GIDs collected by the slow path below.
913  std::vector<global_ordinal_type> gids;
914 
// Tail of the column map's GID list (entries after the first domain-map-sized prefix);
// assigned only on the fast path below.
915  Kokkos::Subview<global_indices_array_device_type, std::pair<int,int>> column_map_global_iD_last;
916 
917  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
918  {
919  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);
920 
921  global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
922  global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();
923 
// Fast path: the column map starts with exactly the domain map's GIDs, in the same order.
// NOTE(review): are_same indexes column_map_global_iD over the domain map's extent; this
// assumes the column map has at least as many local entries as the domain map -- confirm.
924  if(are_same(domain_map_global_iD, column_map_global_iD)) {
925  // this should be the most likely path
926  separate_remotes = true;
927  need_owned_permutation = false;
928 
929  column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
930  std::pair<int,int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
931  }
932  else {
933  // This loop is relatively expensive
// Slow path: walk the column map on the host. GIDs not owned by the domain map are
// appended to gids; seeing an owned GID after the first non-owned one means owned and
// remote entries are interleaved, so separate_remotes is cleared and the walk stops.
934  for (size_t i=0;i<column_map->getLocalNumElements();++i) {
935  const global_ordinal_type gid = column_map->getGlobalElement(i);
936  if (!domain_map->isNodeGlobalElement(gid)) {
937  found_first = true;
938  gids.push_back(gid);
939  } else if (found_first) {
940  separate_remotes = false;
941  break;
942  }
943  if (!found_first && !need_owned_permutation &&
944  domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
945  // The owned part of the domain and column maps are different
946  // orderings. We *could* do a super efficient impl of this case in the
947  // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
948  // really, if a caller cares about speed, they wouldn't make different
949  // local permutations like this. So we punt on the best impl and go for
950  // a pretty good one: the permutation is done in place in
951  // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
952  // is the presumably worse memory access pattern of the input vector.
953  need_owned_permutation = true;
954  }
955  }
956  }
957  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
958  }
959 
960  if (separate_remotes) {
961  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
// Map holding only the remote GIDs: built from gids when an owned permutation is needed,
// otherwise from the column-map tail captured on the fast path.
// NOTE(review): if the slow path ends with separate_remotes == true and
// need_owned_permutation == false, column_map_global_iD_last was never assigned and the
// GIDs gathered in gids are not used -- verify whether that combination can occur.
963  const auto parsimonious_col_map
964  = need_owned_permutation ?
965  Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())):
966  Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
967  if (parsimonious_col_map->getGlobalNumElements() > 0) {
968  // make the importer only if needed.
// dm2cm stays default-constructed unless an owned permutation is needed.
969  local_ordinal_type_1d_view dm2cm;
970  if (need_owned_permutation) {
971  dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
972  const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
// Entry i is the domain-map local index of the GID stored at column-map local index i.
973  for (size_t i=0;i<domain_map->getLocalNumElements();++i)
974  dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
975  Kokkos::deep_copy(dm2cm, dm2cm_host);
976  }
977  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
978  return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
979  }
980  }
// Either remotes are interleaved with owned entries or there is no remote GID globally.
981  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
982  return Teuchos::null;
983  }
984 
// Cost-model term for one triangular solve on a block: block_size squared.
template<typename local_ordinal_type>
local_ordinal_type costTRSM(const local_ordinal_type block_size) {
  const local_ordinal_type squared = block_size * block_size;
  return squared;
}
989 
// Cost-model term for one block matrix-vector product: twice block_size squared.
template<typename local_ordinal_type>
local_ordinal_type costGEMV(const local_ordinal_type block_size) {
  return (2 * block_size) * block_size;
}
994 
// Cost-model term for a block tridiagonal solve over subline_length block rows:
// 2*subline_length TRSM terms plus 2*(subline_length-1) GEMV terms.
template<typename local_ordinal_type>
local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
  const auto trsm_part = 2 * subline_length * costTRSM(block_size);
  const auto gemv_part = 2 * (subline_length-1) * costGEMV(block_size);
  return trsm_part + gemv_part;
}
999 
// Cost estimate for solving num_parts lines of length line_length on num_teams teams
// when every line is split into n_subparts_per_part sublines (two rows are reserved
// between consecutive sublines).
//
// Returns INT_MAX when the split would leave sublines shorter than one row. With a single
// subpart only the tridiagonal-solve term is returned; otherwise the E, S, A^{-1} and C
// terms are summed.
template<typename local_ordinal_type>
local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
                                  const local_ordinal_type num_teams,
                                  const local_ordinal_type line_length,
                                  const local_ordinal_type block_size,
                                  const local_ordinal_type n_subparts_per_part) {
  // Rows left for each subline after removing the 2 rows between neighbouring sublines, rounded up.
  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part-1) * 2) / n_subparts_per_part);
  if (subline_length < 1) return INT_MAX;

  // Sublines handled per team, rounded up, and the cost of solving them.
  const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part)*num_parts/num_teams);
  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length,block_size);

  // No splitting: the other terms do not contribute to the result.
  if (n_subparts_per_part == 1) return p_costApplyAinv;

  // Per-team counts (rounded up) of full lines and of subline interfaces.
  const local_ordinal_type p_n_lines = ceil(double(num_parts)/num_teams);
  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part-1)*num_parts/num_teams);

  const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
  const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part-1)*2,block_size);
  const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);

  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
}
1025 
// Picks the number of subparts per part by walking upward from 1 and stopping at the
// first count whose successor is not strictly cheaper according to costSolveSchur.
template<typename local_ordinal_type>
local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
                                         const local_ordinal_type num_teams,
                                         const local_ordinal_type line_length,
                                         const local_ordinal_type block_size) {
  local_ordinal_type n_subparts = 1;
  local_ordinal_type cost_current = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts);
  for (;;) {
    const local_ordinal_type cost_next = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts+1);
    // Stop as soon as one more subpart no longer lowers the estimate.
    if (!(cost_current > cost_next)) break;
    cost_current = cost_next;
    ++n_subparts;
  }
  return n_subparts;
}
1040 
// Forward declaration only; the definition is not in this part of the file.
// createPartInterface below calls its static recommended_team_size(...).
1041  template<typename ArgActiveExecutionMemorySpace>
1042  struct SolveTridiagsDefaultModeAndAlgo;
1043 
1047  template<typename MatrixType>
1048  BlockHelperDetails::PartInterface<MatrixType>
1049  createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1050  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
1051  const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type> > &partitions,
1052  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1053  IFPACK2_BLOCKHELPER_TIMER("createPartInterface", createPartInterface);
1054  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1055  using local_ordinal_type = typename impl_type::local_ordinal_type;
1056  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1057  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
1058  using size_type = typename impl_type::size_type;
1059 
1060  auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
1061 
1062  TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
1063  const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
1064  constexpr int vector_length = impl_type::vector_length;
1065  constexpr int internal_vector_length = impl_type::internal_vector_length;
1066 
1067  const auto comm = A->getRowMap()->getComm();
1068 
1069  BlockHelperDetails::PartInterface<MatrixType> interf;
1070 
1071  const bool jacobi = partitions.size() == 0;
1072  const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
1073  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1074 
1075  typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1076  std::vector<size_idx_pair_type> partsz(nparts);
1077 
1078  if (!jacobi) {
1079  for (local_ordinal_type i=0;i<nparts;++i)
1080  partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1081  std::sort(partsz.begin(), partsz.end(),
1082  [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
1083  return x.first > y.first;
1084  });
1085  }
1086 
1087  local_ordinal_type n_subparts_per_part;
1088  if (n_subparts_per_part_in == -1) {
1089  // If the number of subparts is set to -1, the user lets the algorithm
1090  // decide the value automatically.
1091  using execution_space = typename impl_type::execution_space;
1092 
1093  const int line_length = partsz[0].first;
1094 
1095  const local_ordinal_type team_size =
1096  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1097  recommended_team_size(blocksize, vector_length, internal_vector_length);
1098 
1099  const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
1100 
1101  n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1102 
1103 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1104  printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1105 #endif
1106  }
1107  else {
1108  n_subparts_per_part = n_subparts_per_part_in;
1109  }
1110 
1111  // Total number of sub lines:
1112  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1113  // Total number of sub lines + the Schur complement blocks.
1114  // For a given line, 2 sub lines imply one Schur complement, 3 sub lines imply two Schur complements, etc.
1115  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1);
1116 
1117 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1118  local_ordinal_type nrows = 0;
1119  if (jacobi)
1120  nrows = nparts;
1121  else
1122  for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1123 
1125  (nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1126  << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
1127 #endif
1128 
1129  // permutation vector
1130  std::vector<local_ordinal_type> p;
1131  if (jacobi) {
1132  interf.max_partsz = 1;
1133  interf.max_subpartsz = 0;
1134  interf.n_subparts_per_part = 1;
1135  interf.nparts = nparts;
1136  } else {
1137  // reorder parts to maximize simd packing efficiency
1138  p.resize(nparts);
1139 
1140  for (local_ordinal_type i=0;i<nparts;++i)
1141  p[i] = partsz[i].second;
1142 
1143  interf.max_partsz = partsz[0].first;
1144 
1145  constexpr local_ordinal_type connection_length = 2;
1146  const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1147  const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1148 
1149  interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1150  interf.n_subparts_per_part = n_subparts_per_part;
1151  interf.nparts = nparts;
1152  }
1153 
1154  // allocate parts
1155  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1156  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1157  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1158  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1159  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1160 
1161  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1162  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1163  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1164 
1165  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);
1166 
1167  // mirror to host and compute on host execution space
1168  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1169  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1170 
1171  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1172  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1173  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1174  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1175 
1176  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1177  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1178  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1179 
1180  // Determine parts.
1181  interf.row_contiguous = true;
1182  partptr(0) = 0;
1183  part2rowidx0(0) = 0;
1184  part2packrowidx0(0) = 0;
1185  local_ordinal_type pack_nrows = 0;
1186  local_ordinal_type pack_nrows_sub = 0;
1187  if (jacobi) {
1188  IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
1189  // Jacobi (all lines have length 1) means that A_n_lclrows == nparts,
1190  // so the mapping between parts and rows is trivial.
1191  // Note: we can leave interf.row_contiguous = true, since for all i: lclrow(i) == i
1192  for (local_ordinal_type i=0; i <= nparts; ++i) {
1193  part2rowidx0(i) = i;
1194  partptr(i) = i;
1195  }
1196  for (local_ordinal_type i=0; i < nparts; ++i) {
1197  rowidx2part(i) = i;
1198  lclrow(i) = i;
1199  }
1200  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1201  //assume No overlap.
1202  if (ip % vector_length == 0) pack_nrows = 1;
1203  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1204  }
1205  part2rowidx0_sub(0) = 0;
1206  partptr_sub(0, 0) = 0;
1207 
1208  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1209  constexpr local_ordinal_type ipnrows = 1;
1210  const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1211 
1213  (full_line_length != ipnrows, std::logic_error,
1214  "In the part " << ip );
1215 
1216  constexpr local_ordinal_type connection_length = 2;
1217 
1218  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1220  (true, std::logic_error,
1221  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1222 
1223  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1224  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1225 
1226  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1227 
1228  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1229  const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1230  const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1231  if (local_sub_ip != n_subparts_per_part-1) {
1232  if (local_sub_ip != 0) {
1233  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1234  }
1235  else if (ip != 0) {
1236  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1237  }
1238  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1239  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1240  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1241 
1242  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1243  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1244 
1245 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1246  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1247  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1248 #endif
1249  }
1250  else {
1251  if (local_sub_ip != 0) {
1252  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1253  }
1254  else if (ip != 0) {
1255  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1256  }
1257  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1258 
1259  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1260 
1261 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1262  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1263 #endif
1264  }
1265  }
1266  }
1267 
1268 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1269  std::cout << "partptr_sub = " << std::endl;
1270  for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1271  for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1272  std::cout << partptr_sub(i,j) << " ";
1273  }
1274  std::cout << std::endl;
1275  }
1276  std::cout << "partptr_sub end" << std::endl;
1277 #endif
1278 
1279  {
1280  local_ordinal_type npacks = ceil(float(nparts)/vector_length);
1281 
1282  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1283  for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1284  part2packrowidx0_sub(ip, 0) = 0;
1285  }
1286  for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1287  if (ipack != 0) {
1288  local_ordinal_type ip_min = ipack*vector_length;
1289  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1290  for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1291  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1292  }
1293  }
1294 
1295  for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1296  local_ordinal_type ip_min = ipack*vector_length;
1297  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1298 
1299  const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1300 
1301  constexpr local_ordinal_type connection_length = 2;
1302 
1303  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1304  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1305 
1306  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1307  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1308  if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1309 
1310  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1311 
1312  for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1313  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1314  }
1315  }
1316  }
1317 
1318  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1319  }
1320  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1321  } else {
1322  IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
1323  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1324  const auto* part = &partitions[p[ip]];
1325  const local_ordinal_type ipnrows = part->size();
1326  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1327  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1328  BlockHelperDetails::get_msg_prefix(comm)
1329  << "partition " << p[ip]
1330  << " is empty, which is not allowed.");
1331  //assume No overlap.
1332  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1333  // Since parts are ordered in decreasing size, the size of the first
1334  // part in a pack is the size for all parts in the pack.
1335  if (ip % vector_length == 0) pack_nrows = ipnrows;
1336  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1337  const local_ordinal_type offset = partptr(ip);
1338  for (local_ordinal_type i=0;i<ipnrows;++i) {
1339  const auto lcl_row = (*part)[i];
1340  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1341  BlockHelperDetails::get_msg_prefix(comm)
1342  << "partitions[" << p[ip] << "]["
1343  << i << "] = " << lcl_row
1344  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1345  << "].");
1346  lclrow(offset+i) = lcl_row;
1347  rowidx2part(offset+i) = ip;
1348  if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1349  interf.row_contiguous = false;
1350  }
1351  partptr(ip+1) = offset + ipnrows;
1352 
1353 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1354  printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1355  printf("partptr(%d+1) = %d\n", ip, partptr(ip+1));
1356 #endif
1357  }
1358 
1359  part2rowidx0_sub(0) = 0;
1360  partptr_sub(0, 0) = 0;
1361  //const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1362 
1363  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1364  const auto* part = &partitions[p[ip]];
1365  const local_ordinal_type ipnrows = part->size();
1366  const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1367 
1369  (full_line_length != ipnrows, std::logic_error,
1370  "In the part " << ip );
1371 
1372  constexpr local_ordinal_type connection_length = 2;
1373 
1374  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1376  (true, std::logic_error,
1377  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1378 
1379  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1380  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1381 
1382  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1383 
1384  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1385  const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1386  const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1387  if (local_sub_ip != n_subparts_per_part-1) {
1388  if (local_sub_ip != 0) {
1389  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1390  }
1391  else if (ip != 0) {
1392  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1393  }
1394  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1395  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1396  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1397 
1398  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1399  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1400 
1401 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1402  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1403  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1404 #endif
1405  }
1406  else {
1407  if (local_sub_ip != 0) {
1408  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1409  }
1410  else if (ip != 0) {
1411  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1412  }
1413  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1414 
1415  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1416 
1417 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1418  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1419 #endif
1420  }
1421  }
1422  }
1423 
1424  {
1425  local_ordinal_type npacks = ceil(float(nparts)/vector_length);
1426 
1427  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1428  for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1429  part2packrowidx0_sub(ip, 0) = 0;
1430  }
1431  for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1432  if (ipack != 0) {
1433  local_ordinal_type ip_min = ipack*vector_length;
1434  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1435  for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1436  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1437  }
1438  }
1439 
1440  for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1441  local_ordinal_type ip_min = ipack*vector_length;
1442  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1443 
1444  const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1445 
1446  constexpr local_ordinal_type connection_length = 2;
1447 
1448  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1449  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1450 
1451  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1452  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1453  if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1454 
1455  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1456 
1457  for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1458  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1459  }
1460  }
1461  }
1462 
1463  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1464  }
1465  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1466  }
1467 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1468  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1469 #endif
1470  if (lclrow(0) != 0) interf.row_contiguous = false;
1471 
1472  Kokkos::deep_copy(interf.partptr, partptr);
1473  Kokkos::deep_copy(interf.lclrow, lclrow);
1474 
1475  Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1476 
1477  //assume No overlap. Thus:
1478  interf.part2rowidx0 = interf.partptr;
1479  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1480 
1481  interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1482  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1483 
1484  { // Fill packptr.
1485  IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
1486  local_ordinal_type npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1487  npacks = 0;
1488  for (local_ordinal_type ip=1;ip<=nparts;++ip) //n_sub_parts_and_schur
1489  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1490  ++npacks;
1491 
1492  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1493  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1494  packptr(0) = 0;
1495  for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1496  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1497  packptr(k++) = ip;
1498 
1499  Kokkos::deep_copy(interf.packptr, packptr);
1500 
1501  local_ordinal_type npacks_per_subpart = ceil(float(nparts)/vector_length);
1502  npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1503 
1504  interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart*n_subparts_per_part);
1505  interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart,n_subparts_per_part-1);
1506 
1507  const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1508  const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1509 
1510 
1511  // Fill packindices_sub and packindices_schur
1512  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part-1;++local_sub_ip) {
1513  for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1514  packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1515  packindices_schur(local_pack_ip,local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1516  }
1517  }
1518 
1519  for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1520  packindices_sub((n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip;
1521  }
1522 
1523 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1524  std::cout << "packindices_sub = " << std::endl;
1525  for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1526  std::cout << packindices_sub(i) << " ";
1527  }
1528  std::cout << std::endl;
1529  std::cout << "packindices_sub end" << std::endl;
1530 
1531  std::cout << "packindices_schur = " << std::endl;
1532  for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1533  for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1534  std::cout << packindices_schur(i,j) << " ";
1535  }
1536  std::cout << std::endl;
1537  }
1538 
1539  std::cout << "packindices_schur end" << std::endl;
1540 #endif
1541 
1542  Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1543  Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1544 
1545  interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1546  const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1547  packptr_sub(0) = 0;
1548  for (local_ordinal_type k=0;k<npacks + 1;++k)
1549  packptr_sub(k) = packptr(k%npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1550 
1551  Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1552  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1553  }
1554  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1555 
1556  return interf;
1557  }
1558 
1562  template <typename MatrixType>
1563  struct BlockTridiags {
1565  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1566  using size_type_1d_view = typename impl_type::size_type_1d_view;
1567  using size_type_2d_view = typename impl_type::size_type_2d_view;
1568  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1569  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1570  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1571 
1572  // flat_td_ptr(i) is the index into flat-array values of the start of the
1573  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1574  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1575  // vector_length is the position in the pack.
1576  size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1577  // List of local column indices into A from which to grab
1578  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1579  local_ordinal_type_1d_view A_colindsub;
1580  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1581  // tridiag's pack, and i % vector_length gives the position in the pack.
1582  vector_type_3d_view values;
1583  // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1584  // Schur's pack, and i % vector_length gives the position in the pack.
1585  vector_type_3d_view values_schur;
1586  // inv(A_00)*A_01 block values.
1587  vector_type_4d_view e_values;
1588 
1589  // The following are for fused block Jacobi only.
1590  // For block row i, diag_offset(i)...diag_offset(i + bs^2)
1591  // is the range of scalars for the diagonal block.
1592  size_type_1d_view diag_offsets;
1593  // For fused residual+solve block Jacobi case,
1594  // this contains the diagonal block inverses in flat, local row indexing:
1595  // d_inv(row, :, :) gives the row-major block for row.
1596  btdm_scalar_type_3d_view d_inv;
1597 
1598  bool is_diagonal_only;
1599 
1600  BlockTridiags() = default;
1601  BlockTridiags(const BlockTridiags &b) = default;
1602 
1603  // Index into row-major block of a tridiag.
1604  template <typename idx_type>
1605  static KOKKOS_FORCEINLINE_FUNCTION
1606  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1607  // Given a row of a row-major tridiag, return the index of the first block
1608  // in that row.
1609  template <typename idx_type>
1610  static KOKKOS_FORCEINLINE_FUNCTION
1611  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1612  // Number of blocks in a tridiag having a given number of rows.
1613  template <typename idx_type>
1614  static KOKKOS_FORCEINLINE_FUNCTION
1615  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
1616  // Number of blocks associated to a Schur complement having a given number of rows.
1617  template <typename idx_type>
1618  static KOKKOS_FORCEINLINE_FUNCTION
1619  idx_type NumBlocksSchur (const idx_type& nrows) { return nrows > 0 ? 3*nrows + 2 : 0; }
1620  };
1621 
1622 
1626  template<typename MatrixType>
1628  createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1629  IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
1630  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1631  using execution_space = typename impl_type::execution_space;
1632  using local_ordinal_type = typename impl_type::local_ordinal_type;
1633  using size_type = typename impl_type::size_type;
1634  using size_type_2d_view = typename impl_type::size_type_2d_view;
1635 
1636  constexpr int vector_length = impl_type::vector_length;
1637 
1639 
1640  const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1641 
1642  { // construct the flat index pointers into the tridiag values array.
1643  btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1644  const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part );
1645  Kokkos::parallel_scan
1646  ("createBlockTridiags::RangePolicy::flat_td_ptr",
1647  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1648  const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part);
1649  const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1650 
1651  if (final) {
1652  btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1653  }
1654  if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) {
1655  const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0);
1656  if (local_subpartidx % 2 == 0)
1657  update += btdm.NumBlocks(nrows);
1658  else
1659  update += btdm.NumBlocksSchur(nrows);
1660  }
1661  });
1662 
1663  const auto nblocks = Kokkos::create_mirror_view_and_copy
1664  (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1));
1665  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1666  }
1667 
1668  // And the packed index pointers.
1669  if (vector_length == 1) {
1670  btdm.pack_td_ptr = btdm.flat_td_ptr;
1671  } else {
1672  //const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1673 
1674  local_ordinal_type npacks_per_subpart = 0;
1675  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1676  Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1677  for (local_ordinal_type ip=1;ip<=interf.nparts;++ip) //n_sub_parts_and_schur
1678  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1679  ++npacks_per_subpart;
1680 
1681  btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1682  const Kokkos::RangePolicy<execution_space> policy(0,npacks_per_subpart);
1683 
1684  Kokkos::parallel_for
1685  ("createBlockTridiags::RangePolicy::pack_td_ptr",
1686  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1687  for (local_ordinal_type j = 0; j < 2*interf.n_subparts_per_part; ++j) {
1688  const local_ordinal_type pack_id = ( j == 2*interf.n_subparts_per_part-1 ) ? i+(j-1)*npacks_per_subpart : i+j*npacks_per_subpart;
1689  const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id+1) - interf.packptr_sub(pack_id);
1690 
1691  const local_ordinal_type parti = interf.packptr_sub(pack_id);
1692  const local_ordinal_type partidx = parti%interf.nparts;
1693 
1694  for (local_ordinal_type pti=0;pti<nparts_in_pack;++pti) {
1695  btdm.pack_td_ptr(partidx+pti, j) = btdm.flat_td_ptr(i, j);
1696  }
1697  }
1698  });
1699  }
1700 
1701  btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1702 
1703  const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1704  constexpr local_ordinal_type connection_length = 2;
1705 
1706  host_pack_td_ptr_schur(0,0) = 0;
1707  for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1708  if (i % vector_length == 0) {
1709  if (i != 0)
1710  host_pack_td_ptr_schur(i,0) = host_pack_td_ptr_schur(i-1,host_pack_td_ptr_schur.extent(1)-1);
1711  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part-1; ++j) {
1712  host_pack_td_ptr_schur(i,j+1) = host_pack_td_ptr_schur(i,j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part-2 ? 1 : 0);
1713  }
1714  }
1715  else {
1716  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1717  host_pack_td_ptr_schur(i,j) = host_pack_td_ptr_schur(i-1,j);
1718  }
1719  }
1720  }
1721 
1722  Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1723 
1724 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1725  const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1726  std::cout << "flat_td_ptr = " << std::endl;
1727  for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1728  for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1729  std::cout << host_flat_td_ptr(i,j) << " ";
1730  }
1731  std::cout << std::endl;
1732  }
1733  std::cout << "flat_td_ptr end" << std::endl;
1734 
1735  const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1736 
1737  std::cout << "pack_td_ptr = " << std::endl;
1738  for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1739  for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1740  std::cout << host_pack_td_ptr(i,j) << " ";
1741  }
1742  std::cout << std::endl;
1743  }
1744  std::cout << "pack_td_ptr end" << std::endl;
1745 
1746 
1747  std::cout << "pack_td_ptr_schur = " << std::endl;
1748  for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1749  for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1750  std::cout << host_pack_td_ptr_schur(i,j) << " ";
1751  }
1752  std::cout << std::endl;
1753  }
1754  std::cout << "pack_td_ptr_schur end" << std::endl;
1755 #endif
1756 
1757  // values and A_colindsub are created in the symbolic phase
1758  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1759 
1760  return btdm;
1761  }
1762 
1763  // Set the tridiags to be I to the full pack block size. That way, if a
1764  // tridiag within a pack is shorter than the longest one, the extra blocks are
1765  // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1766  // in the packed multvector are 0, and the tridiag LU reflects the extra I
1767  // blocks, then the solve proceeds as though the extra blocks aren't
1768  // present. Since this extra work is part of the SIMD calls, it's not actually
1769  // extra work. Instead, it means we don't have to put checks or masks in, or
1770  // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1771  // since the numeric phase fills in only the used entries, leaving these I
1772  // blocks intact.
1773  template<typename MatrixType>
1774  void
1775  setTridiagsToIdentity
1776  (const BlockTridiags<MatrixType>& btdm,
1777  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1778  {
1779  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1780  using execution_space = typename impl_type::execution_space;
1781  using local_ordinal_type = typename impl_type::local_ordinal_type;
1782  using size_type_2d_view = typename impl_type::size_type_2d_view;
1783 
1784  const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1785  const local_ordinal_type blocksize = btdm.values.extent(1);
1786 
1787  {
1788  const int vector_length = impl_type::vector_length;
1789  const int internal_vector_length = impl_type::internal_vector_length;
1790 
1791  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1792  using internal_vector_type = typename impl_type::internal_vector_type;
1793  using internal_vector_type_4d_view =
1794  typename impl_type::internal_vector_type_4d_view;
1795 
1796  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1797  const internal_vector_type_4d_view values
1798  (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1799  btdm.values.extent(0),
1800  btdm.values.extent(1),
1801  btdm.values.extent(2),
1802  vector_length/internal_vector_length);
1803  const local_ordinal_type vector_loop_size = values.extent(3);
1804 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1805  local_ordinal_type total_team_size(0);
1806  if (blocksize <= 5) total_team_size = 32;
1807  else if (blocksize <= 9) total_team_size = 64;
1808  else if (blocksize <= 12) total_team_size = 96;
1809  else if (blocksize <= 16) total_team_size = 128;
1810  else if (blocksize <= 20) total_team_size = 160;
1811  else total_team_size = 160;
1812  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1813  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1814 #elif defined(KOKKOS_ENABLE_HIP)
1815  // FIXME: HIP
1816  // These settings might be completely wrong
1817  // will have to do some experiments to decide
1818  // what makes sense on AMD GPUs
1819  local_ordinal_type total_team_size(0);
1820  if (blocksize <= 5) total_team_size = 32;
1821  else if (blocksize <= 9) total_team_size = 64;
1822  else if (blocksize <= 12) total_team_size = 96;
1823  else if (blocksize <= 16) total_team_size = 128;
1824  else if (blocksize <= 20) total_team_size = 160;
1825  else total_team_size = 160;
1826  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1827  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1828 #elif defined(KOKKOS_ENABLE_SYCL)
1829  // SYCL: FIXME
1830  local_ordinal_type total_team_size(0);
1831  if (blocksize <= 5) total_team_size = 32;
1832  else if (blocksize <= 9) total_team_size = 64;
1833  else if (blocksize <= 12) total_team_size = 96;
1834  else if (blocksize <= 16) total_team_size = 128;
1835  else if (blocksize <= 20) total_team_size = 160;
1836  else total_team_size = 160;
1837  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1838  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1839 #else
1840  // Host architecture: team size is always one
1841  const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1842 #endif
1843  Kokkos::parallel_for
1844  ("setTridiagsToIdentity::TeamPolicy",
1845  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1846  const local_ordinal_type k = member.league_rank();
1847  const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0);
1848  const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1);
1849 
1850  const local_ordinal_type diff = iend - ibeg;
1851  const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1852  const btdm_scalar_type one(1);
1853  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1854  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1855  const local_ordinal_type i = ibeg + ii*3;
1856  for (local_ordinal_type j=0;j<blocksize;++j) {
1857  values(i,j,j,v) = one;
1858  }
1859  });
1860  });
1861  });
1862  }
1863  }
1864 
1868  template<typename MatrixType>
1869  void
1870  performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1871  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1872  const BlockHelperDetails::PartInterface<MatrixType> &interf,
1875  const bool overlap_communication_and_computation,
1876  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
1877  bool useSeqMethod,
1878  bool use_fused_jacobi) {
1879  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);
1880 
1881  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1882 
1883  using execution_space = typename impl_type::execution_space;
1884  using host_execution_space = typename impl_type::host_execution_space;
1885 
1886  using local_ordinal_type = typename impl_type::local_ordinal_type;
1887  using global_ordinal_type = typename impl_type::global_ordinal_type;
1888  using size_type = typename impl_type::size_type;
1889  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1890  using size_type_1d_view = typename impl_type::size_type_1d_view;
1891  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1892  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1893  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
1894  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1895  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1896 
1897  constexpr int vector_length = impl_type::vector_length;
1898 
1899  const auto comm = A->getRowMap()->getComm();
1900 
1901  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1902  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1903 
1904  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
1905  TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1906  const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows()/g->getLocalNumRows();
1907 
1908  // mirroring to host
1909  const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1910  const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1911  const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1912  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1913  const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1914 
1915  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1916 
1917  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());
1918 
1919  // find column to row map on host
1920 
1921  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1922  {
1923  const auto rowmap = g->getRowMap();
1924  const auto colmap = g->getColMap();
1925  const auto dommap = g->getDomainMap();
1926  TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1927 
1928 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1929  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1930  Kokkos::parallel_for
1931  ("performSymbolicPhase::RangePolicy::col2row",
1932  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1933  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1935  if (dommap->isNodeGlobalElement(gid)) {
1936  const local_ordinal_type lc = colmap->getLocalElement(gid);
1937 # if defined(BLOCKTRIDICONTAINER_DEBUG)
1939  BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
1940  << " gives an invalid local column.");
1941 # endif
1942  col2row(lc) = lr;
1943  }
1944  });
1945 #endif
1946  }
1947 
1948  // construct the D and R graphs in A = D + R.
1949  {
1950  const auto local_graph = g->getLocalGraphHost();
1951  const auto local_graph_rowptr = local_graph.row_map;
1952  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1953  const auto local_graph_colidx = local_graph.entries;
1954 
1955  //assume no overlap.
1956 
1957  Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1958  {
1959  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1960  Kokkos::parallel_for
1961  ("performSymbolicPhase::RangePolicy::lclrow2idx",
1962  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1963  lclrow2idx[lclrow(i)] = i;
1964  });
1965  }
1966 
1967  // count (block) nnzs in D and R.
1969  typename sum_reducer_type::value_type sum_reducer_value;
1970  {
1971  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1972  Kokkos::parallel_reduce
1973  // profiling interface does not work
1974  (//"performSymbolicPhase::RangePolicy::count_nnz",
1975  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1976  // LID -> index.
1977  const local_ordinal_type ri0 = lclrow2idx[lr];
1978  const local_ordinal_type pi0 = rowidx2part(ri0);
1979  for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1980  const local_ordinal_type lc = local_graph_colidx(j);
1981  const local_ordinal_type lc2r = col2row[lc];
1982  bool incr_R = false;
1983  do { // breakable
1984  if (lc2r == (local_ordinal_type) -1) {
1985  incr_R = true;
1986  break;
1987  }
1988  const local_ordinal_type ri = lclrow2idx[lc2r];
1989  const local_ordinal_type pi = rowidx2part(ri);
1990  if (pi != pi0) {
1991  incr_R = true;
1992  break;
1993  }
1994  // Test for being in the tridiag. This is done in index space. In
1995  // LID space, tridiag LIDs in a row are not necessarily related by
1996  // {-1, 0, 1}.
1997  if (ri0 + 1 >= ri && ri0 <= ri + 1)
1998  ++update.v[0]; // D_nnz
1999  else
2000  incr_R = true;
2001  } while (0);
2002  if (incr_R) {
2003  if (lc < nrows) ++update.v[1]; // R_nnz_owned
2004  else ++update.v[2]; // R_nnz_remote
2005  }
2006  }
2007  }, sum_reducer_type(sum_reducer_value));
2008  }
2009  size_type D_nnz = sum_reducer_value.v[0];
2010  size_type R_nnz_owned = sum_reducer_value.v[1];
2011  size_type R_nnz_remote = sum_reducer_value.v[2];
2012 
2013  if (!overlap_communication_and_computation) {
2014  R_nnz_owned += R_nnz_remote;
2015  R_nnz_remote = 0;
2016  }
2017 
2018  // construct the D_00 graph.
2019  {
2020  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2021 
2022  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
2023  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2024 
2025 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2026  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2027 #endif
2028 
2029  const local_ordinal_type nparts = partptr.extent(0) - 1;
2030 
2031  {
2032  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2033  Kokkos::parallel_for
2034  ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2035  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
2036  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2037  local_ordinal_type offset = 0;
2038  for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
2039  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2040  offset = 1;
2041  const local_ordinal_type lr0 = lclrow(ri0);
2042  const size_type j0 = local_graph_rowptr(lr0);
2043  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
2044  const local_ordinal_type lc = local_graph_colidx(j);
2045  const local_ordinal_type lc2r = col2row[lc];
2046  if (lc2r == (local_ordinal_type) -1) continue;
2047  const local_ordinal_type ri = lclrow2idx[lc2r];
2048  const local_ordinal_type pi = rowidx2part(ri);
2049  if (pi != pi0) continue;
2050  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
2051  const local_ordinal_type row_entry = j - j0;
2052  D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry;
2053  }
2054  }
2055  });
2056  }
2057 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2058  for (size_t i=0;i<D_A_colindsub.extent(0);++i)
2060 #endif
2061  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2062 
2063  // Allocate values.
2064  {
2065  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1);
2066  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2067  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2068 
2069  if (interf.n_subparts_per_part > 1) {
2070  const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1);
2071  const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2072  btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2073  }
2074 
2075  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2076  }
2077  }
2078 
2079  // Construct the R graph.
2080  {
2081  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2082  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2083 
2084  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2085  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2086 
2087  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2088  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2089 
2090  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2091  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2092 
2093  {
2094  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
2095  Kokkos::parallel_for
2096  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2097  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2098  const local_ordinal_type ri0 = lclrow2idx[lr];
2099  const local_ordinal_type pi0 = rowidx2part(ri0);
2100  const size_type j0 = local_graph_rowptr(lr);
2101  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2102  const local_ordinal_type lc = local_graph_colidx(j);
2103  const local_ordinal_type lc2r = col2row[lc];
2104  if (lc2r != (local_ordinal_type) -1) {
2105  const local_ordinal_type ri = lclrow2idx[lc2r];
2106  const local_ordinal_type pi = rowidx2part(ri);
2107  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2108  continue;
2109  }
2110  }
2111  // exclusive scan will be performed later
2112  if (!overlap_communication_and_computation || lc < nrows) {
2113  ++R_rowptr(lr);
2114  } else {
2115  ++R_rowptr_remote(lr);
2116  }
2117  }
2118  });
2119  }
2120 
2121  // exclusive scan
2123  {
2124  Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
2125  Kokkos::parallel_scan
2126  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2127  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
2128  update_type &update,
2129  const bool &final) {
2130  update_type val;
2131  val.v[0] = R_rowptr(lr);
2132  if (overlap_communication_and_computation)
2133  val.v[1] = R_rowptr_remote(lr);
2134 
2135  if (final) {
2136  R_rowptr(lr) = update.v[0];
2137  if (overlap_communication_and_computation)
2138  R_rowptr_remote(lr) = update.v[1];
2139 
2140  if (lr < nrows) {
2141  const local_ordinal_type ri0 = lclrow2idx[lr];
2142  const local_ordinal_type pi0 = rowidx2part(ri0);
2143 
2144  size_type cnt_rowptr = R_rowptr(lr);
2145  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
2146 
2147  const size_type j0 = local_graph_rowptr(lr);
2148  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2149  const local_ordinal_type lc = local_graph_colidx(j);
2150  const local_ordinal_type lc2r = col2row[lc];
2151  if (lc2r != (local_ordinal_type) -1) {
2152  const local_ordinal_type ri = lclrow2idx[lc2r];
2153  const local_ordinal_type pi = rowidx2part(ri);
2154  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2155  continue;
2156  }
2157  const local_ordinal_type row_entry = j - j0;
2158  if (!overlap_communication_and_computation || lc < nrows)
2159  R_A_colindsub(cnt_rowptr++) = row_entry;
2160  else
2161  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2162  }
2163  }
2164  }
2165  update += val;
2166  });
2167  }
2168  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
2169  Kokkos::deep_copy(amd.rowptr, R_rowptr);
2170  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2171  if (overlap_communication_and_computation) {
2172  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
2173  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2174  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2175  }
2176 
2177  // Allocate or view values.
2178  if (hasBlockCrsMatrix)
2179  amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst());
2180  else {
2181  amd.tpetra_values = (const_cast<crs_matrix_type*>(A_crs.get()))->getLocalValuesDevice (Tpetra::Access::ReadWrite);
2182  }
2183  }
2184 
2185  // Allocate view for E and initialize the values with B:
2186 
2187  if (interf.n_subparts_per_part > 1)
2188  btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2189  }
2190  // Precompute offsets of each A and x entry to speed up residual.
2191  // Applies if all of these are true:
2192  // - hasBlockCrsMatrix
2193  // - execution_space is a GPU
2194  // - !useSeqMethod (since this uses a different scheme for indexing A,x)
2195  //
2196  // Reading A, x take up to 4 and 6 levels of indirection respectively,
2197  // but precomputing the offsets reduces it to 2 for both (get index, then value)
2198  if(BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix)
2199  {
2200  bool is_async_importer_active = !async_importer.is_null();
2201  local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2202  bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2203  BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2204  }
2205 
2206  // If using fused block Jacobi path, allocate diagonal inverses here (d_inv) and find diagonal offsets.
2207  if(use_fused_jacobi) {
2208  btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize);
2209  auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2210  auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2211  btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2212  }
2213  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2214  }
2215 
2216 
/// Primary template that selects, per active execution memory space, the
/// KokkosBatched mode/algorithm and a recommended team size used when
/// extracting and factorizing the block tridiagonals. It is declared only;
/// each supported memory space provides an explicit specialization.
template<typename ArgActiveExecutionMemorySpace>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
2222 
2223  template<>
2224  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2225  typedef KB::Mode::Serial mode_type;
2226 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2227  typedef KB::Algo::Level3::CompactMKL algo_type;
2228 #else
2229  typedef KB::Algo::Level3::Blocked algo_type;
2230 #endif
2231  static int recommended_team_size(const int /* blksize */,
2232  const int /* vector_length */,
2233  const int /* internal_vector_length */) {
2234  return 1;
2235  }
2236 
2237  };
2238 
2239 #if defined(KOKKOS_ENABLE_CUDA)
/// Recommended team size for the CUDA extract-and-factorize kernels.
///
/// A thread budget is picked from the block size (32 for blksize <= 9, 96 for
/// blksize <= 12, 128 for blksize <= 16, 160 above that), doubled, and then
/// divided by vector_length / internal_vector_length. Everything is integer
/// arithmetic, so internal_vector_length must be nonzero and no larger than
/// vector_length.
static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
                                                             const int vector_length,
                                                             const int internal_vector_length) {
  const int vectors_per_pack = vector_length / internal_vector_length;
  // The <=5 and <=9 tiers share 32 (original tuning note on the <=9 tier:
  // "64") and the <=20 and >20 tiers share 160, so three thresholds suffice.
  const int thread_budget = (blksize <= 9)  ? 32
                          : (blksize <= 12) ? 96
                          : (blksize <= 16) ? 128
                          :                   160;
  return 2 * thread_budget / vectors_per_pack;
}
2253  template<>
2254  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2255  typedef KB::Mode::Team mode_type;
2256  typedef KB::Algo::Level3::Unblocked algo_type;
2257  static int recommended_team_size(const int blksize,
2258  const int vector_length,
2259  const int internal_vector_length) {
2260  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2261  }
2262  };
2263  template<>
2264  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2265  typedef KB::Mode::Team mode_type;
2266  typedef KB::Algo::Level3::Unblocked algo_type;
2267  static int recommended_team_size(const int blksize,
2268  const int vector_length,
2269  const int internal_vector_length) {
2270  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2271  }
2272  };
2273 #endif
2274 
2275 #if defined(KOKKOS_ENABLE_HIP)
/// Recommended team size for the HIP extract-and-factorize kernels.
///
/// A thread budget is picked from the block size (32 for blksize <= 9, 96 for
/// blksize <= 12, 128 for blksize <= 16, 160 above that), doubled, and then
/// divided by vector_length / internal_vector_length. Everything is integer
/// arithmetic, so internal_vector_length must be nonzero and no larger than
/// vector_length.
static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
                                                            const int vector_length,
                                                            const int internal_vector_length) {
  const int vectors_per_pack = vector_length / internal_vector_length;
  // The <=5 and <=9 tiers share 32 (original tuning note on the <=9 tier:
  // "64") and the <=20 and >20 tiers share 160, so three thresholds suffice.
  const int thread_budget = (blksize <= 9)  ? 32
                          : (blksize <= 12) ? 96
                          : (blksize <= 16) ? 128
                          :                   160;
  return 2 * thread_budget / vectors_per_pack;
}
2289  template<>
2290  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2291  typedef KB::Mode::Team mode_type;
2292  typedef KB::Algo::Level3::Unblocked algo_type;
2293  static int recommended_team_size(const int blksize,
2294  const int vector_length,
2295  const int internal_vector_length) {
2296  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2297  }
2298  };
2299  template<>
2300  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2301  typedef KB::Mode::Team mode_type;
2302  typedef KB::Algo::Level3::Unblocked algo_type;
2303  static int recommended_team_size(const int blksize,
2304  const int vector_length,
2305  const int internal_vector_length) {
2306  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2307  }
2308  };
2309 #endif
2310 
2311 #if defined(KOKKOS_ENABLE_SYCL)
/// Recommended team size for the SYCL extract-and-factorize kernels.
///
/// A thread budget is picked from the block size (32 for blksize <= 9, 96 for
/// blksize <= 12, 128 for blksize <= 16, 160 above that), doubled, and then
/// divided by vector_length / internal_vector_length. Everything is integer
/// arithmetic, so internal_vector_length must be nonzero and no larger than
/// vector_length.
static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
                                                             const int vector_length,
                                                             const int internal_vector_length) {
  const int vectors_per_pack = vector_length / internal_vector_length;
  // The <=5 and <=9 tiers share 32 (original tuning note on the <=9 tier:
  // "64") and the <=20 and >20 tiers share 160, so three thresholds suffice.
  const int thread_budget = (blksize <= 9)  ? 32
                          : (blksize <= 12) ? 96
                          : (blksize <= 16) ? 128
                          :                   160;
  return 2 * thread_budget / vectors_per_pack;
}
2325  template<>
2326  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2327  typedef KB::Mode::Team mode_type;
2328  typedef KB::Algo::Level3::Unblocked algo_type;
2329  static int recommended_team_size(const int blksize,
2330  const int vector_length,
2331  const int internal_vector_length) {
2332  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2333  }
2334  };
2335  template<>
2336  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2337  typedef KB::Mode::Team mode_type;
2338  typedef KB::Algo::Level3::Unblocked algo_type;
2339  static int recommended_team_size(const int blksize,
2340  const int vector_length,
2341  const int internal_vector_length) {
2342  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2343  }
2344  };
2345 #endif
2346 
2347  template<typename impl_type, typename WWViewType>
2348  KOKKOS_INLINE_FUNCTION
2349  void
2350  solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2351  const typename impl_type::local_ordinal_type &/* blocksize */,
2352  const typename impl_type::local_ordinal_type &i0,
2353  const typename impl_type::local_ordinal_type &r0,
2354  const typename impl_type::local_ordinal_type &nrows,
2355  const typename impl_type::local_ordinal_type &v,
2356  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2357  const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2358  const WWViewType &WW,
2359  const bool skip_first_pass=false) {
2360  using execution_space = typename impl_type::execution_space;
2361  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2362  using member_type = typename team_policy_type::member_type;
2363  using local_ordinal_type = typename impl_type::local_ordinal_type;
2364 
2365  typedef SolveTridiagsDefaultModeAndAlgo
2366  <typename execution_space::memory_space> default_mode_and_algo_type;
2367 
2368  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2369  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2370 
2371  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2372 
2373  // constant
2374  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2375  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2376 
2377  // subview pattern
2378  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2379  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2380  auto X2 = X1;
2381 
2382  local_ordinal_type i = i0, r = r0;
2383 
2384 
2385  if (nrows > 1) {
2386  // solve Lx = x
2387  if (skip_first_pass) {
2388  i += (nrows-2) * 3;
2389  r += (nrows-2);
2390  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2391  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2392  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2393  KB::Trsm<member_type,
2394  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2395  default_mode_type,default_algo_type>
2396  ::invoke(member, one, A, X2);
2397  X1.assign_data( X2.data() );
2398  i+=3;
2399  }
2400  else {
2401  KB::Trsm<member_type,
2402  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2403  default_mode_type,default_algo_type>
2404  ::invoke(member, one, A, X1);
2405  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2406  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2407  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2408  member.team_barrier();
2409  KB::Gemm<member_type,
2410  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2411  default_mode_type,default_algo_type>
2412  ::invoke(member, -one, A, X1, one, X2);
2413  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2414  KB::Trsm<member_type,
2415  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2416  default_mode_type,default_algo_type>
2417  ::invoke(member, one, A, X2);
2418  X1.assign_data( X2.data() );
2419  }
2420  }
2421 
2422  // solve Ux = x
2423  KB::Trsm<member_type,
2424  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2425  default_mode_type,default_algo_type>
2426  ::invoke(member, one, A, X1);
2427  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2428  i -= 3;
2429  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2430  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2431  member.team_barrier();
2432  KB::Gemm<member_type,
2433  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2434  default_mode_type,default_algo_type>
2435  ::invoke(member, -one, A, X1, one, X2);
2436 
2437  A.assign_data( &D_internal_vector_values(i,0,0,v) );
2438  KB::Trsm<member_type,
2439  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2440  default_mode_type,default_algo_type>
2441  ::invoke(member, one, A, X2);
2442  X1.assign_data( X2.data() );
2443  }
2444  } else {
2445  // matrix is already inverted
2446  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2447  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2448  ::invoke(member, X1, W);
2449  member.team_barrier();
2450  KB::Gemm<member_type,
2451  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2452  default_mode_type,default_algo_type>
2453  ::invoke(member, one, A, W, zero, X1);
2454  }
2455 
2456  }
2457 
2458  template<typename impl_type, typename WWViewType, typename XViewType>
2459  KOKKOS_INLINE_FUNCTION
2460  void
2461  solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2462  const typename impl_type::local_ordinal_type &blocksize,
2463  const typename impl_type::local_ordinal_type &i0,
2464  const typename impl_type::local_ordinal_type &r0,
2465  const typename impl_type::local_ordinal_type &nrows,
2466  const typename impl_type::local_ordinal_type &v,
2467  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2468  const XViewType &X_internal_vector_values, //Unmanaged<typename impl_type::internal_vector_type_4d_view>
2469  const WWViewType &WW) {
2470  using execution_space = typename impl_type::execution_space;
2471  //using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2472  //using member_type = typename team_policy_type::member_type;
2473  using local_ordinal_type = typename impl_type::local_ordinal_type;
2474 
2475  typedef SolveTridiagsDefaultModeAndAlgo
2476  <typename execution_space::memory_space> default_mode_and_algo_type;
2477 
2478  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2479  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2480 
2481  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2482 
2483  // base pointers
2484  auto A = D_internal_vector_values.data();
2485  auto X = X_internal_vector_values.data();
2486 
2487  // constant
2488  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2489  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2490  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2491 
2492  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2493  const local_ordinal_type astep = D_internal_vector_values.stride_0();
2494  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2495  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2496  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2497  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2498 
2499  // move to starting point
2500  A += i0*astep + v;
2501  X += r0*xstep + v;
2502 
2503  //for (local_ordinal_type col=0;col<num_vectors;++col)
2504  if (nrows > 1) {
2505  // solve Lx = x
2506  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2507  (default_mode_type,default_algo_type,
2508  member,
2509  KB::Diag::Unit,
2510  blocksize,blocksize,
2511  one,
2512  A, as0, as1,
2513  X, xs0);
2514 
2515  for (local_ordinal_type tr=1;tr<nrows;++tr) {
2516  member.team_barrier();
2517  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2518  (default_mode_type,default_algo_type,
2519  member,
2520  blocksize, blocksize,
2521  -one,
2522  A+2*astep, as0, as1,
2523  X, xs0,
2524  one,
2525  X+1*xstep, xs0);
2526  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2527  (default_mode_type,default_algo_type,
2528  member,
2529  KB::Diag::Unit,
2530  blocksize,blocksize,
2531  one,
2532  A+3*astep, as0, as1,
2533  X+1*xstep, xs0);
2534 
2535  A += 3*astep;
2536  X += 1*xstep;
2537  }
2538 
2539  // solve Ux = x
2540  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2541  (default_mode_type,default_algo_type,
2542  member,
2543  KB::Diag::NonUnit,
2544  blocksize, blocksize,
2545  one,
2546  A, as0, as1,
2547  X, xs0);
2548 
2549  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2550  A -= 3*astep;
2551  member.team_barrier();
2552  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2553  (default_mode_type,default_algo_type,
2554  member,
2555  blocksize, blocksize,
2556  -one,
2557  A+1*astep, as0, as1,
2558  X, xs0,
2559  one,
2560  X-1*xstep, xs0);
2561  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2562  (default_mode_type,default_algo_type,
2563  member,
2564  KB::Diag::NonUnit,
2565  blocksize, blocksize,
2566  one,
2567  A, as0, as1,
2568  X-1*xstep,xs0);
2569  X -= 1*xstep;
2570  }
2571  // for multiple rhs
2572  //X += xs1;
2573  } else {
2574  const local_ordinal_type ws0 = WW.stride_0();
2575  auto W = WW.data() + v;
2576  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2577  (default_mode_type,
2578  member, blocksize, X, xs0, W, ws0);
2579  member.team_barrier();
2580  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2581  (default_mode_type,default_algo_type,
2582  member,
2583  blocksize, blocksize,
2584  one,
2585  A, as0, as1,
2586  W, xs0,
2587  zero,
2588  X, xs0);
2589  }
2590  }
2591 
/// Debug helper: dump packed block-tridiagonal values to a Matrix Market
/// coordinate file. Compiles to a no-op unless
/// IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM is defined.
///
/// The view is indexed as (block, row-in-block, col-in-block, part-in-pack).
/// Within a part, block index b is written at block row/column
/// (b/3, b/3) when b%3 == 0, (b/3, b/3+1) when b%3 == 1 and (b/3+1, b/3)
/// when b%3 == 2; row and column numbers in the file are 1-based.
///
/// \param n_parts               number of parts; nothing is written if <= 0.
/// \param scalar_values_device  rank-4 view of values; a host copy is made.
/// \param fileName              path of the file to create/overwrite.
template<typename local_ordinal_type, typename ViewType>
void writeBTDValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);

  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
  // Nothing to write; this also protects the integer divisions below.
  if (n_parts <= 0 || n_parts_per_pack <= 0)
    return;

  std::ofstream myfile;
  myfile.open (fileName);
  if (!myfile.is_open())
    return;

  const local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;

  const local_ordinal_type block_size = scalar_values.extent(1);

  const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size;
  const local_ordinal_type n_rows = n_rows_per_part*n_parts;

  // Exact integer ceiling of n_parts / n_parts_per_pack.
  const local_ordinal_type n_packs = (n_parts + n_parts_per_pack - 1) / n_parts_per_pack;

  myfile << "%%MatrixMarket matrix coordinate real general" << '\n';
  myfile << "%%nnz = " << nnz;
  myfile << " block size = " << block_size;
  myfile << " number of blocks = " << n_blocks;
  myfile << " number of parts = " << n_parts;
  myfile << " number of blocks per part = " << n_blocks_per_part;
  myfile << " number of rows = " << n_rows ;
  myfile << " number of cols = " << n_rows;
  myfile << " number of packs = " << n_packs << '\n';

  myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << '\n';

  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
    for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
      const local_ordinal_type current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
      const local_ordinal_type part_offset = current_part_idx * n_rows_per_part;
      for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
        const local_ordinal_type current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
        if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
          continue;
        // Position of this block inside the part's block-tridiagonal matrix.
        const local_ordinal_type block_row = i_block_in_part/3;
        const local_ordinal_type block_kind = i_block_in_part%3;
        const local_ordinal_type row_block = (block_kind == 2) ? block_row + 1 : block_row;
        const local_ordinal_type col_block = (block_kind == 1) ? block_row + 1 : block_row;
        const local_ordinal_type current_row_offset = row_block * block_size + part_offset;
        const local_ordinal_type current_col_offset = col_block * block_size + part_offset;
        for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
          for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
            const local_ordinal_type current_row = current_row_offset + i_in_block + 1;
            const local_ordinal_type current_col = current_col_offset + j_in_block + 1;
            myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << '\n';
          }
        }
      }
    }
  }

  myfile.close();
#else
  // Debug output disabled: keep the parameters referenced to avoid warnings.
  (void) n_parts;
  (void) scalar_values_device;
  (void) fileName;
#endif
}
2659 
/// Debug helper: dump a packed multivector to a Matrix Market array file.
/// Compiles to a no-op unless IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM is defined.
///
/// The view is indexed as (block, row-in-block, column, part-in-pack). Values
/// are written one per line, column by column; within a column the order is
/// pack, part-in-pack, block-in-part, row-in-block.
///
/// \param n_parts               number of parts; nothing is written if <= 0.
/// \param scalar_values_device  rank-4 view of values; a host copy is made.
/// \param fileName              path of the file to create/overwrite.
template<typename local_ordinal_type, typename ViewType>
void write4DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);

  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
  // Nothing to write; this also protects the integer divisions below.
  if (n_parts <= 0 || n_parts_per_pack <= 0)
    return;

  std::ofstream myfile;
  myfile.open (fileName);
  if (!myfile.is_open())
    return;

  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;

  const local_ordinal_type block_size = scalar_values.extent(1);
  const local_ordinal_type n_cols = scalar_values.extent(2);

  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
  const local_ordinal_type n_rows = n_rows_per_part*n_parts;

  // Exact integer ceiling of n_parts / n_parts_per_pack.
  const local_ordinal_type n_packs = (n_parts + n_parts_per_pack - 1) / n_parts_per_pack;

  myfile << "%%MatrixMarket matrix array real general" << '\n';
  myfile << "%%block size = " << block_size;
  myfile << " number of blocks = " << n_blocks;
  myfile << " number of parts = " << n_parts;
  myfile << " number of blocks per part = " << n_blocks_per_part;
  myfile << " number of rows = " << n_rows ;
  myfile << " number of cols = " << n_cols;
  myfile << " number of packs = " << n_packs << '\n';

  myfile << n_rows << " " << n_cols << std::setprecision(9) << '\n';

  for (local_ordinal_type j_in_block=0;j_in_block<n_cols;++j_in_block) {
    for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
      for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
        for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
          const local_ordinal_type current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;

          if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
            continue;
          for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
            myfile << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << '\n';
          }
        }
      }
    }
  }
  myfile.close();
#else
  // Debug output disabled: keep the parameters referenced to avoid warnings.
  (void) n_parts;
  (void) scalar_values_device;
  (void) fileName;
#endif
}
2713 
/// Debug helper: dump a packed rank-5 block multivector to a Matrix Market
/// array file. Compiles to a no-op unless
/// IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM is defined.
///
/// The view is indexed as (block column, block, row-in-block, col-in-block,
/// part-in-pack). Values are written one per line; the loop order from
/// outermost to innermost is block column, col-in-block, pack, part-in-pack,
/// block-in-part, row-in-block.
///
/// \param n_parts               number of parts; nothing is written if <= 0.
/// \param scalar_values_device  rank-5 view of values; a host copy is made.
/// \param fileName              path of the file to create/overwrite.
template<typename local_ordinal_type, typename ViewType>
void write5DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);

  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
  // Nothing to write; this also protects the integer divisions below.
  if (n_parts <= 0 || n_parts_per_pack <= 0)
    return;

  std::ofstream myfile;
  myfile.open (fileName);
  if (!myfile.is_open())
    return;

  const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack;
  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;

  const local_ordinal_type block_size = scalar_values.extent(2);
  const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
  const local_ordinal_type n_cols = n_blocks_cols * block_size;

  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
  const local_ordinal_type n_rows = n_rows_per_part*n_parts;

  // Exact integer ceiling of n_parts / n_parts_per_pack.
  const local_ordinal_type n_packs = (n_parts + n_parts_per_pack - 1) / n_parts_per_pack;

  myfile << "%%MatrixMarket matrix array real general" << '\n';
  myfile << "%%block size = " << block_size;
  myfile << " number of blocks = " << n_blocks;
  myfile << " number of parts = " << n_parts;
  myfile << " number of blocks per part = " << n_blocks_per_part;
  myfile << " number of rows = " << n_rows ;
  myfile << " number of cols = " << n_cols;
  myfile << " number of packs = " << n_packs << '\n';

  myfile << n_rows << " " << n_cols << std::setprecision(9) << '\n';

  for (local_ordinal_type i_block_col=0;i_block_col<n_blocks_cols;++i_block_col) {
    for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
      for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
        for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
          for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
            const local_ordinal_type current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;

            if (current_block_idx >= (local_ordinal_type) scalar_values.extent(1))
              continue;
            for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
              myfile << scalar_values(i_block_col,current_block_idx,i_in_block,j_in_block,i_part_in_pack) << '\n';
            }
          }
        }
      }
    }
  }
  myfile.close();
#else
  // Debug output disabled: keep the parameters referenced to avoid warnings.
  (void) n_parts;
  (void) scalar_values_device;
  (void) fileName;
#endif
}
2769 
2770  template<typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2771  KOKKOS_INLINE_FUNCTION
2772  void
2773  copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2774 /*
2775  // Kokkos::Experimental::local_deep_copy
2776  auto teamVectorRange =
2777  Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2778  member, view1.extent(0), view1.extent(1), view1.extent(2));
2779 
2780  Kokkos::parallel_for
2781  (teamVectorRange,
2782  [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2783  view1(i,j,k) = view2(i,j,k);
2784  });
2785 */
2786  Kokkos::Experimental::local_deep_copy(member, view1, view2);
2787  }
2788  template<typename MatrixType, int ScratchLevel>
2789  struct ExtractAndFactorizeTridiags {
2790  public:
2791  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2792  // a functor cannot have both device_type and execution_space; specialization error in kokkos
2793  using execution_space = typename impl_type::execution_space;
2794  using memory_space = typename impl_type::memory_space;
2796  using local_ordinal_type = typename impl_type::local_ordinal_type;
2797  using size_type = typename impl_type::size_type;
2798  using impl_scalar_type = typename impl_type::impl_scalar_type;
2799  using magnitude_type = typename impl_type::magnitude_type;
2801  using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
2802  using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
2804  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2805  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2806  using size_type_1d_view = typename impl_type::size_type_1d_view;
2807  using size_type_2d_view = typename impl_type::size_type_2d_view;
2808  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2810  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2811  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2812  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2813  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2814  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2815  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2816  using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view;
2817  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
2818  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2819  using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2820  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2821  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2822  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2823  using local_crs_graph_type = typename impl_type::local_crs_graph_type;
2824  using colinds_view = typename local_crs_graph_type::entries_type;
2825 
2826  using internal_vector_type = typename impl_type::internal_vector_type;
2827  static constexpr int vector_length = impl_type::vector_length;
2828  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2829  static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
// Ensures vector_length / internal_vector_length is exact; that quotient is stored in
// vector_loop_size and used as the last extent of the internal-vector views (see constructor).
2830  static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2831  // half_vector_length is used for block Jacobi factorization.
2832  // Shared memory requirement is twice as large (per vector lane) as for general tridi factorization, so
2833  // reducing vector length (if possible) keeps the shared requirement constant. This avoids the performance
2834  // cliff of switching from level 0 to level 1 scratch.
2835  static constexpr int half_vector_length = impl_type::half_vector_length;
2836 
// Team policy and the team-member handle type received by every operator() overload of this functor.
2838  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2839  using member_type = typename team_policy_type::member_type;
2840 
2841  private:
2842  // part interface
// All part-interface views and max_partsz are copied from the PartInterface given to the constructor.
2843  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2844  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2845  const local_ordinal_type max_partsz;
2846  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
// These three are non-const because they are assigned in the constructor body rather than in
// the initializer list. A_point_rowptr is only assigned when the matrix is not a BlockCrsMatrix.
2847  using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
2848  ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2849  ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2850  ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2851  // block tridiags
// internal_vector_* and scalar_* views are constructed over the same storage pointers
// (btdm_.values, btdm_.values_schur, btdm_.e_values); they differ only in element type and
// in the last extent (vector_length/internal_vector_length vs. vector_length).
2852  const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2853  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2854  const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2855  const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2856  const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2857  const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
// d_inv receives the inverted blocks written by the fused-Jacobi kernel; diag_offsets gives the
// offset into A_values from which that kernel reads each row's block.
2858  const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2859  const Unmanaged<size_type_1d_view> diag_offsets;
2860  // shared information
// Keep blocksize declared before blocksize_square: the constructor initializes
// blocksize_square from blocksize, and members are initialized in declaration order.
2861  const local_ordinal_type blocksize, blocksize_square;
2862  // diagonal safety
// tiny is forwarded to every KB::LU::invoke call in this functor.
2863  const magnitude_type tiny;
// Number of internal vectors per packed vector (vector_length / internal_vector_length).
2864  const local_ordinal_type vector_loop_size;
2865 
// Set in the constructor: true when the input matrix dynamic-casts to the block CRS matrix type.
2866  bool hasBlockCrsMatrix;
2867 
2868  public:
// Constructs the functor: captures views from the part interface (interf_) and the block
// tridiagonal storage (btdm_), stores the pivot-safety value tiny_, and then binds the row
// pointers and value array of the input matrix.
// NOTE(review): the body below reads A_ and G_, but neither is declared in the parameter list
// visible here (the embedded listing numbers jump from 2870 to 2873). Two parameter lines appear
// to be missing from this listing -- confirm against the real header before editing.
2869  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2870  const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2873  const magnitude_type& tiny_) :
2874  // interface
2875  partptr(interf_.partptr),
2876  lclrow(interf_.lclrow),
2877  packptr(interf_.packptr),
2878  packindices_sub(interf_.packindices_sub),
2879  packptr_sub(interf_.packptr_sub),
2880  partptr_sub(interf_.partptr_sub),
2881  part2packrowidx0_sub(interf_.part2packrowidx0_sub),
2882  packindices_schur(interf_.packindices_schur),
2883  max_partsz(interf_.max_partsz),
2884  // block tridiags
2885  pack_td_ptr(btdm_.pack_td_ptr),
2886  flat_td_ptr(btdm_.flat_td_ptr),
2887  pack_td_ptr_schur(btdm_.pack_td_ptr_schur),
2888  A_colindsub(btdm_.A_colindsub),
// The internal_vector_* views reinterpret the btdm_ storage as internal_vector_type, keeping
// the first extents and using vector_length/internal_vector_length as the last extent.
2889  internal_vector_values((internal_vector_type*)btdm_.values.data(),
2890  btdm_.values.extent(0),
2891  btdm_.values.extent(1),
2892  btdm_.values.extent(2),
2893  vector_length/internal_vector_length),
2894  internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(),
2895  btdm_.values_schur.extent(0),
2896  btdm_.values_schur.extent(1),
2897  btdm_.values_schur.extent(2),
2898  vector_length/internal_vector_length),
2899  e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(),
2900  btdm_.e_values.extent(0),
2901  btdm_.e_values.extent(1),
2902  btdm_.e_values.extent(2),
2903  btdm_.e_values.extent(3),
2904  vector_length/internal_vector_length),
// The scalar_* views alias the same three storage pointers as plain btdm_scalar_type, with
// vector_length as the last extent.
2905  scalar_values((btdm_scalar_type*)btdm_.values.data(),
2906  btdm_.values.extent(0),
2907  btdm_.values.extent(1),
2908  btdm_.values.extent(2),
2909  vector_length),
2910  scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(),
2911  btdm_.values_schur.extent(0),
2912  btdm_.values_schur.extent(1),
2913  btdm_.values_schur.extent(2),
2914  vector_length),
2915  e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(),
2916  btdm_.e_values.extent(0),
2917  btdm_.e_values.extent(1),
2918  btdm_.e_values.extent(2),
2919  btdm_.e_values.extent(3),
2920  vector_length),
2921  d_inv(btdm_.d_inv),
2922  diag_offsets(btdm_.diag_offsets),
// The block size is taken from extent(1) of the packed values view.
2923  blocksize(btdm_.values.extent(1)),
2924  blocksize_square(blocksize*blocksize),
2925  // diagonal weight to avoid zero pivots
2926  tiny(tiny_),
2927  vector_loop_size(vector_length/internal_vector_length) {
2928  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
2929  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
2930 
// Try both concrete matrix types; which cast succeeds decides how values are addressed later.
2931  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
2932  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
2933 
2934  hasBlockCrsMatrix = ! A_bcrs.is_null ();
2935 
// Block row pointers always come from the graph G_.
2936  A_block_rowptr = G_->getLocalGraphDevice().row_map;
2937  if (hasBlockCrsMatrix) {
// const_cast is used only to reach the non-const accessor; the result is stored in a
// ConstUnmanaged view.
2938  A_values = const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst();
2939  }
2940  else {
// Point CRS path: the point-row pointers are also needed to locate scalar entries.
// NOTE(review): A_crs is dereferenced without a null check; presumably A_ is always one of
// the two matrix types -- confirm with callers.
2941  A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
2942  A_values = A_crs->getLocalValuesDevice (Tpetra::Access::ReadOnly);
2943  }
2944  }
2945 
2946  private:
2947 
// Serial extraction of one pack: copies block entries of A into the packed tridiagonal storage.
// Called by the kernels below when vector_loop_size == 1.
//   partidx          index of the first part of the pack (advanced locally while gathering).
//   local_subpartidx sub-part index; odd values widen the row range by one row on each side.
//   npacks           number of parts in the pack; taken by value because it is reduced inside
//                    the loop once later lanes have no more entries to extract.
2948  KOKKOS_INLINE_FUNCTION
2949  void
2950  extract(local_ordinal_type partidx,
2951  local_ordinal_type local_subpartidx,
2952  local_ordinal_type npacks) const {
2953 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2954  printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
2955 #endif
2956  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
// kps: first packed-storage index of this (part, sub-part); read before partidx is advanced.
2957  const size_type kps = pack_td_ptr(partidx, local_subpartidx);
// Per-lane offsets: kfs into A_colindsub, ri0 first row (index into lclrow), nrows row count.
2958  local_ordinal_type kfs[vector_length] = {};
2959  local_ordinal_type ri0[vector_length] = {};
2960  local_ordinal_type nrows[vector_length] = {};
2961 
2962  for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2963  kfs[vi] = flat_td_ptr(partidx,local_subpartidx);
2964  ri0[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,0);
2965  nrows[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,1) - ri0[vi];
2966 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2967  printf("kfs[%d] = %d;\n", vi, kfs[vi]);
2968  printf("ri0[%d] = %d;\n", vi, ri0[vi]);
2969  printf("nrows[%d] = %d;\n", vi, nrows[vi]);
2970 #endif
2971  }
// The row range is driven by lane 0; odd sub-parts also visit rows -1 and nrows[0].
2972  local_ordinal_type tr_min = 0;
2973  local_ordinal_type tr_max = nrows[0];
2974  if (local_subpartidx % 2 == 1) {
2975  tr_min -= 1;
2976  tr_max += 1;
2977  }
2978 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2979  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2980 #endif
// j counts the entries extracted so far; it offsets both A_colindsub (from kfs) and the
// packed storage (from kps). e enumerates up to three entries per row; the breaks below
// decide how many are actually taken.
2981  for (local_ordinal_type tr=tr_min,j=0;tr<tr_max;++tr) {
2982  for (local_ordinal_type e=0;e<3;++e) {
2983  if (hasBlockCrsMatrix) {
// Block CRS path: locate each lane's source block, then interleave the lanes element by
// element into internal vector 0 of packed entry pi.
2984  const impl_scalar_type* block[vector_length] = {};
2985  for (local_ordinal_type vi=0;vi<npacks;++vi) {
2986  const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2987 
2988  block[vi] = &A_values(Aj*blocksize_square);
2989  }
2990  const size_type pi = kps + j;
2991 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2992  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
2993 #endif
2994  ++j;
2995  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2996  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2997  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2998  auto& v = internal_vector_values(pi, ii, jj, 0);
2999  for (local_ordinal_type vi=0;vi<npacks;++vi) {
3000  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
3001  }
3002  }
3003  }
3004  }
3005  else {
// Point CRS path: address scalars through the point row pointer of each row inside the
// block row and write them through the scalar view, lane vi.
3006  const size_type pi = kps + j;
3007 
3008  for (local_ordinal_type vi=0;vi<npacks;++vi) {
3009  const size_type Aj_c = A_colindsub(kfs[vi] + j);
3010 
3011  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
3012  auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr)*blocksize + ii);
3013 
3014  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3015  scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c*blocksize + jj);
3016  }
3017  }
3018  }
3019  ++j;
3020  }
// A single-row part contributes exactly one entry per row.
3021  if (nrows[0] == 1) break;
3022  if (local_subpartidx % 2 == 0) {
// Even sub-part: the first and last rows stop after two entries (e == 0, 1).
3023  if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
// Drop lanes whose part has run out of entries; from here on only lanes [0, vi) are extracted.
3024  for (local_ordinal_type vi=1;vi<npacks;++vi) {
3025  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
3026  npacks = vi;
3027  break;
3028  }
3029  }
3030  }
3031  else {
// Odd sub-part: the extra rows (tr == -1 and tr == nrows[0]) contribute one entry each.
3032  if (e == 0 && (tr == -1 || tr == nrows[0])) break;
3033  for (local_ordinal_type vi=1;vi<npacks;++vi) {
3034  if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3035  npacks = vi;
3036  break;
3037  }
3038  }
3039  }
3040  }
3041  }
3042  }
3043 
3044  KOKKOS_INLINE_FUNCTION
3045  void
3046  extract(const member_type &member,
3047  const local_ordinal_type &partidxbeg,
3048  local_ordinal_type local_subpartidx,
3049  const local_ordinal_type &npacks,
3050  const local_ordinal_type &vbeg) const {
3051 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3052  printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3053 #endif
3054  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3055  local_ordinal_type kfs_vals[internal_vector_length] = {};
3056  local_ordinal_type ri0_vals[internal_vector_length] = {};
3057  local_ordinal_type nrows_vals[internal_vector_length] = {};
3058 
3059  const size_type kps = pack_td_ptr(partidxbeg,local_subpartidx);
3060  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3061  kfs_vals[vi] = flat_td_ptr(partidxbeg+vi,local_subpartidx);
3062  ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,0);
3063  nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,1) - ri0_vals[vi];
3064 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3065  printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3066  printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3067  printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3068 #endif
3069  }
3070 
3071  local_ordinal_type j_vals[internal_vector_length] = {};
3072 
3073  local_ordinal_type tr_min = 0;
3074  local_ordinal_type tr_max = nrows_vals[0];
3075  if (local_subpartidx % 2 == 1) {
3076  tr_min -= 1;
3077  tr_max += 1;
3078  }
3079 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3080  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3081 #endif
3082  for (local_ordinal_type tr=tr_min;tr<tr_max;++tr) {
3083  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3084  const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
3085  if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows+1)) {
3086  auto &j = j_vals[vi];
3087  const local_ordinal_type kfs = kfs_vals[vi];
3088  const local_ordinal_type ri0 = ri0_vals[vi];
3089  local_ordinal_type lbeg, lend;
3090  if (local_subpartidx % 2 == 0) {
3091  lbeg = (tr == tr_min ? 1 : 0);
3092  lend = (tr == nrows - 1 ? 2 : 3);
3093  }
3094  else {
3095  lbeg = 0;
3096  lend = 3;
3097  if (tr == tr_min) {
3098  lbeg = 1;
3099  lend = 2;
3100  }
3101  else if (tr == nrows) {
3102  lbeg = 0;
3103  lend = 1;
3104  }
3105  }
3106  if (hasBlockCrsMatrix) {
3107  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3108  const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3109  const impl_scalar_type* block = &A_values(Aj*blocksize_square);
3110  const size_type pi = kps + j;
3111 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3112  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3113 #endif
3114  Kokkos::parallel_for
3115  (Kokkos::TeamThreadRange(member,blocksize),
3116  [&](const local_ordinal_type &ii) {
3117  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3118  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
3119  }
3120  });
3121  }
3122  }
3123  else {
3124  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3125  const size_type Aj_c = A_colindsub(kfs + j);
3126  const size_type pi = kps + j;
3127  Kokkos::parallel_for
3128  (Kokkos::TeamThreadRange(member,blocksize),
3129  [&](const local_ordinal_type &ii) {
3130  auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr)*blocksize + ii);
3131  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3132  scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c*blocksize + jj);
3133  }
3134  });
3135  }
3136  }
3137  }
3138  }
3139  }
3140  }
3141 
3142  template<typename AAViewType,
3143  typename WWViewType>
3144  KOKKOS_INLINE_FUNCTION
3145  void
3146  factorize_subline(const member_type &member,
3147  const local_ordinal_type &i0,
3148  const local_ordinal_type &nrows,
3149  const local_ordinal_type &v,
3150  const AAViewType &AA,
3151  const WWViewType &WW) const {
3152 
3153  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3154  <typename execution_space::memory_space> default_mode_and_algo_type;
3155 
3156  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3157  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3158 
3159  // constant
3160  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3161 
3162 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3163  printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3164 #endif
3165 
3166  // subview pattern
3167  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3168  KB::LU<member_type,
3169  default_mode_type,KB::Algo::LU::Unblocked>
3170  ::invoke(member, A , tiny);
3171 
3172  if (nrows > 1) {
3173  auto B = A;
3174  auto C = A;
3175  local_ordinal_type i = i0;
3176  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
3177 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3178  printf("tr = %d, i = %d;\n", tr, i);
3179 #endif
3180  B.assign_data( &AA(i+1,0,0,v) );
3181  KB::Trsm<member_type,
3182  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3183  default_mode_type,default_algo_type>
3184  ::invoke(member, one, A, B);
3185  C.assign_data( &AA(i+2,0,0,v) );
3186  KB::Trsm<member_type,
3187  KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3188  default_mode_type,default_algo_type>
3189  ::invoke(member, one, A, C);
3190  A.assign_data( &AA(i+3,0,0,v) );
3191 
3192  member.team_barrier();
3193  KB::Gemm<member_type,
3194  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3195  default_mode_type,default_algo_type>
3196  ::invoke(member, -one, C, B, one, A);
3197  KB::LU<member_type,
3198  default_mode_type,KB::Algo::LU::Unblocked>
3199  ::invoke(member, A, tiny);
3200  }
3201  } else {
3202  // for block jacobi invert a matrix here
3203  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3204  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
3205  ::invoke(member, A, W);
3206  KB::SetIdentity<member_type,default_mode_type>
3207  ::invoke(member, A);
3208  member.team_barrier();
3209  KB::Trsm<member_type,
3210  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3211  default_mode_type,default_algo_type>
3212  ::invoke(member, one, W, A);
3213  KB::Trsm<member_type,
3214  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3215  default_mode_type,default_algo_type>
3216  ::invoke(member, one, W, A);
3217  }
3218  }
3219 
3220  public:
3221 
// Empty dispatch tags: each one selects the operator() overload below that takes it as its
// first argument.
// Extract a pack from A and factorize its block tridiagonal.
3222  struct ExtractAndFactorizeSubLineTag {};
// Fused block-Jacobi: invert one block per row and store it in d_inv.
3223  struct ExtractAndFactorizeFusedJacobiTag {};
// Extract a pack and copy its first and last packed entries into e_internal_vector_values.
3224  struct ExtractBCDTag {};
// Run solveMultiVector on e_internal_vector_values.
3225  struct ComputeETag {};
// Copy blocks into the Schur storage and apply the S -= C*E updates.
3226  struct ComputeSchurTag {};
// Factorize the Schur storage with factorize_subline.
3227  struct FactorizeSchurTag {};
3228 
3229  KOKKOS_INLINE_FUNCTION
3230  void
3231  operator() (const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3232  // btdm is packed and sorted from largest one
3233  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3234 
3235  const local_ordinal_type subpartidx = packptr_sub(packidx);
3236  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3237  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3238  const local_ordinal_type partidx = subpartidx%n_parts;
3239 
3240  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3241  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3242  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3243 
3244  internal_vector_scratch_type_3d_view
3245  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3246 
3247 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3248  printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3249  printf("vector_loop_size = %d\n", vector_loop_size);
3250 #endif
3251 
3252  if (vector_loop_size == 1) {
3253  extract(partidx, local_subpartidx, npacks);
3254  factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3255  } else {
3256  Kokkos::parallel_for
3257  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3258  [&](const local_ordinal_type &v) {
3259  const local_ordinal_type vbeg = v*internal_vector_length;
3260 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3261  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3262 #endif
3263  if (vbeg < npacks)
3264  extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3265  // this is not safe if vector loop size is different from vector size of
3266  // the team policy. we always make sure this when constructing the team policy
3267  member.team_barrier();
3268  factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3269  });
3270  }
3271  }
3272 
// Kernel for ExtractAndFactorizeFusedJacobiTag: each team handles half_vector_length rows, one
// per vector lane. For its row, a lane copies the blocksize x blocksize block found at
// diag_offsets(row) in A_values into scratch, LU-factorizes it, forms the inverse with two
// triangular solves against the identity, and writes the result to d_inv(row, :, :).
3273  KOKKOS_INLINE_FUNCTION
3274  void
3275  operator() (const ExtractAndFactorizeFusedJacobiTag&, const member_type &member) const {
3276  using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3277  using default_mode_type = typename default_mode_and_algo_type::mode_type;
3278  using default_algo_type = typename default_mode_and_algo_type::algo_type;
3279  // When fused block Jacobi can be used, the mapping between local rows and parts is trivial (i <-> i)
3280  // We can simply pull the diagonal entry from A into d_inv
// Two scratch buffers per lane: WW1 holds the block and its LU factors, WW2 the inverse.
3281  btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3282  btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3283  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3284  const local_ordinal_type nrows = lclrow.extent(0);
3285  Kokkos::parallel_for
3286  (Kokkos::ThreadVectorRange(member, half_vector_length),
3287  [&](const local_ordinal_type &v) {
3288  local_ordinal_type row = member.league_rank() * half_vector_length + v;
3289  // diag_offsets(row) is the offset into A_values of this row's diagonal block
3290  auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3291  auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3292  if(row < nrows) {
3293  // View the diagonal block of A in row as 2D row-major
3294  const impl_scalar_type* A_diag = A_values.data() + diag_offsets(row);
3295  // Copy the diag into scratch slice W1
3296  // (copying elements directly is better than KokkosBatched copy)
3297  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3298  [&](int i)
3299  {
3300  W1.data()[i] = A_diag[i];
3301  });
3302  // and set W2 to identity in preparation to invert with 2 x Trsm
3303  KB::SetIdentity<member_type,default_mode_type>
3304  ::invoke(member, W2);
3305  }
3306  else {
3307  // if this vector lane has no block to invert, then set W1 to identity
3308  // so that LU still has a matrix to work on. LU uses team barriers so
3309  // having some lanes run it and some not will deadlock.
// NOTE(review): W2 is left uninitialized for such lanes; the solves below still run on it,
// but the result is never written back because of the row < nrows guard at the end.
3310  KB::SetIdentity<member_type,default_mode_type>
3311  ::invoke(member, W1);
3312  }
3313  member.team_barrier();
3314  // LU factorize in-place
3315  KB::LU<member_type, default_mode_type,KB::Algo::LU::Unblocked>
3316  ::invoke(member, W1, tiny);
3317  member.team_barrier();
// Two triangular solves with the factors in W1 turn the identity stored in W2 into the inverse.
3318  KB::Trsm<member_type,
3319  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3320  default_mode_type,default_algo_type>
3321  ::invoke(member, one, W1, W2);
3322  KB::Trsm<member_type,
3323  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3324  default_mode_type,default_algo_type>
3325  ::invoke(member, one, W1, W2);
3326  member.team_barrier();
// Only lanes that own a real row store their result.
3327  if(row < nrows) {
3328  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3329  [&](int i)
3330  {
3331  auto d_inv_block = &d_inv(row, 0, 0);
3332  d_inv_block[i] = W2.data()[i];
3333  });
3334  }
3335  });
3336  }
3337 
3338  KOKKOS_INLINE_FUNCTION
3339  void
3340  operator() (const ExtractBCDTag &, const member_type &member) const {
3341  // btdm is packed and sorted from largest one
3342  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3343  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3344  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3345 
3346  const local_ordinal_type subpartidx = packptr_sub(packidx);
3347  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3348  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3349  const local_ordinal_type partidx = subpartidx%n_parts;
3350 
3351  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3352  //const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3353  //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3354 
3355  if (vector_loop_size == 1) {
3356  extract(partidx, local_subpartidx, npacks);
3357  }
3358  else {
3359  Kokkos::parallel_for
3360  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3361  [&](const local_ordinal_type &v) {
3362  const local_ordinal_type vbeg = v*internal_vector_length;
3363 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3364  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3365  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3366 #endif
3367  if (vbeg < npacks)
3368  extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3369  });
3370  }
3371 
3372  member.team_barrier();
3373 
3374  const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3375  const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1;
3376 
3377  const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3378  const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3379 
3380 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3381  printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3382 #endif
3383 
3384  // Need to copy D to e_internal_vector_values.
3385  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3386  Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3387 
3388  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3389  Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3390 
3391  }
3392 
3393  KOKKOS_INLINE_FUNCTION
3394  void
3395  operator() (const ComputeETag &, const member_type &member) const {
3396  // btdm is packed and sorted from largest one
3397  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3398 
3399  const local_ordinal_type subpartidx = packptr_sub(packidx);
3400  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3401  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3402  const local_ordinal_type partidx = subpartidx%n_parts;
3403 
3404  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3405  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3406  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3407  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3408  const local_ordinal_type num_vectors = blocksize;
3409 
3410  (void) npacks;
3411 
3412  internal_vector_scratch_type_3d_view
3413  WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3414  if (local_subpartidx == 0) {
3415  Kokkos::parallel_for
3416  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3417  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3418  });
3419  }
3420  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
3421  Kokkos::parallel_for
3422  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3423  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3424  });
3425  }
3426  else {
3427  Kokkos::parallel_for
3428  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3429  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3430  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3431  });
3432  }
3433  }
3434 
3435  KOKKOS_INLINE_FUNCTION
3436  void
3437  operator() (const ComputeSchurTag &, const member_type &member) const {
3438  // btdm is packed and sorted from largest one
3439  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3440  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3441  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3442 
3443  const local_ordinal_type subpartidx = packptr_sub(packidx);
3444  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3445  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3446  const local_ordinal_type partidx = subpartidx%n_parts;
3447 
3448  //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3449  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3450  //const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3451  //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3452 
3453  // Compute S = D - C E
3454 
3455  const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
3456  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
3457  const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
3458 
3459  for (local_ordinal_type i = 0; i < 4; ++i) { //pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
3460  copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3461  Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3462  }
3463 
3464  member.team_barrier();
3465 
3466  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3467 
3468  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1;
3469  const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2;
3470 
3471  const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3472  const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3473 
3474  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3475  <typename execution_space::memory_space> default_mode_and_algo_type;
3476 
3477  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3478  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3479 
3480  Kokkos::parallel_for
3481  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3482  for (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) {
3483  local_ordinal_type e_r, e_c, c_kps;
3484 
3485  if ( local_subpartidx_schur == 0 ) {
3486  if ( i == 0 ) {
3487  e_r = e_r1;
3488  e_c = 0;
3489  c_kps = c_kps1;
3490  }
3491  else if ( i == 3 ) {
3492  e_r = e_r2;
3493  e_c = 1;
3494  c_kps = c_kps2;
3495  }
3496  else if ( i == 4 ) {
3497  e_r = e_r2;
3498  e_c = 0;
3499  c_kps = c_kps2;
3500  }
3501  else {
3502  continue;
3503  }
3504  }
3505  else {
3506  if ( i == 0 ) {
3507  e_r = e_r1;
3508  e_c = 1;
3509  c_kps = c_kps1;
3510  }
3511  else if ( i == 1 ) {
3512  e_r = e_r1;
3513  e_c = 0;
3514  c_kps = c_kps1;
3515  }
3516  else if ( i == 4 ) {
3517  e_r = e_r2;
3518  e_c = 1;
3519  c_kps = c_kps2;
3520  }
3521  else if ( i == 5 ) {
3522  e_r = e_r2;
3523  e_c = 0;
3524  c_kps = c_kps2;
3525  }
3526  else {
3527  continue;
3528  }
3529  }
3530 
3531  auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v);
3532  auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3533  auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3534  KB::Gemm<member_type,
3535  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3536  default_mode_type,default_algo_type>
3537  ::invoke(member, -one, C, E, one, S);
3538  }
3539  });
3540  }
3541 
3542  KOKKOS_INLINE_FUNCTION
3543  void
3544  operator() (const FactorizeSchurTag &, const member_type &member) const {
3545  const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3546 
3547  const local_ordinal_type subpartidx = packptr_sub(packidx);
3548 
3549  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3550  const local_ordinal_type partidx = subpartidx%n_parts;
3551 
3552  const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0);
3553  const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);
3554 
3555  internal_vector_scratch_type_3d_view
3556  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3557 
3558 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3559  printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3560 #endif
3561 
3562  if (vector_loop_size == 1) {
3563  factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3564  } else {
3565  Kokkos::parallel_for
3566  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3567  [&](const local_ordinal_type &v) {
3568  factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3569  });
3570  }
3571  }
3572 
3573  void run() {
3574  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3575  const local_ordinal_type team_size =
3576  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3577  recommended_team_size(blocksize, vector_length, internal_vector_length);
3578  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3579  shmem_size(blocksize, blocksize, vector_loop_size);
3580 
3581  {
3582 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3583  printf("Start ExtractAndFactorizeSubLineTag\n");
3584 #endif
3585  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3586  Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeSubLineTag>
3587  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3588 
3589 
3590  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3591  writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3592 
3593  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3594  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3595  policy, *this);
3596  execution_space().fence();
3597 
3598  writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3599 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3600  printf("End ExtractAndFactorizeSubLineTag\n");
3601 #endif
3602  }
3603 
3604  if (packindices_schur.extent(1) > 0)
3605  {
3606  {
3607 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3608  printf("Start ExtractBCDTag\n");
3609 #endif
3610  Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3611  Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3612 
3613  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3614 
3615  {
3616  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3617  Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
3618  policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3619 
3620  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3621  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3622  policy, *this);
3623  execution_space().fence();
3624  }
3625 
3626 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3627  printf("End ExtractBCDTag\n");
3628 #endif
3629  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3630 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3631  printf("Start ComputeETag\n");
3632 #endif
3633  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3634  {
3635  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3636  Kokkos::TeamPolicy<execution_space,ComputeETag>
3637  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3638 
3639  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3640  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3641  policy, *this);
3642  execution_space().fence();
3643  }
3644  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3645 
3646 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3647  printf("End ComputeETag\n");
3648 #endif
3649  }
3650 
3651  {
3652 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3653  printf("Start ComputeSchurTag\n");
3654 #endif
3655  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3656  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3657  Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
3658  policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3659 
3660  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3661  policy, *this);
3662  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3663  execution_space().fence();
3664 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3665  printf("End ComputeSchurTag\n");
3666 #endif
3667  }
3668 
3669  {
3670 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3671  printf("Start FactorizeSchurTag\n");
3672 #endif
3673  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3674  Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
3675  policy(packindices_schur.extent(0), team_size, vector_loop_size);
3676  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3677  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3678  policy, *this);
3679  execution_space().fence();
3680  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3681 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3682  printf("End FactorizeSchurTag\n");
3683 #endif
3684  }
3685  }
3686 
3687  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3688  }
3689 
3690  void run_fused_jacobi() {
3691  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3692  const local_ordinal_type team_size =
3693  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3694  recommended_team_size(blocksize, half_vector_length, 1);
3695  const local_ordinal_type per_team_scratch =
3696  btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3697  {
3698  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3699  Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3700  policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3701 
3702  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3703  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3704  policy, *this);
3705  }
3706  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3707  }
3708  };
3709 
// Top-level driver of the numeric phase: picks a scratch level and runs the
// matching ExtractAndFactorizeTridiags functor.
//
// The scratch size one team needs is computed from the block size
// (btdm.values.extent(1)) and compared with the level-0 limit reported by
// team_policy_type::scratch_size_max(0). When the requirement is strictly below
// that limit the functor is instantiated with scratch level 0, otherwise with
// level 1. In both cases use_fused_jacobi selects run_fused_jacobi() over run().
//
// A, G, interf and tiny are forwarded unchanged to the functor constructor.
// NOTE(review): `btdm` is used below, but no parameter declaring it appears in
// this listing (listing line 3718 is absent) - confirm against the original header.
3713  template<typename MatrixType>
3714  void
3715  performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3716  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3717  const BlockHelperDetails::PartInterface<MatrixType> &interf,
3719  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3720  bool use_fused_jacobi) {
3721  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3722  using execution_space = typename impl_type::execution_space;
3723  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3724  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3725  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3726 
3727  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3728 
3729  int blocksize = btdm.values.extent(1);
3730  // Both Kokkos policy vector length and SIMD type vector length are hardcoded in KokkosBatched.
3731  // For large block sizes, have to fall back to level 1 scratch.
3732  int scratch_required;
3733  if(!use_fused_jacobi) {
3734  // General path scratch requirement
3735  scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3736  }
3737  else {
3738  // Block Jacobi scratch requirement: measured in scalars, and uses twice as much (in bytes) per vector lane as the general path.
3739  scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3740  }
3741 
// Upper bound on level-0 scratch as reported by the team policy type.
3742  int max_scratch = team_policy_type::scratch_size_max(0);
3743 
// Strict comparison: a requirement equal to the limit also takes the level-1 path.
3744  if(scratch_required < max_scratch) {
3745  // Can use level 0 scratch
3746  ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3747  if(!use_fused_jacobi)
3748  function.run();
3749  else
3750  function.run_fused_jacobi();
3751  }
3752  else {
3753  // Not enough level 0 scratch, so fall back to level 1
3754  ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3755  if(!use_fused_jacobi)
3756  function.run();
3757  else
3758  function.run_fused_jacobi();
3759  }
3760  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3761  }
3762 
// Type aliases and data members of the MultiVectorConverter functor, which
// copies entries of a flat scalar multivector into a packed multivector.
// NOTE(review): the struct header (listing line 3767) and the `impl_type`
// alias (listing line 3769) that the aliases below rely on are absent from
// this listing - confirm against the original header.
3766  template<typename MatrixType>
3768  public:
3770  using execution_space = typename impl_type::execution_space;
3771  using memory_space = typename impl_type::memory_space;
3772 
3773  using local_ordinal_type = typename impl_type::local_ordinal_type;
3774  using impl_scalar_type = typename impl_type::impl_scalar_type;
3775  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3776  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3777  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3778  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3779  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3780  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3781  static constexpr int vector_length = impl_type::vector_length;
3782 
// Handle type passed to the team-level operator().
3783  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3784 
3785  private:
3786  // part interface
// All index views below are copied from the PartInterface by the constructor.
3787  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3788  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3789  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3790  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3791  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
// Taken from extents 1 and 2 of the packed multivector by the constructor.
3792  const local_ordinal_type blocksize;
3793  const local_ordinal_type num_vectors;
3794 
3795  // packed multivector output (or input)
3796  vector_type_3d_view packed_multivector;
// Source of the copy; assigned in run() before the kernel is launched.
3797  const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3798 
3799  template<typename TagType>
3800  KOKKOS_INLINE_FUNCTION
3801  void copy_multivectors(const local_ordinal_type &j,
3802  const local_ordinal_type &vi,
3803  const local_ordinal_type &pri,
3804  const local_ordinal_type &ri0) const {
3805  for (local_ordinal_type col=0;col<num_vectors;++col)
3806  for (local_ordinal_type i=0;i<blocksize;++i)
3807  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3808  }
3809 
3810  public:
3811 
// Builds the converter from the part interface and the packed multivector.
// The index views are copied from `interf`; blocksize and num_vectors are read
// from extents 1 and 2 of `pmv`. scalar_multivector is not initialized here -
// it is assigned in run().
3812  MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3813  const vector_type_3d_view &pmv)
3814  : partptr(interf.partptr),
3815  packptr(interf.packptr),
3816  part2packrowidx0(interf.part2packrowidx0),
3817  part2rowidx0(interf.part2rowidx0),
3818  lclrow(interf.lclrow),
3819  blocksize(pmv.extent(1)),
3820  num_vectors(pmv.extent(2)),
3821  packed_multivector(pmv) {}
3822 
3823  // TODO:: modify this routine similar to the team level functions
3824  KOKKOS_INLINE_FUNCTION
3825  void
3826  operator() (const local_ordinal_type &packidx) const {
3827  local_ordinal_type partidx = packptr(packidx);
3828  local_ordinal_type npacks = packptr(packidx+1) - partidx;
3829  const local_ordinal_type pri0 = part2packrowidx0(partidx);
3830 
3831  local_ordinal_type ri0[vector_length] = {};
3832  local_ordinal_type nrows[vector_length] = {};
3833  for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
3834  ri0[v] = part2rowidx0(partidx);
3835  nrows[v] = part2rowidx0(partidx+1) - ri0[v];
3836  }
3837  for (local_ordinal_type j=0;j<nrows[0];++j) {
3838  local_ordinal_type cnt = 1;
3839  for (;cnt<npacks && j!= nrows[cnt];++cnt);
3840  npacks = cnt;
3841  const local_ordinal_type pri = pri0 + j;
3842  for (local_ordinal_type col=0;col<num_vectors;++col)
3843  for (local_ordinal_type i=0;i<blocksize;++i)
3844  for (local_ordinal_type v=0;v<npacks;++v)
3845  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
3846  }
3847  }
3848 
3849  KOKKOS_INLINE_FUNCTION
3850  void
3851  operator() (const member_type &member) const {
3852  const local_ordinal_type packidx = member.league_rank();
3853  const local_ordinal_type partidx_begin = packptr(packidx);
3854  const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
3855  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3856  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3857  const local_ordinal_type partidx = partidx_begin + v;
3858  const local_ordinal_type ri0 = part2rowidx0(partidx);
3859  const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
3860 
3861  if (nrows == 1) {
3862  const local_ordinal_type pri = pri0;
3863  for (local_ordinal_type col=0;col<num_vectors;++col) {
3864  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3865  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
3866  });
3867  }
3868  } else {
3869  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3870  const local_ordinal_type pri = pri0 + j;
3871  for (local_ordinal_type col=0;col<num_vectors;++col)
3872  for (local_ordinal_type i=0;i<blocksize;++i)
3873  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3874  });
3875  }
3876  });
3877  }
3878 
3879  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3880  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3881  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3882 
3883  scalar_multivector = scalar_multivector_;
3884  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3885  const local_ordinal_type vl = vector_length;
3886  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3887  Kokkos::parallel_for
3888  ("MultiVectorConverter::TeamPolicy", policy, *this);
3889  } else {
3890  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3891  Kokkos::parallel_for
3892  ("MultiVectorConverter::RangePolicy", policy, *this);
3893  }
3894  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3895  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3896  }
3897  };
3898 
3902 
3903  template<>
3904  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3905  typedef KB::Mode::Serial mode_type;
3906  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3907 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3908  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3909 #else
3910  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3911 #endif
3912  static int recommended_team_size(const int /* blksize */,
3913  const int /* vector_length */,
3914  const int /* internal_vector_length */) {
3915  return 1;
3916  }
3917  };
3918 
3919 #if defined(KOKKOS_ENABLE_CUDA)
/// Recommended CUDA team size for the tridiagonal solve.
///
/// A total thread budget is chosen from the block size
/// (<= 9 -> 32, <= 12 -> 96, <= 16 -> 128, larger -> 160) and divided by the
/// number of vector lanes per team member, vector_length / internal_vector_length.
/// The original listing kept separate "<= 5" and "<= 9" tiers that both
/// yielded 32 (with a `// 64` remark on the second) and separate "<= 20" and
/// default tiers that both yielded 160; they are merged here.
///
/// Unlike the original, the lane count is clamped to at least 1, so there is no
/// integer division by zero when internal_vector_length exceeds vector_length
/// or is not positive, and the result is clamped to at least 1, so a team size
/// of 0 is never returned. All inputs that produced a positive result before
/// produce the same result now.
static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
                                                       const int vector_length,
                                                       const int internal_vector_length) {
  const int lanes = internal_vector_length > 0 ? vector_length/internal_vector_length : 0;
  const int vector_size = lanes > 0 ? lanes : 1;

  int total_team_size = 160;
  if (blksize <= 9) total_team_size = 32;
  else if (blksize <= 12) total_team_size = 96;
  else if (blksize <= 16) total_team_size = 128;

  const int team_size = total_team_size/vector_size;
  return team_size > 0 ? team_size : 1;
}
3933 
3934  template<>
3935  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3936  typedef KB::Mode::Team mode_type;
3937  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3938  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3939  static int recommended_team_size(const int blksize,
3940  const int vector_length,
3941  const int internal_vector_length) {
3942  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3943  }
3944  };
3945  template<>
3946  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3947  typedef KB::Mode::Team mode_type;
3948  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3949  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3950  static int recommended_team_size(const int blksize,
3951  const int vector_length,
3952  const int internal_vector_length) {
3953  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3954  }
3955  };
3956 #endif
3957 
3958 #if defined(KOKKOS_ENABLE_HIP)
/// Recommended HIP team size for the tridiagonal solve.
///
/// A total thread budget is chosen from the block size
/// (<= 9 -> 32, <= 12 -> 96, <= 16 -> 128, larger -> 160) and divided by the
/// number of vector lanes per team member, vector_length / internal_vector_length.
/// The original listing kept separate "<= 5" and "<= 9" tiers that both
/// yielded 32 (with a `// 64` remark on the second) and separate "<= 20" and
/// default tiers that both yielded 160; they are merged here.
///
/// Unlike the original, the lane count is clamped to at least 1, so there is no
/// integer division by zero when internal_vector_length exceeds vector_length
/// or is not positive, and the result is clamped to at least 1, so a team size
/// of 0 is never returned. All inputs that produced a positive result before
/// produce the same result now.
static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
                                                      const int vector_length,
                                                      const int internal_vector_length) {
  const int lanes = internal_vector_length > 0 ? vector_length/internal_vector_length : 0;
  const int vector_size = lanes > 0 ? lanes : 1;

  int total_team_size = 160;
  if (blksize <= 9) total_team_size = 32;
  else if (blksize <= 12) total_team_size = 96;
  else if (blksize <= 16) total_team_size = 128;

  const int team_size = total_team_size/vector_size;
  return team_size > 0 ? team_size : 1;
}
3972 
3973  template<>
3974  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3975  typedef KB::Mode::Team mode_type;
3976  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3977  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3978  static int recommended_team_size(const int blksize,
3979  const int vector_length,
3980  const int internal_vector_length) {
3981  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3982  }
3983  };
3984  template<>
3985  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3986  typedef KB::Mode::Team mode_type;
3987  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3988  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3989  static int recommended_team_size(const int blksize,
3990  const int vector_length,
3991  const int internal_vector_length) {
3992  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3993  }
3994  };
3995 #endif
3996 
3997 #if defined(KOKKOS_ENABLE_SYCL)
/// Recommended SYCL team size for the tridiagonal solve.
///
/// A total thread budget is chosen from the block size
/// (<= 9 -> 32, <= 12 -> 96, <= 16 -> 128, larger -> 160) and divided by the
/// number of vector lanes per team member, vector_length / internal_vector_length.
/// The original listing kept separate "<= 5" and "<= 9" tiers that both
/// yielded 32 (with a `// 64` remark on the second) and separate "<= 20" and
/// default tiers that both yielded 160; they are merged here.
///
/// Unlike the original, the lane count is clamped to at least 1, so there is no
/// integer division by zero when internal_vector_length exceeds vector_length
/// or is not positive, and the result is clamped to at least 1, so a team size
/// of 0 is never returned. All inputs that produced a positive result before
/// produce the same result now.
static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
                                                       const int vector_length,
                                                       const int internal_vector_length) {
  const int lanes = internal_vector_length > 0 ? vector_length/internal_vector_length : 0;
  const int vector_size = lanes > 0 ? lanes : 1;

  int total_team_size = 160;
  if (blksize <= 9) total_team_size = 32;
  else if (blksize <= 12) total_team_size = 96;
  else if (blksize <= 16) total_team_size = 128;

  const int team_size = total_team_size/vector_size;
  return team_size > 0 ? team_size : 1;
}
4011 
4012  template<>
4013  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
4014  typedef KB::Mode::Team mode_type;
4015  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4016  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4017  static int recommended_team_size(const int blksize,
4018  const int vector_length,
4019  const int internal_vector_length) {
4020  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4021  }
4022  };
4023  template<>
4024  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
4025  typedef KB::Mode::Team mode_type;
4026  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4027  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4028  static int recommended_team_size(const int blksize,
4029  const int vector_length,
4030  const int internal_vector_length) {
4031  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4032  }
4033  };
4034 #endif
4035 
4036 
4037 
4038 
// Functor holding the state needed to solve with the factorized block
// tridiagonal matrices: index views from the part interface, the factor
// values, the packed multivector X (viewed both as internal vectors and as
// scalars), Schur-related buffers, and the outputs Y / Z used when copying the
// result back to a flat multivector.
4039  template<typename MatrixType>
4040  struct SolveTridiags {
4041  public:
4042  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4043  using execution_space = typename impl_type::execution_space;
4044 
4045  using local_ordinal_type = typename impl_type::local_ordinal_type;
4046  using size_type = typename impl_type::size_type;
4047  using impl_scalar_type = typename impl_type::impl_scalar_type;
4048  using magnitude_type = typename impl_type::magnitude_type;
4049  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
4050  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
4052  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4053  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
4054  using size_type_2d_view = typename impl_type::size_type_2d_view;
4056  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4057  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
4058  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
4059  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
4060 
4061  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4062 
4063  using internal_vector_type =typename impl_type::internal_vector_type;
4064  static constexpr int vector_length = impl_type::vector_length;
4065  static constexpr int internal_vector_length = impl_type::internal_vector_length;
4066 
4068  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4069  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
4070 
4072  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4073  using member_type = typename team_policy_type::member_type;
4074 
4075  private:
4076  // part interface
4077  local_ordinal_type n_subparts_per_part;
4078  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4079  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4080  const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4081  const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4082  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4083  const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4084  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4085  const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4086 
4087  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4088  const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4089 
4090  // block tridiags
4091  const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4092 
4093  // block tridiags values
// D_* are read-only factor values; the two X_* views alias the same packed
// multivector storage (see the constructor).
4094  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4095  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4096  const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4097 
// Not an unmanaged alias: the constructor builds this view from a label and extents.
4098  internal_vector_type_4d_view X_internal_vector_values_schur;
4099 
4100  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4101  const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4102 
4103 
// Initialized to vector_length/internal_vector_length by the constructor.
4104  const local_ordinal_type vector_loop_size;
4105 
4106  // copy to multivectors : damping factor and Y_scalar_multivector
4107  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
// Z is an atomic-access view when compiling device code, a plain unmanaged view otherwise.
4108 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4109  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4110 #else
4111  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4112 #endif
// df is the damping factor; compute_diff is set from is_norm_manager_active.
4113  const impl_scalar_type df;
4114  const bool compute_diff;
4115 
4116  public:
// Builds the solver functor.
//  - Index views are copied from `interf`; pack_td_ptr and pack_td_ptr_schur from `btdm`.
//  - btdm.values, btdm.values_schur, btdm.e_values and pmv are re-viewed through
//    their data pointers as arrays of internal_vector_type whose last extent is
//    vector_length/internal_vector_length.
//  - X_internal_scalar_values views the same pmv storage as btdm_scalar_type
//    with a last extent of vector_length.
//  - X_internal_vector_values_schur is constructed from a label and extents
//    rather than from an existing pointer.
//    NOTE(review): do_not_initialize_tag presumably skips zero-initialization - confirm.
//  - Y_scalar_multivector and Z_scalar_vector are default-constructed here.
//  - damping_factor is stored as df, is_norm_manager_active as compute_diff.
4117  SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
4118  const BlockTridiags<MatrixType> &btdm,
4119  const vector_type_3d_view &pmv,
4120  const impl_scalar_type damping_factor,
4121  const bool is_norm_manager_active)
4122  :
4123  // interface
4124  n_subparts_per_part(interf.n_subparts_per_part),
4125  partptr(interf.partptr),
4126  packptr(interf.packptr),
4127  packindices_sub(interf.packindices_sub),
4128  packindices_schur(interf.packindices_schur),
4129  part2packrowidx0(interf.part2packrowidx0),
4130  part2packrowidx0_sub(interf.part2packrowidx0_sub),
4131  lclrow(interf.lclrow),
4132  packptr_sub(interf.packptr_sub),
4133  partptr_sub(interf.partptr_sub),
4134  pack_td_ptr_schur(btdm.pack_td_ptr_schur),
4135  // block tridiags and multivector
4136  pack_td_ptr(btdm.pack_td_ptr),
4137  D_internal_vector_values((internal_vector_type*)btdm.values.data(),
4138  btdm.values.extent(0),
4139  btdm.values.extent(1),
4140  btdm.values.extent(2),
4141  vector_length/internal_vector_length),
4142  X_internal_vector_values((internal_vector_type*)pmv.data(),
4143  pmv.extent(0),
4144  pmv.extent(1),
4145  pmv.extent(2),
4146  vector_length/internal_vector_length),
4147  X_internal_scalar_values((btdm_scalar_type*)pmv.data(),
4148  pmv.extent(0),
4149  pmv.extent(1),
4150  pmv.extent(2),
4151  vector_length),
4152  X_internal_vector_values_schur(do_not_initialize_tag("X_internal_vector_values_schur"),
4153  2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0),
4154  pmv.extent(1),
4155  pmv.extent(2),
4156  vector_length/internal_vector_length),
4157  D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(),
4158  btdm.values_schur.extent(0),
4159  btdm.values_schur.extent(1),
4160  btdm.values_schur.extent(2),
4161  vector_length/internal_vector_length),
4162  e_internal_vector_values((internal_vector_type*)btdm.e_values.data(),
4163  btdm.e_values.extent(0),
4164  btdm.e_values.extent(1),
4165  btdm.e_values.extent(2),
4166  btdm.e_values.extent(3),
4167  vector_length/internal_vector_length),
4168  vector_loop_size(vector_length/internal_vector_length),
4169  Y_scalar_multivector(),
4170  Z_scalar_vector(),
4171  df(damping_factor),
4172  compute_diff(is_norm_manager_active)
4173  {}
4174 
4175  public:
4176 
// Applies the damped update to the flat multivector Y from the packed solution
// X for one group of vector lanes, and accumulates the squared update norm.
//
// For lane group `v` (lanes v*internal_vector_length onward, limited to
// `npacks`), each lane's first row and row count are read from partptr. For
// every row j, block entry i and column col the code computes
//   yd = X(pri0 + j, i, col, v)[lane] - Y(row, col),   Y(row, col) += df * yd,
// with row = blocksize*lclrow(ri0 + j) + i, and adds |yd|^2 to a local partial
// sum that is finally added into Z_scalar_vector(member.league_rank()).
// Nothing is done when v*internal_vector_length >= npacks.
//
// The `compute_diff` checks are commented out, so the partial sum is always
// accumulated and always added to Z.
// NOTE(review): z_partial_sum is captured by reference in the TeamThreadRange
// lambdas and every calling thread performs the final +=; whether that is
// race-free depends on the Z_scalar_vector type selected at its declaration - confirm.
4178  KOKKOS_INLINE_FUNCTION
4179  void
4180  copyToFlatMultiVector(const member_type &member,
4181  const local_ordinal_type partidxbeg, // partidx for v = 0
4182  const local_ordinal_type npacks,
4183  const local_ordinal_type pri0,
4184  const local_ordinal_type v, // index with a loop of vector_loop_size
4185  const local_ordinal_type blocksize,
4186  const local_ordinal_type num_vectors) const {
4187  const local_ordinal_type vbeg = v*internal_vector_length;
4188  if (vbeg < npacks) {
// Per-lane first row and row count; lanes past npacks keep the zero defaults.
4189  local_ordinal_type ri0_vals[internal_vector_length] = {};
4190  local_ordinal_type nrows_vals[internal_vector_length] = {};
4191  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4192  const local_ordinal_type partidx = partidxbeg+vv;
4193  ri0_vals[vi] = partptr(partidx);
4194  nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
4195  }
4196 
4197  impl_scalar_type z_partial_sum(0);
// Single-row case (decided by the first lane): parallelize over block entries.
4198  if (nrows_vals[0] == 1) {
4199  const local_ordinal_type j=0, pri=pri0;
4200  {
4201  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4202  const local_ordinal_type ri0 = ri0_vals[vi];
4203  const local_ordinal_type nrows = nrows_vals[vi];
4204  if (j < nrows) {
4205  Kokkos::parallel_for
4206  (Kokkos::TeamThreadRange(member, blocksize),
4207  [&](const local_ordinal_type &i) {
4208  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4209  for (local_ordinal_type col=0;col<num_vectors;++col) {
4210  impl_scalar_type &y = Y_scalar_multivector(row,col);
4211  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4212  y += df*yd;
4213 
4214  {//if (compute_diff) {
4215  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4216  z_partial_sum += yd_abs*yd_abs;
4217  }
4218  }
4219  });
4220  }
4221  }
4222  }
4223  } else {
// General case: parallelize over the rows of the first lane's part; lanes
// whose part has fewer rows are skipped by the `j < nrows` test.
4224  Kokkos::parallel_for
4225  (Kokkos::TeamThreadRange(member, nrows_vals[0]),
4226  [&](const local_ordinal_type &j) {
4227  const local_ordinal_type pri = pri0 + j;
4228  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4229  const local_ordinal_type ri0 = ri0_vals[vi];
4230  const local_ordinal_type nrows = nrows_vals[vi];
4231  if (j < nrows) {
4232  for (local_ordinal_type col=0;col<num_vectors;++col) {
4233  for (local_ordinal_type i=0;i<blocksize;++i) {
4234  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4235  impl_scalar_type &y = Y_scalar_multivector(row,col);
4236  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4237  y += df*yd;
4238 
4239  {//if (compute_diff) {
4240  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4241  z_partial_sum += yd_abs*yd_abs;
4242  }
4243  }
4244  }
4245  }
4246  }
4247  });
4248  }
4249  //if (compute_diff)
4250  Z_scalar_vector(member.league_rank()) += z_partial_sum;
4251  }
4252  }
4253 
4257  template<typename WWViewType>
4258  KOKKOS_INLINE_FUNCTION
4259  void
4260  solveSingleVector(const member_type &member,
4261  const local_ordinal_type &blocksize,
4262  const local_ordinal_type &i0,
4263  const local_ordinal_type &r0,
4264  const local_ordinal_type &nrows,
4265  const local_ordinal_type &v,
4266  const WWViewType &WW) const {
4267 
4268  typedef SolveTridiagsDefaultModeAndAlgo
4269  <typename execution_space::memory_space> default_mode_and_algo_type;
4270 
4271  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4272  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4273 
4274  // base pointers
4275  auto A = D_internal_vector_values.data();
4276  auto X = X_internal_vector_values.data();
4277 
4278  // constant
4279  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4280  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4281  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4282 
4283  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4284  const local_ordinal_type astep = D_internal_vector_values.stride_0();
4285  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
4286  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
4287  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
4288  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
4289 
4290  // move to starting point
4291  A += i0*astep + v;
4292  X += r0*xstep + v;
4293 
4294  //for (local_ordinal_type col=0;col<num_vectors;++col)
4295  if (nrows > 1) {
4296  // solve Lx = x
4297  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4298  (default_mode_type,default_algo_type,
4299  member,
4300  KB::Diag::Unit,
4301  blocksize,blocksize,
4302  one,
4303  A, as0, as1,
4304  X, xs0);
4305 
4306  for (local_ordinal_type tr=1;tr<nrows;++tr) {
4307  member.team_barrier();
4308  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4309  (default_mode_type,default_algo_type,
4310  member,
4311  blocksize, blocksize,
4312  -one,
4313  A+2*astep, as0, as1,
4314  X, xs0,
4315  one,
4316  X+1*xstep, xs0);
4317  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4318  (default_mode_type,default_algo_type,
4319  member,
4320  KB::Diag::Unit,
4321  blocksize,blocksize,
4322  one,
4323  A+3*astep, as0, as1,
4324  X+1*xstep, xs0);
4325 
4326  A += 3*astep;
4327  X += 1*xstep;
4328  }
4329 
4330  // solve Ux = x
4331  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4332  (default_mode_type,default_algo_type,
4333  member,
4334  KB::Diag::NonUnit,
4335  blocksize, blocksize,
4336  one,
4337  A, as0, as1,
4338  X, xs0);
4339 
4340  for (local_ordinal_type tr=nrows;tr>1;--tr) {
4341  A -= 3*astep;
4342  member.team_barrier();
4343  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4344  (default_mode_type,default_algo_type,
4345  member,
4346  blocksize, blocksize,
4347  -one,
4348  A+1*astep, as0, as1,
4349  X, xs0,
4350  one,
4351  X-1*xstep, xs0);
4352  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4353  (default_mode_type,default_algo_type,
4354  member,
4355  KB::Diag::NonUnit,
4356  blocksize, blocksize,
4357  one,
4358  A, as0, as1,
4359  X-1*xstep,xs0);
4360  X -= 1*xstep;
4361  }
4362  // for multiple rhs
4363  //X += xs1;
4364  } else {
4365  const local_ordinal_type ws0 = WW.stride_0();
4366  auto W = WW.data() + v;
4367  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
4368  (default_mode_type,
4369  member, blocksize, X, xs0, W, ws0);
4370  member.team_barrier();
4371  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4372  (default_mode_type,default_algo_type,
4373  member,
4374  blocksize, blocksize,
4375  one,
4376  A, as0, as1,
4377  W, xs0,
4378  zero,
4379  X, xs0);
4380  }
4381  }
4382 
// Solves one packed block-tridiagonal system in place for every right-hand-side
// column stored in X_internal_vector_values, for a single vector lane `v`.
//
// Parameters:
//   member      team handle forwarded to the KokkosBatched team-level kernels.
//   (unnamed)   block size; not used by this routine.
//   i0          index of the first block of this tridiagonal in D_internal_vector_values.
//   r0          index of the first block row in X_internal_vector_values.
//   nrows       number of block rows; nrows > 1 takes the factor-solve path,
//               otherwise the single-block path is taken.
//   v           index into the last dimension of the D/X/WW views.
//   WW          scratch view; only the slice WW(:,:,v) is touched, and only
//               on the single-block path.
//
// NOTE(review): the index arithmetic below reads D at i (diagonal block), i+1
// (block used in the backward sweep) and i+2 (block used in the forward sweep),
// advancing by 3 per block row; presumably these hold the packed block LU
// factors produced elsewhere -- confirm against the factorization code.
4383  template<typename WWViewType>
4384  KOKKOS_INLINE_FUNCTION
4385  void
4386  solveMultiVector(const member_type &member,
4387  const local_ordinal_type &/* blocksize */,
4388  const local_ordinal_type &i0,
4389  const local_ordinal_type &r0,
4390  const local_ordinal_type &nrows,
4391  const local_ordinal_type &v,
4392  const WWViewType &WW) const {
4393 
4394  typedef SolveTridiagsDefaultModeAndAlgo
4395  <typename execution_space::memory_space> default_mode_and_algo_type;
4396 
4397  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4398  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4399 
4400  // constant
4401  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4402  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4403 
4404  // subview pattern
      // A, X1 and X2 are rank-2 view handles that are re-pointed with
      // assign_data() as the sweep moves from block row to block row.
4405  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4406  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4407  auto X2 = X1;
4408 
      // i tracks the current diagonal block in D, r the current block row in X.
4409  local_ordinal_type i = i0, r = r0;
4410 
4411 
4412  if (nrows > 1) {
4413  // solve Lx = x
      // First block row: unit-lower triangular solve with the block at i0.
4414  KB::Trsm<member_type,
4415  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4416  default_mode_type,default_algo_type>
4417  ::invoke(member, one, A, X1);
      // Forward sweep over the remaining block rows.
4418  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
      // A -> block i+2, X2 -> next block row of X.
4419  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
4420  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
      // Make the previous row's result (X1) visible to the whole team.
4421  member.team_barrier();
      // X2 := X2 - A * X1  (alpha = -one, beta = one).
4422  KB::Gemm<member_type,
4423  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4424  default_mode_type,default_algo_type>
4425  ::invoke(member, -one, A, X1, one, X2);
      // A -> block i+3, i.e. the diagonal block of the row just updated.
4426  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
4427  KB::Trsm<member_type,
4428  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4429  default_mode_type,default_algo_type>
4430  ::invoke(member, one, A, X2);
      // The row just solved becomes the "previous" row for the next iteration.
4431  X1.assign_data( X2.data() );
4432  }
4433 
4434  // solve Ux = x
      // Here A still points at the last diagonal block and X1 at the last block row.
4435  KB::Trsm<member_type,
4436  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4437  default_mode_type,default_algo_type>
4438  ::invoke(member, one, A, X1);
      // Backward sweep from the last block row towards the first.
4439  for (local_ordinal_type tr=nrows;tr>1;--tr) {
4440  i -= 3;
      // A -> block i+1, X2 -> previous block row of X.
4441  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
4442  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
4443  member.team_barrier();
      // X2 := X2 - A * X1  (alpha = -one, beta = one).
4444  KB::Gemm<member_type,
4445  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4446  default_mode_type,default_algo_type>
4447  ::invoke(member, -one, A, X1, one, X2);
4448 
      // A -> diagonal block i; non-unit upper triangular solve on that row.
4449  A.assign_data( &D_internal_vector_values(i,0,0,v) );
4450  KB::Trsm<member_type,
4451  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4452  default_mode_type,default_algo_type>
4453  ::invoke(member, one, A, X2);
4454  X1.assign_data( X2.data() );
4455  }
4456  } else {
4457  // matrix is already inverted
      // Single block: copy X1 into the scratch slice W, then X1 := A * W
      // (alpha = one, beta = zero), so the copy avoids aliasing input and output.
4458  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4459  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
4460  ::invoke(member, X1, W);
4461  member.team_barrier();
4462  KB::Gemm<member_type,
4463  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4464  default_mode_type,default_algo_type>
4465  ::invoke(member, one, A, W, zero, X1);
4466  }
4467  }
4468 
// Empty tag types used to select which operator() overload a
// Kokkos::TeamPolicy launches. The template argument B is the compile-time
// block size; the operators that consult it treat B == 0 as "read the block
// size from the view extent at run time".
4469  template<int B> struct SingleVectorTag {};
4470  template<int B> struct MultiVectorTag {};
4471 
      // Tags for the individual kernels that run() launches one after another
      // when packindices_schur.extent(1) is non-zero and there is one vector.
4472  template<int B> struct SingleVectorSubLineTag {};
4473  template<int B> struct MultiVectorSubLineTag {};
4474  template<int B> struct SingleVectorApplyCTag {};
4475  template<int B> struct MultiVectorApplyCTag {};
4476  template<int B> struct SingleVectorSchurTag {};
4477  template<int B> struct MultiVectorSchurTag {};
4478  template<int B> struct SingleVectorApplyETag {};
4479  template<int B> struct MultiVectorApplyETag {};
4480  template<int B> struct SingleVectorCopyToFlatTag {};
4481  template<int B> struct SingleZeroingTag {};
4482 
// Team kernel for the single right-hand-side case: one team per pack
// (league_rank indexes packptr). For every vector lane it solves the packed
// tridiagonal system and then calls copyToFlatMultiVector for that lane.
// B is the compile-time block size; B == 0 reads it from
// D_internal_vector_values.extent(1).
4483  template<int B>
4484  KOKKOS_INLINE_FUNCTION
4485  void
4486  operator() (const SingleVectorTag<B> &, const member_type &member) const {
4487  const local_ordinal_type packidx = member.league_rank();
      // First part of this pack and number of parts packed together.
4488  const local_ordinal_type partidx = packptr(packidx);
4489  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
      // NOTE(review): pri0 and r0 below are read from the same entry,
      // part2packrowidx0(partidx); they are passed to different callees.
4490  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4491  const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4492  const local_ordinal_type r0 = part2packrowidx0(partidx);
4493  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4494  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4495  const local_ordinal_type num_vectors = 1;
      // Team scratch: blocksize x 1 x vector_loop_size, carved from level-0 scratch.
4496  internal_vector_scratch_type_3d_view
4497  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
      // Exactly one thread of the team resets this team's entry of Z_scalar_vector.
4498  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4499  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4500  });
4501  Kokkos::parallel_for
4502  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4503  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4504  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4505  });
4506  }
4507 
// Team kernel for the multiple right-hand-side case. Same structure as the
// SingleVectorTag kernel, except that the number of vectors is taken from
// X_internal_vector_values.extent(2), the scratch view is sized accordingly,
// and solveMultiVector is used for the solve.
4508  template<int B>
4509  KOKKOS_INLINE_FUNCTION
4510  void
4511  operator() (const MultiVectorTag<B> &, const member_type &member) const {
4512  const local_ordinal_type packidx = member.league_rank();
4513  const local_ordinal_type partidx = packptr(packidx);
4514  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
      // pri0 and r0 are both part2packrowidx0(partidx).
4515  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4516  const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4517  const local_ordinal_type r0 = part2packrowidx0(partidx);
4518  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
      // B == 0 means the block size is only known at run time.
4519  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4520  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4521 
      // Team scratch: blocksize x num_vectors x vector_loop_size.
4522  internal_vector_scratch_type_3d_view
4523  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
      // One thread per team resets this team's entry of Z_scalar_vector.
4524  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4525  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4526  });
4527  Kokkos::parallel_for
4528  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4529  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4530  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4531  });
4532  }
4533 
// Team kernel that solves one sub-part tridiagonal system per team by calling
// solveSingleVectorNew on D_internal_vector_values / X_internal_vector_values.
// The team's league rank is mapped through packindices_sub to a pack, and the
// pack's first entry (packptr_sub) encodes both the part and the sub-part.
// NOTE(review): the template argument B is not consulted here; the block size
// comes from e_internal_vector_values.extent(2).
4534  template<int B>
4535  KOKKOS_INLINE_FUNCTION
4536  void
4537  operator() (const SingleVectorSubLineTag<B> &, const member_type &member) const {
4538  // btdm is packed and sorted from largest one
4539  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4540 
      // subpartidx is decoded as local_subpartidx * n_parts + partidx.
4541  const local_ordinal_type subpartidx = packptr_sub(packidx);
4542  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4543  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4544  const local_ordinal_type partidx = subpartidx%n_parts;
4545 
4546  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4547  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4548  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4549  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4550  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4551 
4552  //(void) i0;
4553  //(void) nrows;
      // npacks is computed but not used; the cast silences the unused warning.
4554  (void) npacks;
4555 
4556  internal_vector_scratch_type_3d_view
4557  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4558 
4559  Kokkos::parallel_for
4560  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4561  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4562  });
4563  }
4564 
// Team kernel that, for one sub-part per team, updates the block row(s) of
// X_internal_vector_values adjacent to the sub-part:
//   v_2 := v_2 - C * v_1
// where v_1 is the first or last block row of the sub-part, v_2 is the row
// just outside it (r0-1 or r0+nrows), and C is a block of
// D_internal_vector_values. Which side(s) are updated depends on
// local_subpartidx: the first sub-part updates only the row after it, the
// sub-part with index extent(1)-2 updates only the row before it, and every
// other sub-part updates both.
// NOTE(review): the template argument B is not consulted here.
4565  template<int B>
4566  KOKKOS_INLINE_FUNCTION
4567  void
4568  operator() (const SingleVectorApplyCTag<B> &, const member_type &member) const {
4569  // btdm is packed and sorted from largest one
4570  //const local_ordinal_type packidx = packindices_schur(member.league_rank());
4571  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4572 
      // subpartidx is decoded as local_subpartidx * n_parts + partidx.
4573  const local_ordinal_type subpartidx = packptr_sub(packidx);
4574  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4575  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4576  const local_ordinal_type partidx = subpartidx%n_parts;
4577  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4578 
4579  //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4580  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4581  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4582  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4583 
      // NOTE(review): WW is constructed but never read or written in this
      // operator. It is also sized blocksize x blocksize, while run() reserves
      // per-team scratch for blocksize x num_vectors -- confirm whether this
      // allocation is needed at all.
4584  internal_vector_scratch_type_3d_view
4585  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4586 
4587  // Compute v_2 = v_2 - C v_1
4588 
      // NOTE(review): i0_schur and i0_offset are only void-cast below and never
      // used; both branches of the i0_offset conditional are identical (i0+2).
      // These look like leftovers and are candidates for removal.
4589  const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
4590  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
4591  const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
4592 
4593  (void) i0_schur;
4594  (void) i0_offset;
4595 
4596  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4597 
      // Indices in D of the coupling blocks: c_kps2 is used for the row before
      // the sub-part (r0-1), c_kps1 for the row after it (r0+nrows).
      // NOTE(review): both are evaluated for every sub-part, including the ones
      // that use only one of them -- confirm pack_td_ptr(partidx,
      // local_subpartidx+1) is always in range.
4598  const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0;
4599  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1;
4600 
4601  typedef SolveTridiagsDefaultModeAndAlgo
4602  <typename execution_space::memory_space> default_mode_and_algo_type;
4603 
4604  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4605  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4606 
      // Case 1: first sub-part -- only the row after the sub-part is updated.
4607  if (local_subpartidx == 0) {
4608  Kokkos::parallel_for
4609  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
      // v_1: last block row of the sub-part; v_2: the row right after it.
4610  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4611  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4612  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4613 
4614  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4615  (default_mode_type,default_algo_type,
4616  member,
4617  blocksize, blocksize,
4618  -one,
4619  C.data(), C.stride_0(), C.stride_1(),
4620  v_1.data(), v_1.stride_0(),
4621  one,
4622  v_2.data(), v_2.stride_0());
4623  });
4624  }
      // Case 2: sub-part with index extent(1)-2 -- only the row before it is updated.
4625  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4626  Kokkos::parallel_for
4627  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
      // v_1: first block row of the sub-part; v_2: the row right before it.
4628  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4629  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4630  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4631 
4632  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4633  (default_mode_type,default_algo_type,
4634  member,
4635  blocksize, blocksize,
4636  -one,
4637  C.data(), C.stride_0(), C.stride_1(),
4638  v_1.data(), v_1.stride_0(),
4639  one,
4640  v_2.data(), v_2.stride_0());
4641  });
4642  }
      // Case 3: any other sub-part -- both neighbouring rows are updated.
4643  else {
4644  Kokkos::parallel_for
4645  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
      // Row after the sub-part, same update as case 1.
4646  {
4647  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4648  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4649  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4650 
4651  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4652  (default_mode_type,default_algo_type,
4653  member,
4654  blocksize, blocksize,
4655  -one,
4656  C.data(), C.stride_0(), C.stride_1(),
4657  v_1.data(), v_1.stride_0(),
4658  one,
4659  v_2.data(), v_2.stride_0());
4660  }
      // Row before the sub-part, same update as case 2.
4661  {
4662  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4663  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4664  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4665 
4666  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4667  (default_mode_type,default_algo_type,
4668  member,
4669  blocksize, blocksize,
4670  -one,
4671  C.data(), C.stride_0(), C.stride_1(),
4672  v_1.data(), v_1.stride_0(),
4673  one,
4674  v_2.data(), v_2.stride_0());
4675  }
4676  });
4677  }
4678  }
4679 
// Team kernel for the Schur-complement system of one part. It
//   1. transfers two block rows per odd-indexed sub-part between
//      X_internal_vector_values and X_internal_vector_values_schur,
//   2. calls solveSingleVectorNew on the *_schur views with
//      nrows = 2*(n_subparts_per_part-1), and
//   3. transfers the same rows in the opposite direction.
// NOTE(review): presumably copy3DView(member, dst, src), i.e. step 1 gathers
// into the Schur vector and step 3 scatters the solution back -- confirm
// against copy3DView's definition.
// NOTE(review): the template argument B is not consulted here.
4680  template<int B>
4681  KOKKOS_INLINE_FUNCTION
4682  void
4683  operator() (const SingleVectorSchurTag<B> &, const member_type &member) const {
4684  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4685 
      // NOTE(review): unlike the other sub-part kernels, packptr_sub(packidx)
      // is used directly as the part index (no division/modulo by n_parts).
4686  const local_ordinal_type partidx = packptr_sub(packidx);
4687 
4688  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4689 
4690  const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0);
      // Two block rows per pair of neighbouring sub-parts.
4691  const local_ordinal_type nrows = 2*(n_subparts_per_part-1);
4692 
      // Each team owns a contiguous range of nrows block rows in the Schur vector.
4693  const local_ordinal_type r0_schur = nrows * member.league_rank();
4694 
      // NOTE(review): WW is sized blocksize x blocksize here, while run()
      // reserves per-team scratch for blocksize x num_vectors -- confirm the
      // reserved scratch is large enough.
4695  internal_vector_scratch_type_3d_view
4696  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4697 
      // Step 1: for every odd-indexed sub-part (2*schur_sub_part+1), pair up
      // its first two block rows with rows of the Schur vector.
4698  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4699  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4700  for (local_ordinal_type i = 0; i < 2; ++i) {
4701  copy3DView<local_ordinal_type>(member,
4702  Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4703  Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4704  }
4705  }
4706 
      // Step 2: tridiagonal solve on the Schur system, one vector lane at a time.
4707  Kokkos::parallel_for
4708  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4709  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4710  });
4711 
      // Step 3: same row pairing as step 1 with the two views swapped.
4712  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4713  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4714  for (local_ordinal_type i = 0; i < 2; ++i) {
4715  copy3DView<local_ordinal_type>(member,
4716  Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4717  Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4718  }
4719  }
4720  }
4721 
// Team kernel that, for one sub-part per team, corrects every block row of the
// sub-part using the neighbouring row(s) just outside it:
//   v_1(row) := v_1(row) - E(side,row) * v_2
// where v_2 is the row after the sub-part (r0+nrows, side index 0 of
// e_internal_vector_values) and/or the row before it (r0-1, side index 1).
// As in the ApplyC kernel, the first sub-part uses only the row after it, the
// sub-part with index extent(1)-2 uses only the row before it, and all other
// sub-parts use both.
// NOTE(review): the template argument B is not consulted here.
4722  template<int B>
4723  KOKKOS_INLINE_FUNCTION
4724  void
4725  operator() (const SingleVectorApplyETag<B> &, const member_type &member) const {
4726  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4727 
      // subpartidx is decoded as local_subpartidx * n_parts + partidx.
4728  const local_ordinal_type subpartidx = packptr_sub(packidx);
4729  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4730  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4731  const local_ordinal_type partidx = subpartidx%n_parts;
4732  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4733 
4734  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4735  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4736 
      // NOTE(review): WW is constructed but never used in this operator, and is
      // sized blocksize x blocksize while run() reserves scratch for
      // blocksize x num_vectors -- confirm whether it is needed.
4737  internal_vector_scratch_type_3d_view
4738  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4739 
4740  // Compute v_1 = v_1 - E v_2 for each block row v_1 of the sub-part
4741 
4742  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4743 
4744  typedef SolveTridiagsDefaultModeAndAlgo
4745  <typename execution_space::memory_space> default_mode_and_algo_type;
4746 
4747  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4748  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4749 
      // Case 1: first sub-part -- correct with the row after the sub-part.
4750  if (local_subpartidx == 0) {
4751  Kokkos::parallel_for
4752  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4753 
4754  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4755 
4756  for (local_ordinal_type row = 0; row < nrows; ++row) {
4757  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4758  auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4759 
4760  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4761  (default_mode_type,default_algo_type,
4762  member,
4763  blocksize, blocksize,
4764  -one,
4765  E.data(), E.stride_0(), E.stride_1(),
4766  v_2.data(), v_2.stride_0(),
4767  one,
4768  v_1.data(), v_1.stride_0());
4769  }
4770  });
4771  }
      // Case 2: sub-part with index extent(1)-2 -- correct with the row before it.
4772  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4773  Kokkos::parallel_for
4774  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4775  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4776 
4777  for (local_ordinal_type row = 0; row < nrows; ++row) {
4778  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4779  auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4780 
4781  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4782  (default_mode_type,default_algo_type,
4783  member,
4784  blocksize, blocksize,
4785  -one,
4786  E.data(), E.stride_0(), E.stride_1(),
4787  v_2.data(), v_2.stride_0(),
4788  one,
4789  v_1.data(), v_1.stride_0());
4790  }
4791  });
4792  }
      // Case 3: any other sub-part -- apply both corrections, after-row first.
4793  else {
4794  Kokkos::parallel_for
4795  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
      // Correction from the row after the sub-part (side index 0).
4796  {
4797  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4798 
4799  for (local_ordinal_type row = 0; row < nrows; ++row) {
4800  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4801  auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4802 
4803  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4804  (default_mode_type,default_algo_type,
4805  member,
4806  blocksize, blocksize,
4807  -one,
4808  E.data(), E.stride_0(), E.stride_1(),
4809  v_2.data(), v_2.stride_0(),
4810  one,
4811  v_1.data(), v_1.stride_0());
4812  }
4813  }
      // Correction from the row before the sub-part (side index 1).
4814  {
4815  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4816 
4817  for (local_ordinal_type row = 0; row < nrows; ++row) {
4818  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4819  auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4820 
4821  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4822  (default_mode_type,default_algo_type,
4823  member,
4824  blocksize, blocksize,
4825  -one,
4826  E.data(), E.stride_0(), E.stride_1(),
4827  v_2.data(), v_2.stride_0(),
4828  one,
4829  v_1.data(), v_1.stride_0());
4830  }
4831  }
4832  });
4833  }
4834  }
4835 
// Team kernel that only performs the copyToFlatMultiVector step of the
// single-vector solve: one team per pack, one call per vector lane, with
// num_vectors fixed to 1. B is the compile-time block size; B == 0 reads it
// from D_internal_vector_values.extent(1).
4836  template<int B>
4837  KOKKOS_INLINE_FUNCTION
4838  void
4839  operator() (const SingleVectorCopyToFlatTag<B> &, const member_type &member) const {
4840  const local_ordinal_type packidx = member.league_rank();
      // First part of this pack, number of parts in it, and its first packed row.
4841  const local_ordinal_type partidx = packptr(packidx);
4842  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4843  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4844  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4845  const local_ordinal_type num_vectors = 1;
4846 
4847  Kokkos::parallel_for
4848  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4849  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4850  });
4851  }
4852 
// Team kernel that resets this team's entry of Z_scalar_vector to zero.
// Kokkos::single(PerTeam) makes exactly one thread of the team do the write.
// The template argument B is not used.
4853  template<int B>
4854  KOKKOS_INLINE_FUNCTION
4855  void
4856  operator() (const SingleZeroingTag<B> &, const member_type &member) const {
4857  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4858  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4859  });
4860  }
4861 
// Host-side driver: stores the output views in the functor and launches the
// team kernels defined by the operator() overloads of this class.
//
// Parameters:
//   Y   stored in Y_scalar_multivector before any kernel is launched.
//   Z   stored in Z_scalar_vector; the kernels zero one entry per team.
//
// Dispatch:
//   * The runtime block size (D_internal_vector_values.extent(1)) selects a
//     compile-time specialisation B from the switch below; sizes not listed
//     use B = 0.
//   * With more than one vector, the MultiVectorTag kernel is launched.
//   * With one vector and packindices_schur.extent(1) == 0 (or when
//     KOKKOS_ENABLE_DEPRECATED_CODE is defined), the SingleVectorTag kernel is
//     launched.
//   * Otherwise the kernels SingleZeroing, SingleVectorSubLine,
//     SingleVectorApplyC, SingleVectorSchur, SingleVectorApplyE and
//     SingleVectorCopyToFlat are launched in that order.
4862  void run(const impl_scalar_type_2d_view_tpetra &Y,
4863  const impl_scalar_type_1d_view &Z) {
4864  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4865  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4866 
4868  this->Y_scalar_multivector = Y;
4869  this->Z_scalar_vector = Z;
4870 
4871  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4872  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4873 
4874  const local_ordinal_type team_size =
4875  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4876  recommended_team_size(blocksize, vector_length, internal_vector_length);
      // Bytes of level-0 team scratch requested for the kernels that set a
      // scratch size: one blocksize x num_vectors x vector_loop_size view.
      // NOTE(review): the ApplyC, Schur and ApplyE operators construct a
      // blocksize x blocksize x vector_loop_size scratch view, which is larger
      // than this whenever blocksize > num_vectors -- confirm this is intended.
4877  const int per_team_scratch = internal_vector_scratch_type_3d_view
4878  ::shmem_size(blocksize, num_vectors, vector_loop_size);
4879 
      // BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) expands to the kernel
      // launch(es) for compile-time block size B followed by `break`, so it can
      // be used directly as the body of a switch case.
      // NOTE(review): in the non-deprecated variant the SubLine, ApplyC, Schur
      // and ApplyE launches all use the label
      // "SolveTridiags::TeamPolicy::run<SingleVector>", which makes them
      // indistinguishable in profiler output.
      // NOTE(review): `packindices_schur.extent(1) <= 0` compares an extent
      // (presumably unsigned) and is therefore equivalent to `== 0`.
4880 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4881 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4882  if (num_vectors == 1) { \
4883  const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4884  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4885  Kokkos::parallel_for \
4886  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4887  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4888  } else { \
4889  const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4890  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4891  Kokkos::parallel_for \
4892  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4893  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4894  } break
4895 #else
4896 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4897  if (num_vectors == 1) { \
4898  if (packindices_schur.extent(1) <= 0) { \
4899  Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4900  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4901  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4902  Kokkos::parallel_for \
4903  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4904  policy, *this); \
4905  } \
4906  else { \
4907  { \
4908  \
4909  Kokkos::TeamPolicy<execution_space,SingleZeroingTag<B> > \
4910  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4911  Kokkos::parallel_for \
4912  ("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4913  policy, *this); \
4914  } \
4915  { \
4916  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4917  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4918  Kokkos::TeamPolicy<execution_space,SingleVectorSubLineTag<B> > \
4919  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4920  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4921  Kokkos::parallel_for \
4922  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4923  policy, *this); \
4924  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4925  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4926  } \
4927  { \
4928  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4929  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4930  Kokkos::TeamPolicy<execution_space,SingleVectorApplyCTag<B> > \
4931  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4932  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4933  Kokkos::parallel_for \
4934  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4935  policy, *this); \
4936  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4937  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4938  } \
4939  { \
4940  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4941  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4942  Kokkos::TeamPolicy<execution_space,SingleVectorSchurTag<B> > \
4943  policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4944  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4945  Kokkos::parallel_for \
4946  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4947  policy, *this); \
4948  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4949  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4950  } \
4951  { \
4952  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4953  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4954  Kokkos::TeamPolicy<execution_space,SingleVectorApplyETag<B> > \
4955  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4956  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4957  Kokkos::parallel_for \
4958  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4959  policy, *this); \
4960  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4961  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4962  } \
4963  { \
4964  \
4965  Kokkos::TeamPolicy<execution_space,SingleVectorCopyToFlatTag<B> > \
4966  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4967  Kokkos::parallel_for \
4968  ("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4969  policy, *this); \
4970  } \
4971  } \
4972  } else { \
4973  Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4974  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4975  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4976  Kokkos::parallel_for \
4977  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4978  policy, *this); \
4979  } break
4980 #endif
      // Pick the compile-time block size; each case ends with the `break`
      // supplied by the macro, so there is no fall-through.
4981  switch (blocksize) {
4982  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
4983  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
4984  case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 6);
4985  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
4986  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4987  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4988  case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4989  case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4990  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4991  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4992  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4993  case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4994  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
4995  }
4996 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
4997 
4998  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4999  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
5000  }
5001  };
5002 
5006  template<typename MatrixType>
5007  int
5009  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
5010  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
5011  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5012  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
5013  const bool overlap_communication_and_computation,
5014  // tpetra interface
5015  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5016  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5017  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
5018  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5019  // local object interface
5020  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5021  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5022  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5023  /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
5024  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5025  // preconditioner parameters
5026  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5027  /* */ bool is_y_zero,
5028  const int max_num_sweeps,
5029  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5030  const int check_tol_every) {
5031  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
5032 
5033  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5034  using node_memory_space = typename impl_type::node_memory_space;
5035  using local_ordinal_type = typename impl_type::local_ordinal_type;
5036  using size_type = typename impl_type::size_type;
5037  using impl_scalar_type = typename impl_type::impl_scalar_type;
5038  using magnitude_type = typename impl_type::magnitude_type;
5039  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
5040  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
5041  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
5042  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
5043 
5044  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5045 
5046  // either tpetra importer or async importer must be active
5047  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5048  "Neither Tpetra importer nor Async importer is null.");
5049  // max number of sweeps should be positive number
5050  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5051  "Maximum number of sweeps must be >= 1.");
5052 
5053  // const parameters
5054  const bool is_seq_method_requested = !tpetra_importer.is_null();
5055  const bool is_async_importer_active = !async_importer.is_null();
5056  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5057  const magnitude_type tolerance = tol*tol;
5058  const local_ordinal_type blocksize = btdm.values.extent(1);
5059  const local_ordinal_type num_vectors = Y.getNumVectors();
5060  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
5061 
5062  const impl_scalar_type zero(0.0);
5063 
5064  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
5065  "The seq method for applyInverseJacobi, " <<
5066  "which in any case is for developer use only, " <<
5067  "does not support norm-based termination.");
5068  const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
5069  Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
5070  TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
5071  std::invalid_argument,
5072  "The seq method for applyInverseJacobi, " <<
5073  "which in any case is for developer use only, " <<
5074  "only supports memory spaces accessible from host.");
5075 
5076  // if workspace is needed more, resize it
5077  const size_type work_span_required = num_blockrows*num_vectors*blocksize;
5078  if (work.span() < work_span_required)
5079  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
5080 
5081  // construct W
5082  const local_ordinal_type W_size = interf.packptr.extent(0)-1;
5083  if (local_ordinal_type(W.extent(0)) < W_size)
5084  W = impl_scalar_type_1d_view("W", W_size);
5085 
5086  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5087  {
5088  if (is_seq_method_requested) {
5089  if (Z.getNumVectors() != Y.getNumVectors())
5090  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
5091  } else {
5092  if (is_async_importer_active) {
5093  // create comm data buffer and keep it here
5094  async_importer->createDataBuffer(num_vectors);
5095  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5096  }
5097  }
5098  }
5099 
5100  // wrap the workspace with 3d view
5101  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
5102  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5103  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5104  const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
5105  if (is_y_zero) Kokkos::deep_copy(YY, zero);
5106 
5107  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
5108  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
5109  damping_factor, is_norm_manager_active);
5110 
5111  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
5112 
5113 
5114  auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
5115  auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
5116 
5117  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
5118 
5119  // This is OK here to use the graph of the A_crs matrix and a block size of 1
5120  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
5121 
5122  BlockHelperDetails::ComputeResidualVector<MatrixType>
5123  compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5124  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5125  hasBlockCrsMatrix);
5126 
5127  // norm manager workspace resize
5128  if (is_norm_manager_active)
5129  norm_manager.setCheckFrequency(check_tol_every);
5130 
5131  // iterate
5132  int sweep = 0;
5133  for (;sweep<max_num_sweeps;++sweep) {
5134  {
5135  if (is_y_zero) {
5136  // pmv := x(lclrow)
5137  multivector_converter.run(XX);
5138  } else {
5139  if (is_seq_method_requested) {
5140  // SEQ METHOD IS TESTING ONLY
5141 
5142  // y := x - R y
5143  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5144  compute_residual_vector.run(YY, XX, ZZ);
5145 
5146  // pmv := y(lclrow).
5147  multivector_converter.run(YY);
5148  } else {
5149  // fused y := x - R y and pmv := y(lclrow);
5150  // real use case does not use overlap comp and comm
5151  if (overlap_communication_and_computation || !is_async_importer_active) {
5152  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5153  // OverlapTag, compute_owned = true
5154  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
5155  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5156  if (is_async_importer_active) async_importer->cancel();
5157  break;
5158  }
5159  if (is_async_importer_active) {
5160  async_importer->syncRecv();
5161  // OverlapTag, compute_owned = false
5162  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
5163  }
5164  } else {
5165  if (is_async_importer_active)
5166  async_importer->syncExchange(YY);
5167  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5168  // AsyncTag
5169  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5170  }
5171  }
5172  }
5173  }
5174 
5175  // pmv := inv(D) pmv.
5176  {
5177  solve_tridiags.run(YY, W);
5178  }
5179  {
5180  if (is_norm_manager_active) {
5181  // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
5182  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5183  if (sweep + 1 == max_num_sweeps) {
5184  norm_manager.ireduce(sweep, true);
5185  norm_manager.checkDone(sweep + 1, tolerance, true);
5186  } else {
5187  norm_manager.ireduce(sweep);
5188  }
5189  }
5190  }
5191  is_y_zero = false;
5192  }
5193 
5194  //sqrt the norms for the caller's use.
5195  if (is_norm_manager_active) norm_manager.finalize();
5196 
5197  return sweep;
5198  }
5199 
5200  // Implementation of fused block Jacobi for a specific block size,
5201  // or (if B == 0) for a general block size.
      //
      // Flow, as visible in this body:
      //  - rejects the call (through TEUCHOS_TEST_FOR_EXCEPT_MSG) when both importers
      //    are non-null or when max_num_sweeps <= 0;
      //  - grows `work` and `W` on demand; both are non-const references, so a
      //    reallocation replaces the caller's view;
      //  - runs up to max_num_sweeps sweeps, alternating between Y's local device view
      //    and a second buffer that aliases `work`;
      //  - when tol > 0, uses norm_manager to stop early; the threshold handed to
      //    checkDone is tol*tol.
      // Returns the value of the sweep counter when the loop exits: max_num_sweeps if
      // the loop runs to completion, otherwise the index of the sweep at which
      // checkDone reported convergence.
      // `is_y_zero` is taken by value; it only selects the solve-only path for the
      // first sweep and is cleared locally afterwards.
5202  template<typename MatrixType, int B>
5203  int
5204  applyFusedBlockJacobi_Impl(
5205  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5206  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
5207  const bool overlap_communication_and_computation,
5208  // tpetra interface
5209  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5210  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5211  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5212  // local object interface
5213  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5214  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5215  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5216  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5217  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5218  // preconditioner parameters
5219  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5220  /* */ bool is_y_zero,
5221  const int max_num_sweeps,
5222  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5223  const int check_tol_every) {
      // NOTE(review): node_memory_space, impl_scalar_type, local_ordinal_type_1d_view and
      // tpetra_multivector_type are not referenced in the visible body - confirm
      // whether these aliases are still needed.
5224  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5225  using node_memory_space = typename impl_type::node_memory_space;
5226  using local_ordinal_type = typename impl_type::local_ordinal_type;
5227  using size_type = typename impl_type::size_type;
5228  using impl_scalar_type = typename impl_type::impl_scalar_type;
5229  using magnitude_type = typename impl_type::magnitude_type;
5230  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
5231  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
5232  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5233  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
5234 
5235  // the tpetra importer and async importer can't both be active
      // (the check fires when both RCPs are non-null; both being null is accepted)
5236  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5237  "Neither Tpetra importer nor Async importer is null.");
5238  // max number of sweeps should be positive number
5239  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5240  "Maximum number of sweeps must be >= 1.");
5241 
5242  // const parameters
      // Norm-based termination is switched on purely by tol > 0. The threshold is
      // squared here, which matches the "sqrt the norms" step at the end of the function.
5243  const bool is_async_importer_active = !async_importer.is_null();
5244  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5245  const magnitude_type tolerance = tol*tol;
5246  const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5247  const local_ordinal_type num_vectors = Y.getNumVectors();
5248  const local_ordinal_type num_blockrows = interf.nparts;
5249 
      // remote_multivector stays default-constructed unless the async importer is active.
5250  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5251  {
5252  if (is_async_importer_active) {
5253  // create comm data buffer and keep it here
5254  async_importer->createDataBuffer(num_vectors);
5255  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5256  }
5257  }
5258 
5259  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5260  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5261 
      // The residual is split into two passes only when communication is both
      // asynchronous and requested to overlap with computation.
5262  const bool two_pass_residual =
5263  overlap_communication_and_computation && is_async_importer_active;
5264 
5265  // Calculate the required work size and reallocate it if not already big enough.
5266  // Check that our assumptions about YY dimension are correct.
      // NOTE(review): the listing jumps from 5266 to 5268 here, so the line that opens
      // this check is missing from this extract (presumably `TEUCHOS_TEST_FOR_EXCEPT_MSG(`,
      // judging by the message-stream form used above) - restore it from the real header.
5268  size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5269  "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) <<
5270  " = " << YY.extent(0) * YY.extent(1) << ",\n" <<
5271  "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors <<
5272  " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n');
5273  size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5274  if (work.extent(0) < work_required) {
5275  work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required);
5276  }
5277 
      // Second y buffer: a view over work's storage shaped
      // (num_blockrows * blocksize) x num_vectors, i.e. the element count validated above.
5278  Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5279 
5280  // construct W
      // W is reallocated whenever its length differs from num_blockrows (not only when
      // it is too small); it is passed to the functors below and to reduceVector.
5281  if (W.extent(0) != size_t(num_blockrows))
5282  W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows);
5283 
5284  // Create the required functors upfront (this is inexpensive - all shallow copies)
5285  BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
5286  functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
5287  BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
5288  functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5289  BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
5290  functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5291 
5292  // norm manager workspace resize
5293  if (is_norm_manager_active)
5294  norm_manager.setCheckFrequency(check_tol_every);
5295 
5296  // For double-buffering.
5297  // y_buffers[current_y] has the current iterate of y.
5298  // y_buffers[1-current_y] has the next iterate of y.
5299  Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5300  int current_y = 0;
5301 
5302  // iterate
5303  int sweep = 0;
5304  for (;sweep < max_num_sweeps; ++sweep) {
5305  if (is_y_zero) {
5306  // If y is initially zero, then we are just computing y := damping_factor * Dinv * x
5307  functor_solve_only.run(XX, y_buffers[1-current_y]);
5308  } else {
5309  // real use case does not use overlap comp and comm
5310  if (overlap_communication_and_computation || !is_async_importer_active) {
5311  if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5312  if(two_pass_residual) {
5313  // Pass 1 computes owned residual and stores into new y buffer,
5314  // but doesn't apply Dinv or produce a norm yet
5315  functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1-current_y]);
5316  }
5317  else {
5318  // This case happens if running with single rank.
5319  // There are no remote columns, so residual and solve can happen in one step.
      // (Within this outer branch, !two_pass_residual implies the async importer is inactive.)
5320  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1-current_y]);
5321  }
      // Early exit: current_y is NOT flipped on break, so the result remains
      // y_buffers[current_y] and whatever was written to the other buffer above is
      // discarded. The pending async exchange is cancelled before leaving.
5322  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5323  if (is_async_importer_active) async_importer->cancel();
5324  break;
5325  }
      // In this branch is_async_importer_active is equivalent to two_pass_residual.
5326  if (is_async_importer_active) {
5327  async_importer->syncRecv();
5328  // Pass 2 finishes computing the residual, then applies Dinv and computes norm.
5329  functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1-current_y]);
5330  }
5331  } else {
      // Non-overlapped path: the async importer is necessarily active here, the
      // exchange completes before the convergence check and the single-pass kernel.
5332  if (is_async_importer_active)
5333  async_importer->syncExchange(y_buffers[current_y]);
5334  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5335  // Full residual, Dinv apply, and norm in one kernel
5336  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1-current_y]);
5337  }
5338  }
5339 
5340  // Compute global norm.
      // W is reduced into the norm manager's buffer and a reduction is started. On the
      // last sweep ireduce/checkDone are called with an extra `true` argument
      // (presumably to force completion before returning - confirm against NormManager).
5341  if (is_norm_manager_active) {
5342  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5343  if (sweep + 1 == max_num_sweeps) {
5344  norm_manager.ireduce(sweep, true);
5345  norm_manager.checkDone(sweep + 1, tolerance, true);
5346  } else {
5347  norm_manager.ireduce(sweep);
5348  }
5349  }
5350  is_y_zero = false;
5351  // flip y buffers for next iteration, or termination if we reached max_num_sweeps.
5352  current_y = 1 - current_y;
5353  }
5354  if(current_y == 1) {
5355  // We finished iterating with y in the double buffer, so copy it to the user's vector.
5356  Kokkos::deep_copy(YY, y_doublebuf);
5357  }
5358 
5359  //sqrt the norms for the caller's use.
5360  if (is_norm_manager_active) norm_manager.finalize();
5361  return sweep;
5362  }
5363 
// Runtime-to-compile-time dispatcher for fused block Jacobi.
// Reads the block size from btdm.d_inv.extent(1) and forwards every argument
// unchanged to applyFusedBlockJacobi_Impl<MatrixType, B>, where B is the block size
// for the values listed in the switch below and 0 for any other size.
// Returns the sweep count produced by the selected instantiation.
5367  template<typename MatrixType>
5368  int
      // NOTE(review): listing line 5369 (the function-name line) is missing from this
      // extract. The timer label below and the page's symbol index both name this
      // function applyFusedBlockJacobi - restore `applyFusedBlockJacobi(` from the real header.
5370  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5371  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
5372  const bool overlap_communication_and_computation,
5373  // tpetra interface
5374  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5375  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5376  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5377  // local object interface
5378  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5379  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5380  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5381  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5382  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5383  // preconditioner parameters
5384  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5385  /* */ bool is_y_zero,
5386  const int max_num_sweeps,
5387  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5388  const int check_tol_every) {
5389  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5390  int blocksize = btdm.d_inv.extent(1);
5391  int sweep = 0;
      // The helper macro expands to `{ sweep = ...Impl<MatrixType, B>(...); } break`, so
      // each `case` below ends in a break (completed by the trailing `;` at the call
      // site) and there is no fall-through between cases.
5392 #define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) { \
5393  sweep = applyFusedBlockJacobi_Impl<MatrixType, B>( \
5394  tpetra_importer, async_importer, overlap_communication_and_computation, \
5395  X, Y, W, interf, btdm, amd, work, \
5396  norm_manager, damping_factor, is_y_zero, \
5397  max_num_sweeps, tol, check_tol_every); \
5398  } break
      // Block sizes with a dedicated instantiation; anything else takes the general
      // B == 0 implementation.
5399  switch (blocksize) {
5400  case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 3);
5401  case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 5);
5402  case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 7);
5403  case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 9);
5404  case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
5405  case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
5406  case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
5407  case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
5408  case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
5409  default : BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 0);
5410  }
      // Keep the helper macro local to this function.
5411 #undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI
5412 
5413  return sweep;
5414  }
5415 
5416 
// Aggregate holding the state the block tridiagonal container keeps together:
// the communication objects, the local part/tridiagonal/A-minus-D data, the
// fused-Jacobi switch, and the scratch views and norm manager.
// The scratch members are `mutable` so they can be modified through a const object.
5417  template<typename MatrixType>
5418  struct ImplObject {
5419  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5420  using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5421  using block_tridiags_type = BlockTridiags<MatrixType>;
5422  using amd_type = BlockHelperDetails::AmD<MatrixType>;
5423  using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
5424  using async_import_type = AsyncableImport<MatrixType>;
5425 
5426  // distributed objects
      // NOTE(review): listing lines 5427-5429 are missing from this extract, so some
      // member declarations are probably absent here - restore them from the real header.
5430  Teuchos::RCP<async_import_type> async_importer;
5431  bool overlap_communication_and_computation;
5432 
5433  // copy of Y (mutable to penetrate const)
5434  mutable typename impl_type::tpetra_multivector_type Z;
5435  mutable typename impl_type::impl_scalar_type_1d_view W;
5436 
5437  // local objects
5438  part_interface_type part_interface;
5439  block_tridiags_type block_tridiags; // D
5440  amd_type a_minus_d; // R = A - D
5441 
5442  // whether to use fused block Jacobi path
5443  bool use_fused_jacobi;
5444 
5445  // vector workspace is used for general block tridi case
5446  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace (1D view of vector)
5447  // scalar workspace is used for fused block jacobi case
5448  mutable typename impl_type::impl_scalar_type_1d_view work_flat; // right hand side workspace (1D view of scalar)
5449  mutable norm_manager_type norm_manager;
5450  };
5451 
5452  } // namespace BlockTriDiContainerDetails
5453 
5454 } // namespace Ifpack2
5455 
5456 #endif
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:141
void performNumericPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tiny, bool use_fused_jacobi)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3715
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
size_type size() const
size_t size_type
Definition: Ifpack2_BlockHelper.hpp:253
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:890
int applyInverseJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Z, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::vector_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:5008
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
BlockTridiags< MatrixType > createBlockTridiags(const BlockHelperDetails::PartInterface< MatrixType > &interf)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1628
Definition: Ifpack2_BlockHelper.hpp:353
void performSymbolicPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &g, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, BlockHelperDetails::AmD< MatrixType > &amd, const bool overlap_communication_and_computation, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, bool useSeqMethod, bool use_fused_jacobi)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1870
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:97
int applyFusedBlockJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:5369
void send(const Packet sendBuffer[], const Ordinal count, const int destRank, const int tag, const Comm< Ordinal > &comm)
T * getRawPtr() const
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockHelper.hpp:262
Definition: Ifpack2_BlockHelper.hpp:188
BlockHelperDetails::PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::Array< Teuchos::Array< typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type > > &partitions, const typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type n_subparts_per_part_in)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1049
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockHelper.hpp:321
Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:164
RCP< CommRequest< Ordinal > > isend(const ArrayRCP< const Packet > &sendBuffer, const int destRank, const int tag, const Comm< Ordinal > &comm)
#define TEUCHOS_ASSERT(assertion_test)
Definition: Ifpack2_BlockHelper.hpp:215
Definition: Ifpack2_BlockHelper.hpp:249
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1563
Definition: Ifpack2_BlockComputeResidualVector.hpp:23
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3767