Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_BlockTriDiContainer_impl.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
11 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
12 
13 //#define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
14 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
15 
17 
18 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
19 #include <Tpetra_Distributor.hpp>
20 #include <Tpetra_BlockMultiVector.hpp>
21 
22 #include <Kokkos_ArithTraits.hpp>
23 #include <KokkosBatched_Util.hpp>
24 #include <KokkosBatched_Vector.hpp>
25 #include <KokkosBatched_Copy_Decl.hpp>
26 #include <KokkosBatched_Copy_Impl.hpp>
27 #include <KokkosBatched_AddRadial_Decl.hpp>
28 #include <KokkosBatched_AddRadial_Impl.hpp>
29 #include <KokkosBatched_SetIdentity_Decl.hpp>
30 #include <KokkosBatched_SetIdentity_Impl.hpp>
31 #include <KokkosBatched_Gemm_Decl.hpp>
32 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
33 #include <KokkosBatched_Gemm_Team_Impl.hpp>
34 #include <KokkosBatched_Gemv_Decl.hpp>
35 #include <KokkosBatched_Gemv_Team_Impl.hpp>
36 #include <KokkosBatched_Trsm_Decl.hpp>
37 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
38 #include <KokkosBatched_Trsm_Team_Impl.hpp>
39 #include <KokkosBatched_Trsv_Decl.hpp>
40 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
41 #include <KokkosBatched_Trsv_Team_Impl.hpp>
42 #include <KokkosBatched_LU_Decl.hpp>
43 #include <KokkosBatched_LU_Serial_Impl.hpp>
44 #include <KokkosBatched_LU_Team_Impl.hpp>
45 
46 #include <KokkosBlas1_nrm1.hpp>
47 #include <KokkosBlas1_nrm2.hpp>
48 
49 #include <memory>
50 
51 #include "Ifpack2_BlockHelper.hpp"
52 #include "Ifpack2_BlockComputeResidualVector.hpp"
53 
54 //#include <KokkosBlas2_gemv.hpp>
55 
56 // need to interface this into cmake variable (or only use this flag when it is necessary)
57 //#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
58 //#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
59 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
60 #include "cuda_profiler_api.h"
61 #endif
62 
63 // I am not 100% sure about the mpi 3 on cuda
64 #if MPI_VERSION >= 3
65 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
66 #endif
67 
68 // ::: Experiments :::
69 // define either pinned memory or cudamemory for mpi
70 // if both macros are disabled, it will use tpetra memory space which is uvm space for cuda
71 // if defined, this use pinned memory instead of device pointer
72 // by default, we enable pinned memory
73 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
74 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
75 
76 // if defined, all views are allocated on cuda space intead of cuda uvm space
77 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
78 
79 // if defined, btdm_scalar_type is used (if impl_scala_type is double, btdm_scalar_type is float)
80 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
81 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
82 #endif
83 
84 // if defined, it uses multiple execution spaces
85 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
86 
87 namespace Ifpack2 {
88 
89  namespace BlockTriDiContainerDetails {
90 
91  namespace KB = KokkosBatched;
92 
96  using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
97 
98  template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
99  using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
100  MemoryTraitsType::is_random_access |
101  flag>;
102 
103  template <typename ViewType>
104  using Unmanaged = Kokkos::View<typename ViewType::data_type,
105  typename ViewType::array_layout,
106  typename ViewType::device_type,
107  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
108  template <typename ViewType>
109  using Atomic = Kokkos::View<typename ViewType::data_type,
110  typename ViewType::array_layout,
111  typename ViewType::device_type,
112  MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
113  template <typename ViewType>
114  using Const = Kokkos::View<typename ViewType::const_data_type,
115  typename ViewType::array_layout,
116  typename ViewType::device_type,
117  typename ViewType::memory_traits>;
118  template <typename ViewType>
119  using ConstUnmanaged = Const<Unmanaged<ViewType> >;
120 
121  template <typename ViewType>
122  using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
123 
124  template <typename ViewType>
125  using Unmanaged = Kokkos::View<typename ViewType::data_type,
126  typename ViewType::array_layout,
127  typename ViewType::device_type,
128  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
129 
130 
131  template <typename ViewType>
132  using Scratch = Kokkos::View<typename ViewType::data_type,
133  typename ViewType::array_layout,
134  typename ViewType::execution_space::scratch_memory_space,
135  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
136 
140  template<typename T> struct BlockTridiagScalarType { typedef T type; };
141 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
142  template<> struct BlockTridiagScalarType<double> { typedef float type; };
143  //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
144 #endif
145 
146 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
147 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
148  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
149 
150 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
151  { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
152 #else
153 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
155 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
156 #endif
157 
161  template<typename MatrixType>
163  createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
164  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
166  using tpetra_map_type = typename impl_type::tpetra_map_type;
167  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
168  using tpetra_import_type = typename impl_type::tpetra_import_type;
169  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
170  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
171 
172  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
173  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
174 
175  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
176 
177  // This is OK here to use the graph of the A_crs matrix and a block size of 1
178  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
179 
180  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
181  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
182  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
183  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
184  return Teuchos::rcp(new tpetra_import_type(src, tgt));
185  }
186 
187  // Partial replacement for forward-mode MultiVector::doImport.
188  // Permits overlapped communication and computation, but also supports sync'ed.
189  // I'm finding that overlapped comm/comp can give quite poor performance on some
190  // platforms, so we can't just use it straightforwardly always.
191 
192  template<typename MatrixType>
193  struct AsyncableImport {
194  public:
196 
197  private:
201 #if !defined(HAVE_IFPACK2_MPI)
202  typedef int MPI_Request;
203  typedef int MPI_Comm;
204 #endif
205  using scalar_type = typename impl_type::scalar_type;
208 
209  static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
210 #ifdef HAVE_IFPACK2_MPI
211  MPI_Request ureq;
212  int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
213  if (ireq == NULL) MPI_Request_free(&ureq);
214  return ret;
215 #else
216  return 0;
217 #endif
218  }
219 
220  static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
221 #ifdef HAVE_IFPACK2_MPI
222  MPI_Request ureq;
223  int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
224  if (ireq == NULL) MPI_Request_free(&ureq);
225  return ret;
226 #else
227  return 0;
228 #endif
229  }
230 
231  static int waitany(int count, MPI_Request* reqs, int* index) {
232 #ifdef HAVE_IFPACK2_MPI
233  return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
234 #else
235  return 0;
236 #endif
237  }
238 
239  static int waitall(int count, MPI_Request* reqs) {
240 #ifdef HAVE_IFPACK2_MPI
241  return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
242 #else
243  return 0;
244 #endif
245  }
246 
247  public:
248  using tpetra_map_type = typename impl_type::tpetra_map_type;
249  using tpetra_import_type = typename impl_type::tpetra_import_type;
250 
251  using local_ordinal_type = typename impl_type::local_ordinal_type;
252  using global_ordinal_type = typename impl_type::global_ordinal_type;
253  using size_type = typename impl_type::size_type;
254  using impl_scalar_type = typename impl_type::impl_scalar_type;
255 
256  using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
257  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
258 
259  using execution_space = typename impl_type::execution_space;
260  using memory_space = typename impl_type::memory_space;
261  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
262  using size_type_1d_view = typename impl_type::size_type_1d_view;
263  using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
264 
265 #if defined(KOKKOS_ENABLE_CUDA)
266  using impl_scalar_type_1d_view =
267  typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
268 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
269  Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
270 # elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
271  Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
272 # else // no experimental macros are defined
273  typename impl_type::impl_scalar_type_1d_view,
274 # endif
275  typename impl_type::impl_scalar_type_1d_view>::type;
276 #else
277  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
278 #endif
279  using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type*,Kokkos::HostSpace>;
280  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
281  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
282 
283 #ifdef HAVE_IFPACK2_MPI
284  MPI_Comm comm;
285 #endif
286 
287  impl_scalar_type_2d_view_tpetra remote_multivector;
288  local_ordinal_type blocksize;
289 
290  template<typename T>
291  struct SendRecvPair {
292  T send, recv;
293  };
294 
295  // (s)end and (r)eceive data:
296  SendRecvPair<int_1d_view_host> pids; // mpi ranks
297  SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is pointer, cannot use kokkos view
298  SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
299  SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
300  SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
301  SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
302  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host; // data buffer
303 
304  local_ordinal_type_1d_view dm2cm; // permutation
305 
306 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
307  using exec_instance_1d_std_vector = std::vector<execution_space>;
308  exec_instance_1d_std_vector exec_instances;
309 #endif
310 
311  // for cuda
312  public:
313  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
314  const size_type_1d_view &offs) {
315  // wrap lens to kokkos view and deep copy to device
316  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
317  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
318 
319  // exclusive scan
320  const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
321  const local_ordinal_type lens_size = lens_device.extent(0);
322  Kokkos::parallel_scan
323  ("AsyncableImport::RangePolicy::setOffsetValues",
324  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
325  if (final)
326  offs(i) = update;
327  update += (i < lens_size ? lens_device[i] : 0);
328  });
329  }
330 
331  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
332  const size_type_1d_view_host &offs) {
333  // wrap lens to kokkos view and deep copy to device
334  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
335  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
336 
337  // exclusive scan
338  offs(0) = 0;
339  for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
340  offs(i) = offs(i-1) + lens[i-1];
341  }
342  }
343 
344  private:
345  void createMpiRequests(const tpetra_import_type &import) {
346  Tpetra::Distributor &distributor = import.getDistributor();
347 
348  // copy pids from distributor
349  const auto pids_from = distributor.getProcsFrom();
350  pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
351  memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
352 
353  const auto pids_to = distributor.getProcsTo();
354  pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
355  memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
356 
357  // mpi requests
358  reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
359  reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
360 
361  // construct offsets
362 #if 0
363  const auto lengths_to = distributor.getLengthsTo();
364  offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
365 
366  const auto lengths_from = distributor.getLengthsFrom();
367  offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
368 
369  setOffsetValues(lengths_to, offset.send);
370  offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
371 
372  setOffsetValues(lengths_from, offset.recv);
373  offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
374 #else
375  const auto lengths_to = distributor.getLengthsTo();
376  offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
377 
378  const auto lengths_from = distributor.getLengthsFrom();
379  offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
380 
381  setOffsetValuesHost(lengths_to, offset_host.send);
382  //offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
383 
384  setOffsetValuesHost(lengths_from, offset_host.recv);
385  //offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
386 #endif
387  }
388 
389  void createSendRecvIDs(const tpetra_import_type &import) {
390  // For each remote PID, the list of LIDs to receive.
391  const auto remote_lids = import.getRemoteLIDs();
392  const local_ordinal_type_1d_view_host
393  remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
394  lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
395  Kokkos::deep_copy(lids.recv, remote_lids_view_host);
396 
397  // For each export PID, the list of LIDs to send.
398  auto epids = import.getExportPIDs();
399  auto elids = import.getExportLIDs();
400  TEUCHOS_ASSERT(epids.size() == elids.size());
401  lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
402  auto lids_send_host = Kokkos::create_mirror_view(lids.send);
403 
404  // naive search (not sure if pids or epids are sorted)
405  for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
406  const auto pid_send_value = pids.send[i];
407  for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
408  if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
409  TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
410  }
411  Kokkos::deep_copy(lids.send, lids_send_host);
412  }
413 
414  void createExecutionSpaceInstances() {
415 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
416  //The following line creates 8 streams:
417  exec_instances =
418  Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
419 #endif
420  }
421 
422  public:
423  // for cuda, all tag types are public
424  struct ToBuffer {};
425  struct ToMultiVector {};
426 
427  AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
429  const local_ordinal_type blocksize_,
430  const local_ordinal_type_1d_view dm2cm_) {
431  blocksize = blocksize_;
432  dm2cm = dm2cm_;
433 
434 #ifdef HAVE_IFPACK2_MPI
435  comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
436 #endif
437  const tpetra_import_type import(src_map, tgt_map);
438 
439  createMpiRequests(import);
440  createSendRecvIDs(import);
441  createExecutionSpaceInstances();
442  }
443 
444  void createDataBuffer(const local_ordinal_type &num_vectors) {
445  const size_type extent_0 = lids.recv.extent(0)*blocksize;
446  const size_type extent_1 = num_vectors;
447  if (remote_multivector.extent(0) == extent_0 &&
448  remote_multivector.extent(1) == extent_1) {
449  // skip
450  } else {
451  remote_multivector =
452  impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
453 
454  const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
455  const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
456 
457  buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
458  buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
459 
460  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
461  buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
462  buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
463  }
464  }
465  }
466 
467  void cancel () {
468 #ifdef HAVE_IFPACK2_MPI
469  waitall(reqs.recv.size(), reqs.recv.data());
470  waitall(reqs.send.size(), reqs.send.data());
471 #endif
472  }
473 
474  // ======================================================================
475  // Async version using execution space instances
476  // ======================================================================
477 
478 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
479  template<typename PackTag>
480  static
481  void copy(const local_ordinal_type_1d_view &lids_,
482  const impl_scalar_type_1d_view &buffer_,
483  const local_ordinal_type ibeg_,
484  const local_ordinal_type iend_,
485  const impl_scalar_type_2d_view_tpetra &multivector_,
486  const local_ordinal_type blocksize_,
487  const execution_space &exec_instance_) {
488  const local_ordinal_type num_vectors = multivector_.extent(1);
489  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
490  const local_ordinal_type idiff = iend_ - ibeg_;
491  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
492 
493  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
494  local_ordinal_type vector_size(0);
495  if (blocksize_ <= 4) vector_size = 4;
496  else if (blocksize_ <= 8) vector_size = 8;
497  else if (blocksize_ <= 16) vector_size = 16;
498  else vector_size = 32;
499 
500  const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
501  const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
502  Kokkos::parallel_for
503  (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
504  Kokkos::Experimental::require(policy, work_item_property),
505  KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
506  const local_ordinal_type i = member.league_rank();
507  Kokkos::parallel_for
508  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
509  auto aptr = abase + blocksize_*(i + idiff*j);
510  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
511  if (std::is_same<PackTag,ToBuffer>::value)
512  Kokkos::parallel_for
513  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
514  aptr[k] = bptr[k];
515  });
516  else
517  Kokkos::parallel_for
518  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
519  bptr[k] = aptr[k];
520  });
521  });
522  });
523  }
524 
525  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
526  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
527 
528 #ifdef HAVE_IFPACK2_MPI
529  // constants and reallocate data buffers if necessary
530  const local_ordinal_type num_vectors = mv.extent(1);
531  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
532 
533  // 0. post receive async
534  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
535  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
536  irecv(comm,
537  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
538  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
539  pids.recv[i],
540  42,
541  &reqs.recv[i]);
542  }
543  else {
544  irecv(comm,
545  reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
546  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
547  pids.recv[i],
548  42,
549  &reqs.recv[i]);
550  }
551  }
552 
554  execution_space().fence();
555 
556  // 1. async memcpy
557  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
558  // 1.0. enqueue pack buffer
559  if (i<8) exec_instances[i%8].fence();
560  copy<ToBuffer>(lids.send, buffer.send,
561  offset_host.send(i), offset_host.send(i+1),
562  mv, blocksize,
563  //execution_space());
564  exec_instances[i%8]);
565  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
566  //if (i<8) exec_instances[i%8].fence();
567  const local_ordinal_type num_vectors = mv.extent(1);
568  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
569 
570  Kokkos::deep_copy(exec_instances[i%8],
571  Kokkos::subview(buffer_host.send,
572  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
573  offset_host.send(i)*mv_blocksize,
574  offset_host.send(i+1)*mv_blocksize)),
575  Kokkos::subview(buffer.send,
576  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
577  offset_host.send(i)*mv_blocksize,
578  offset_host.send(i+1)*mv_blocksize)));
579  }
580  }
582  //execution_space().fence();
583  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
584  // 1.1. sync the stream and isend
585  if (i<8) exec_instances[i%8].fence();
586  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
587  isend(comm,
588  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
589  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
590  pids.send[i],
591  42,
592  &reqs.send[i]);
593  }
594  else {
595  isend(comm,
596  reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
597  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
598  pids.send[i],
599  42,
600  &reqs.send[i]);
601  }
602  }
603 
604  // 2. poke communication
605  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
606  int flag;
607  MPI_Status stat;
608  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
609  }
610 #endif // HAVE_IFPACK2_MPI
611  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
612  }
613 
614  void syncRecvVar1() {
615  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
616 #ifdef HAVE_IFPACK2_MPI
617  // 0. wait for receive async.
618  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
619  local_ordinal_type idx = i;
620 
621  // 0.0. wait any
622  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
623 
624  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
625  const local_ordinal_type num_vectors = remote_multivector.extent(1);
626  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
627 
628  Kokkos::deep_copy(
629  Kokkos::subview(buffer.recv,
630  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
631  offset_host.recv(idx)*mv_blocksize,
632  offset_host.recv(idx+1)*mv_blocksize)),
633  Kokkos::subview(buffer_host.recv,
634  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
635  offset_host.recv(idx)*mv_blocksize,
636  offset_host.recv(idx+1)*mv_blocksize)));
637  }
638 
639  // 0.1. unpack data after data is moved into a device
640  copy<ToMultiVector>(lids.recv, buffer.recv,
641  offset_host.recv(idx), offset_host.recv(idx+1),
642  remote_multivector, blocksize,
643  exec_instances[idx%8]);
644  }
645 
646  // 1. fire up all cuda events
647  Kokkos::fence();
648 
649  // 2. cleanup all open comm
650  waitall(reqs.send.size(), reqs.send.data());
651 #endif // HAVE_IFPACK2_MPI
652  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
653  }
654 #endif //defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
655 
656  // ======================================================================
657  // Generic version without using execution space instances
658  // - only difference between device and host architecture is on using team
659  // or range policies.
660  // ======================================================================
661  template<typename PackTag>
662  static
663  void copy(const local_ordinal_type_1d_view &lids_,
664  const impl_scalar_type_1d_view &buffer_,
665  const local_ordinal_type &ibeg_,
666  const local_ordinal_type &iend_,
667  const impl_scalar_type_2d_view_tpetra &multivector_,
668  const local_ordinal_type blocksize_) {
669  const local_ordinal_type num_vectors = multivector_.extent(1);
670  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
671  const local_ordinal_type idiff = iend_ - ibeg_;
672  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
673  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
674  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
675  local_ordinal_type vector_size(0);
676  if (blocksize_ <= 4) vector_size = 4;
677  else if (blocksize_ <= 8) vector_size = 8;
678  else if (blocksize_ <= 16) vector_size = 16;
679  else vector_size = 32;
680  const team_policy_type policy(idiff, 1, vector_size);
681  Kokkos::parallel_for
682  ("AsyncableImport::TeamPolicy::copy",
683  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
684  const local_ordinal_type i = member.league_rank();
685  Kokkos::parallel_for
686  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
687  auto aptr = abase + blocksize_*(i + idiff*j);
688  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
689  if (std::is_same<PackTag,ToBuffer>::value)
690  Kokkos::parallel_for
691  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
692  aptr[k] = bptr[k];
693  });
694  else
695  Kokkos::parallel_for
696  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
697  bptr[k] = aptr[k];
698  });
699  });
700  });
701  } else {
702  const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
703  Kokkos::parallel_for
704  ("AsyncableImport::RangePolicy::copy",
705  policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
706  const local_ordinal_type i = ij%idiff;
707  const local_ordinal_type j = ij/idiff;
708  auto aptr = abase + blocksize_*(i + idiff*j);
709  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
710  auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
711  auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
712  memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
713  });
714  }
715  }
716 
717 
721  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
722  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
723 
724 #ifdef HAVE_IFPACK2_MPI
725  // constants and reallocate data buffers if necessary
726  const local_ordinal_type num_vectors = mv.extent(1);
727  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
728 
729  // receive async
730  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
731  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
732  irecv(comm,
733  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
734  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
735  pids.recv[i],
736  42,
737  &reqs.recv[i]);
738  }
739  else {
740  irecv(comm,
741  reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
742  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
743  pids.recv[i],
744  42,
745  &reqs.recv[i]);
746  }
747  }
748 
749  // send async
750  for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
751  copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
752  mv, blocksize);
753  Kokkos::fence();
754  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
755  isend(comm,
756  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
757  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
758  pids.send[i],
759  42,
760  &reqs.send[i]);
761  }
762  else {
763  Kokkos::deep_copy(
764  Kokkos::subview(buffer_host.send,
765  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
766  offset_host.send(i)*mv_blocksize,
767  offset_host.send(i+1)*mv_blocksize)),
768  Kokkos::subview(buffer.send,
769  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
770  offset_host.send(i)*mv_blocksize,
771  offset_host.send(i+1)*mv_blocksize)));
772  isend(comm,
773  reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
774  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
775  pids.send[i],
776  42,
777  &reqs.send[i]);
778  }
779  }
780 
781  // I find that issuing an Iprobe seems to nudge some MPIs into action,
782  // which helps with overlapped comm/comp performance.
783  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
784  int flag;
785  MPI_Status stat;
786  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
787  }
788 #endif
789  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
790  }
791 
792  void syncRecvVar0() {
793  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
794 #ifdef HAVE_IFPACK2_MPI
795  // receive async.
796  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
797  local_ordinal_type idx = i;
798  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
799  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
800  const local_ordinal_type num_vectors = remote_multivector.extent(1);
801  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
802  Kokkos::deep_copy(
803  Kokkos::subview(buffer.recv,
804  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
805  offset_host.recv(idx)*mv_blocksize,
806  offset_host.recv(idx+1)*mv_blocksize)),
807  Kokkos::subview(buffer_host.recv,
808  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
809  offset_host.recv(idx)*mv_blocksize,
810  offset_host.recv(idx+1)*mv_blocksize)));
811  }
812  copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
813  remote_multivector, blocksize);
814  }
815  // wait on the sends to match all Isends with a cleanup operation.
816  waitall(reqs.send.size(), reqs.send.data());
817 #endif
818  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
819  }
820 
824  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
825 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
826 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
827  asyncSendRecvVar1(mv);
828 #else
829  asyncSendRecvVar0(mv);
830 #endif
831 #else
832  asyncSendRecvVar0(mv);
833 #endif
834  }
835  void syncRecv() {
836 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
837 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
838  syncRecvVar1();
839 #else
840  syncRecvVar0();
841 #endif
842 #else
843  syncRecvVar0();
844 #endif
845  }
846 
847  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
848  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
849  asyncSendRecv(mv);
850  syncRecv();
851  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
852  }
853 
854  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
855  };
856 
857  template <typename ViewType1, typename ViewType2>
858  struct are_same_struct {
859  ViewType1 keys1;
860  ViewType2 keys2;
861 
862  are_same_struct(ViewType1 keys1_, ViewType2 keys2_) : keys1(keys1_), keys2(keys2_) {}
863  KOKKOS_INLINE_FUNCTION
864  void operator()(int i, unsigned int& count) const {
865  if (keys1(i) != keys2(i)) count++;
866  }
867  };
868 
869  template <typename ViewType1, typename ViewType2>
870  bool are_same (ViewType1 keys1, ViewType2 keys2) {
871  unsigned int are_same_ = 0;
872 
873  Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
874  are_same_struct(keys1, keys2),
875  are_same_);
876  return are_same_==0;
877  }
878 
882  template<typename MatrixType>
884  createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
885  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
887  using tpetra_map_type = typename impl_type::tpetra_map_type;
888  using local_ordinal_type = typename impl_type::local_ordinal_type;
889  using global_ordinal_type = typename impl_type::global_ordinal_type;
890  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
891  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
892  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
893  using global_indices_array_device_type = Kokkos::View<const global_ordinal_type*, typename tpetra_map_type::device_type>;
894 
895  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
896  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
897 
898  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
899 
900  // This is OK here to use the graph of the A_crs matrix and a block size of 1
901  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
902 
903  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
904  const auto domain_map = g.getDomainMap();
905  const auto column_map = g.getColMap();
906 
907  std::vector<global_ordinal_type> gids;
908 
909  Kokkos::Subview<global_indices_array_device_type, std::pair<int,int>> column_map_global_iD_last;
910 
911  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
912  {
913  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);
914 
915  global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
916  global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();
917 
918  if(are_same(domain_map_global_iD, column_map_global_iD)) {
919  // this should be the most likely path
920  separate_remotes = true;
921  need_owned_permutation = false;
922 
923  column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
924  std::pair<int,int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
925  }
926  else {
927  // This loop is relatively expensive
928  for (size_t i=0;i<column_map->getLocalNumElements();++i) {
929  const global_ordinal_type gid = column_map->getGlobalElement(i);
930  if (!domain_map->isNodeGlobalElement(gid)) {
931  found_first = true;
932  gids.push_back(gid);
933  } else if (found_first) {
934  separate_remotes = false;
935  break;
936  }
937  if (!found_first && !need_owned_permutation &&
938  domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
939  // The owned part of the domain and column maps are different
940  // orderings. We *could* do a super efficient impl of this case in the
941  // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
942  // really, if a caller cares about speed, they wouldn't make different
943  // local permutations like this. So we punt on the best impl and go for
944  // a pretty good one: the permutation is done in place in
945  // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
946  // is the presumably worse memory access pattern of the input vector.
947  need_owned_permutation = true;
948  }
949  }
950  }
951  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
952  }
953 
954  if (separate_remotes) {
955  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
957  const auto parsimonious_col_map
958  = need_owned_permutation ?
959  Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())):
960  Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
961  if (parsimonious_col_map->getGlobalNumElements() > 0) {
962  // make the importer only if needed.
963  local_ordinal_type_1d_view dm2cm;
964  if (need_owned_permutation) {
965  dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
966  const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
967  for (size_t i=0;i<domain_map->getLocalNumElements();++i)
968  dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
969  Kokkos::deep_copy(dm2cm, dm2cm_host);
970  }
971  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
972  return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
973  }
974  }
975  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
976  return Teuchos::null;
977  }
978 
979  template<typename local_ordinal_type>
980  local_ordinal_type costTRSM(const local_ordinal_type block_size) {
981  return block_size*block_size;
982  }
983 
984  template<typename local_ordinal_type>
985  local_ordinal_type costGEMV(const local_ordinal_type block_size) {
986  return 2*block_size*block_size;
987  }
988 
989  template<typename local_ordinal_type>
990  local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
991  return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length-1) * costGEMV(block_size);
992  }
993 
994  template<typename local_ordinal_type>
995  local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
996  const local_ordinal_type num_teams,
997  const local_ordinal_type line_length,
998  const local_ordinal_type block_size,
999  const local_ordinal_type n_subparts_per_part) {
1000  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part-1) * 2) / n_subparts_per_part);
1001  if (subline_length < 1) {
1002  return INT_MAX;
1003  }
1004 
1005  const local_ordinal_type p_n_lines = ceil(double(num_parts)/num_teams);
1006  const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part)*num_parts/num_teams);
1007  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part-1)*num_parts/num_teams);
1008 
1009  const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
1010  const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part-1)*2,block_size);
1011  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length,block_size);
1012  const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
1013 
1014  if (n_subparts_per_part == 1) {
1015  return p_costApplyAinv;
1016  }
1017  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
1018  }
1019 
1020  template<typename local_ordinal_type>
1021  local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
1022  const local_ordinal_type num_teams,
1023  const local_ordinal_type line_length,
1024  const local_ordinal_type block_size) {
1025  local_ordinal_type n_subparts_per_part_0 = 1;
1026  local_ordinal_type flop_0 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0);
1027  local_ordinal_type flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0+1);
1028  while (flop_0 > flop_1) {
1029  flop_0 = flop_1;
1030  flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, (++n_subparts_per_part_0)+1);
1031  }
1032  return n_subparts_per_part_0;
1033  }
1034 
1035  template<typename ArgActiveExecutionMemorySpace>
1036  struct SolveTridiagsDefaultModeAndAlgo;
1037 
1041  template<typename MatrixType>
1042  BlockHelperDetails::PartInterface<MatrixType>
1043  createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1044  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
1045  const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type> > &partitions,
1046  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1047  IFPACK2_BLOCKHELPER_TIMER("createPartInterface", createPartInterface);
1048  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1049  using local_ordinal_type = typename impl_type::local_ordinal_type;
1050  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1051  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
1052  using size_type = typename impl_type::size_type;
1053 
1054  auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
1055 
1056  TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
1057  const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
1058  constexpr int vector_length = impl_type::vector_length;
1059  constexpr int internal_vector_length = impl_type::internal_vector_length;
1060 
1061  const auto comm = A->getRowMap()->getComm();
1062 
1063  BlockHelperDetails::PartInterface<MatrixType> interf;
1064 
1065  const bool jacobi = partitions.size() == 0;
1066  const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
1067  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1068 
1069  typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1070  std::vector<size_idx_pair_type> partsz(nparts);
1071 
1072  if (!jacobi) {
1073  for (local_ordinal_type i=0;i<nparts;++i)
1074  partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1075  std::sort(partsz.begin(), partsz.end(),
1076  [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
1077  return x.first > y.first;
1078  });
1079  }
1080 
1081  local_ordinal_type n_subparts_per_part;
1082  if (n_subparts_per_part_in == -1) {
1083  // If the number of subparts is set to -1, the user let the algorithm
1084  // decides the value automatically
1085  using execution_space = typename impl_type::execution_space;
1086 
1087  const int line_length = partsz[0].first;
1088 
1089  const local_ordinal_type team_size =
1090  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1091  recommended_team_size(blocksize, vector_length, internal_vector_length);
1092 
1093  const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
1094 
1095  n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1096 
1097 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1098  printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1099 #endif
1100  }
1101  else {
1102  n_subparts_per_part = n_subparts_per_part_in;
1103  }
1104 
1105  // Total number of sub lines:
1106  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1107  // Total number of sub lines + the Schur complement blocks.
1108  // For a given live 2 sub lines implies one Schur complement, 3 sub lines implies two Schur complements etc.
1109  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1);
1110 
1111 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1112  local_ordinal_type nrows = 0;
1113  if (jacobi)
1114  nrows = nparts;
1115  else
1116  for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1117 
1119  (nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1120  << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
1121 #endif
1122 
1123  // permutation vector
1124  std::vector<local_ordinal_type> p;
1125  if (jacobi) {
1126  interf.max_partsz = 1;
1127  interf.max_subpartsz = 0;
1128  interf.n_subparts_per_part = 1;
1129  interf.nparts = nparts;
1130  } else {
1131  // reorder parts to maximize simd packing efficiency
1132  p.resize(nparts);
1133 
1134  for (local_ordinal_type i=0;i<nparts;++i)
1135  p[i] = partsz[i].second;
1136 
1137  interf.max_partsz = partsz[0].first;
1138 
1139  constexpr local_ordinal_type connection_length = 2;
1140  const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1141  const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1142 
1143  interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1144  interf.n_subparts_per_part = n_subparts_per_part;
1145  interf.nparts = nparts;
1146  }
1147 
1148  // allocate parts
1149  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1150  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1151  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1152  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1153  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1154 
1155  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1156  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1157  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1158 
1159  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);
1160 
1161  // mirror to host and compute on host execution space
1162  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1163  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1164 
1165  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1166  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1167  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1168  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1169 
1170  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1171  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1172  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1173 
1174  // Determine parts.
1175  interf.row_contiguous = true;
1176  partptr(0) = 0;
1177  part2rowidx0(0) = 0;
1178  part2packrowidx0(0) = 0;
1179  local_ordinal_type pack_nrows = 0;
1180  local_ordinal_type pack_nrows_sub = 0;
1181  if (jacobi) {
1182  IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
1183  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1184  constexpr local_ordinal_type ipnrows = 1;
1185  //assume No overlap.
1186  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1187  // Since parts are ordered in decreasing size, the size of the first
1188  // part in a pack is the size for all parts in the pack.
1189  if (ip % vector_length == 0) pack_nrows = ipnrows;
1190  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1191  const local_ordinal_type offset = partptr(ip);
1192  for (local_ordinal_type i=0;i<ipnrows;++i) {
1193  const auto lcl_row = ip;
1194  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1195  BlockHelperDetails::get_msg_prefix(comm)
1196  << "partitions[" << p[ip] << "]["
1197  << i << "] = " << lcl_row
1198  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1199  << "].");
1200  lclrow(offset+i) = lcl_row;
1201  rowidx2part(offset+i) = ip;
1202  if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1203  interf.row_contiguous = false;
1204  }
1205  partptr(ip+1) = offset + ipnrows;
1206  }
1207  part2rowidx0_sub(0) = 0;
1208  partptr_sub(0, 0) = 0;
1209 
1210  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1211  constexpr local_ordinal_type ipnrows = 1;
1212  const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1213 
1215  (full_line_length != ipnrows, std::logic_error,
1216  "In the part " << ip );
1217 
1218  constexpr local_ordinal_type connection_length = 2;
1219 
1220  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1222  (true, std::logic_error,
1223  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1224 
1225  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1226  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1227 
1228  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1229 
1230  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1231  const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1232  const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1233  if (local_sub_ip != n_subparts_per_part-1) {
1234  if (local_sub_ip != 0) {
1235  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1236  }
1237  else if (ip != 0) {
1238  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1239  }
1240  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1241  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1242  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1243 
1244  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1245  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1246 
1247 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1248  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1249  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1250 #endif
1251  }
1252  else {
1253  if (local_sub_ip != 0) {
1254  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1255  }
1256  else if (ip != 0) {
1257  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1258  }
1259  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1260 
1261  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1262 
1263 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1264  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1265 #endif
1266  }
1267  }
1268  }
1269 
1270 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1271  std::cout << "partptr_sub = " << std::endl;
1272  for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1273  for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1274  std::cout << partptr_sub(i,j) << " ";
1275  }
1276  std::cout << std::endl;
1277  }
1278  std::cout << "partptr_sub end" << std::endl;
1279 #endif
1280 
1281  {
1282  local_ordinal_type npacks = ceil(float(nparts)/vector_length);
1283 
1284  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1285  for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1286  part2packrowidx0_sub(ip, 0) = 0;
1287  }
1288  for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1289  if (ipack != 0) {
1290  local_ordinal_type ip_min = ipack*vector_length;
1291  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1292  for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1293  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1294  }
1295  }
1296 
1297  for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1298  local_ordinal_type ip_min = ipack*vector_length;
1299  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1300 
1301  const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1302 
1303  constexpr local_ordinal_type connection_length = 2;
1304 
1305  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1306  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1307 
1308  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1309  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1310  if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1311 
1312  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1313 
1314  for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1315  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1316  }
1317  }
1318  }
1319 
1320  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1321  }
1322  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1323  } else {
1324  IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
1325  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1326  const auto* part = &partitions[p[ip]];
1327  const local_ordinal_type ipnrows = part->size();
1328  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1329  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1330  BlockHelperDetails::get_msg_prefix(comm)
1331  << "partition " << p[ip]
1332  << " is empty, which is not allowed.");
1333  //assume No overlap.
1334  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1335  // Since parts are ordered in decreasing size, the size of the first
1336  // part in a pack is the size for all parts in the pack.
1337  if (ip % vector_length == 0) pack_nrows = ipnrows;
1338  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1339  const local_ordinal_type offset = partptr(ip);
1340  for (local_ordinal_type i=0;i<ipnrows;++i) {
1341  const auto lcl_row = (*part)[i];
1342  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1343  BlockHelperDetails::get_msg_prefix(comm)
1344  << "partitions[" << p[ip] << "]["
1345  << i << "] = " << lcl_row
1346  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1347  << "].");
1348  lclrow(offset+i) = lcl_row;
1349  rowidx2part(offset+i) = ip;
1350  if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1351  interf.row_contiguous = false;
1352  }
1353  partptr(ip+1) = offset + ipnrows;
1354 
1355 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1356  printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1357  printf("partptr(%d+1) = %d\n", ip, partptr(ip+1));
1358 #endif
1359  }
1360 
1361  part2rowidx0_sub(0) = 0;
1362  partptr_sub(0, 0) = 0;
1363  //const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1364 
1365  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1366  const auto* part = &partitions[p[ip]];
1367  const local_ordinal_type ipnrows = part->size();
1368  const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1369 
1371  (full_line_length != ipnrows, std::logic_error,
1372  "In the part " << ip );
1373 
1374  constexpr local_ordinal_type connection_length = 2;
1375 
1376  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1378  (true, std::logic_error,
1379  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1380 
1381  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1382  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1383 
1384  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1385 
1386  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1387  const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1388  const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1389  if (local_sub_ip != n_subparts_per_part-1) {
1390  if (local_sub_ip != 0) {
1391  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1392  }
1393  else if (ip != 0) {
1394  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1395  }
1396  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1397  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1398  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1399 
1400  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1401  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1402 
1403 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1404  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1405  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1406 #endif
1407  }
1408  else {
1409  if (local_sub_ip != 0) {
1410  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1411  }
1412  else if (ip != 0) {
1413  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1414  }
1415  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1416 
1417  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1418 
1419 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1420  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1421 #endif
1422  }
1423  }
1424  }
1425 
1426  {
1427  local_ordinal_type npacks = ceil(float(nparts)/vector_length);
1428 
1429  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1430  for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1431  part2packrowidx0_sub(ip, 0) = 0;
1432  }
1433  for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1434  if (ipack != 0) {
1435  local_ordinal_type ip_min = ipack*vector_length;
1436  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1437  for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1438  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1439  }
1440  }
1441 
1442  for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1443  local_ordinal_type ip_min = ipack*vector_length;
1444  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1445 
1446  const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1447 
1448  constexpr local_ordinal_type connection_length = 2;
1449 
1450  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1451  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1452 
1453  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1454  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1455  if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1456 
1457  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1458 
1459  for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1460  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1461  }
1462  }
1463  }
1464 
1465  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1466  }
1467  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1468  }
1469 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1470  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1471 #endif
1472  if (lclrow(0) != 0) interf.row_contiguous = false;
1473 
1474  Kokkos::deep_copy(interf.partptr, partptr);
1475  Kokkos::deep_copy(interf.lclrow, lclrow);
1476 
1477  Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1478 
1479  //assume No overlap. Thus:
1480  interf.part2rowidx0 = interf.partptr;
1481  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1482 
1483  interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1484  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1485 
1486  { // Fill packptr.
1487  IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
1488  local_ordinal_type npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1489  npacks = 0;
1490  for (local_ordinal_type ip=1;ip<=nparts;++ip) //n_sub_parts_and_schur
1491  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1492  ++npacks;
1493 
1494  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1495  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1496  packptr(0) = 0;
1497  for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1498  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1499  packptr(k++) = ip;
1500 
1501  Kokkos::deep_copy(interf.packptr, packptr);
1502 
1503  local_ordinal_type npacks_per_subpart = ceil(float(nparts)/vector_length);
1504  npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1505 
1506  interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart*n_subparts_per_part);
1507  interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart,n_subparts_per_part-1);
1508 
1509  const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1510  const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1511 
1512 
1513  // Fill packindices_sub and packindices_schur
1514  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part-1;++local_sub_ip) {
1515  for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1516  packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1517  packindices_schur(local_pack_ip,local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1518  }
1519  }
1520 
1521  for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1522  packindices_sub((n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip;
1523  }
1524 
1525 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1526  std::cout << "packindices_sub = " << std::endl;
1527  for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1528  std::cout << packindices_sub(i) << " ";
1529  }
1530  std::cout << std::endl;
1531  std::cout << "packindices_sub end" << std::endl;
1532 
1533  std::cout << "packindices_schur = " << std::endl;
1534  for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1535  for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1536  std::cout << packindices_schur(i,j) << " ";
1537  }
1538  std::cout << std::endl;
1539  }
1540 
1541  std::cout << "packindices_schur end" << std::endl;
1542 #endif
1543 
1544  Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1545  Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1546 
1547  interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1548  const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1549  packptr_sub(0) = 0;
1550  for (local_ordinal_type k=0;k<npacks + 1;++k)
1551  packptr_sub(k) = packptr(k%npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1552 
1553  Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1554  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1555  }
1556  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1557 
1558  return interf;
1559  }
1560 
1564  template <typename MatrixType>
1565  struct BlockTridiags {
1567  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1568  using size_type_1d_view = typename impl_type::size_type_1d_view;
1569  using size_type_2d_view = typename impl_type::size_type_2d_view;
1570  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1571  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1572 
1573  // flat_td_ptr(i) is the index into flat-array values of the start of the
1574  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1575  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1576  // vector_length is the position in the pack.
1577  size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1578  // List of local column indices into A from which to grab
1579  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1580  local_ordinal_type_1d_view A_colindsub;
1581  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1582  // tridiag's pack, and i % vector_length gives the position in the pack.
1583  vector_type_3d_view values;
1584  // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1585  // Schur's pack, and i % vector_length gives the position in the pack.
1586  vector_type_3d_view values_schur;
1587  // inv(A_00)*A_01 block values.
1588  vector_type_4d_view e_values;
1589 
1590  bool is_diagonal_only;
1591 
1592  BlockTridiags() = default;
1593  BlockTridiags(const BlockTridiags &b) = default;
1594 
1595  // Index into row-major block of a tridiag.
1596  template <typename idx_type>
1597  static KOKKOS_FORCEINLINE_FUNCTION
1598  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1599  // Given a row of a row-major tridiag, return the index of the first block
1600  // in that row.
1601  template <typename idx_type>
1602  static KOKKOS_FORCEINLINE_FUNCTION
1603  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1604  // Number of blocks in a tridiag having a given number of rows.
1605  template <typename idx_type>
1606  static KOKKOS_FORCEINLINE_FUNCTION
1607  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
1608  // Number of blocks associated to a Schur complement having a given number of rows.
1609  template <typename idx_type>
1610  static KOKKOS_FORCEINLINE_FUNCTION
1611  idx_type NumBlocksSchur (const idx_type& nrows) { return nrows > 0 ? 3*nrows + 2 : 0; }
1612  };
1613 
1614 
1618  template<typename MatrixType>
1620  createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1621  IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
1622  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1623  using execution_space = typename impl_type::execution_space;
1624  using local_ordinal_type = typename impl_type::local_ordinal_type;
1625  using size_type = typename impl_type::size_type;
1626  using size_type_2d_view = typename impl_type::size_type_2d_view;
1627 
1628  constexpr int vector_length = impl_type::vector_length;
1629 
1631 
1632  const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1633 
1634  { // construct the flat index pointers into the tridiag values array.
1635  btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1636  const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part );
1637  Kokkos::parallel_scan
1638  ("createBlockTridiags::RangePolicy::flat_td_ptr",
1639  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1640  const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part);
1641  const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1642 
1643  if (final) {
1644  btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1645  }
1646  if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) {
1647  const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0);
1648  if (local_subpartidx % 2 == 0)
1649  update += btdm.NumBlocks(nrows);
1650  else
1651  update += btdm.NumBlocksSchur(nrows);
1652  }
1653  });
1654 
1655  const auto nblocks = Kokkos::create_mirror_view_and_copy
1656  (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1));
1657  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1658  }
1659 
1660  // And the packed index pointers.
1661  if (vector_length == 1) {
1662  btdm.pack_td_ptr = btdm.flat_td_ptr;
1663  } else {
1664  //const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1665 
1666  local_ordinal_type npacks_per_subpart = 0;
1667  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1668  Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1669  for (local_ordinal_type ip=1;ip<=interf.nparts;++ip) //n_sub_parts_and_schur
1670  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1671  ++npacks_per_subpart;
1672 
1673  btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1674  const Kokkos::RangePolicy<execution_space> policy(0,npacks_per_subpart);
1675 
1676  Kokkos::parallel_for
1677  ("createBlockTridiags::RangePolicy::pack_td_ptr",
1678  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1679  for (local_ordinal_type j = 0; j < 2*interf.n_subparts_per_part; ++j) {
1680  const local_ordinal_type pack_id = ( j == 2*interf.n_subparts_per_part-1 ) ? i+(j-1)*npacks_per_subpart : i+j*npacks_per_subpart;
1681  const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id+1) - interf.packptr_sub(pack_id);
1682 
1683  const local_ordinal_type parti = interf.packptr_sub(pack_id);
1684  const local_ordinal_type partidx = parti%interf.nparts;
1685 
1686  for (local_ordinal_type pti=0;pti<nparts_in_pack;++pti) {
1687  btdm.pack_td_ptr(partidx+pti, j) = btdm.flat_td_ptr(i, j);
1688  }
1689  }
1690  });
1691  }
1692 
1693  btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1694 
1695  const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1696  constexpr local_ordinal_type connection_length = 2;
1697 
1698  host_pack_td_ptr_schur(0,0) = 0;
1699  for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1700  if (i % vector_length == 0) {
1701  if (i != 0)
1702  host_pack_td_ptr_schur(i,0) = host_pack_td_ptr_schur(i-1,host_pack_td_ptr_schur.extent(1)-1);
1703  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part-1; ++j) {
1704  host_pack_td_ptr_schur(i,j+1) = host_pack_td_ptr_schur(i,j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part-2 ? 1 : 0);
1705  }
1706  }
1707  else {
1708  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1709  host_pack_td_ptr_schur(i,j) = host_pack_td_ptr_schur(i-1,j);
1710  }
1711  }
1712  }
1713 
1714  Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1715 
1716 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1717  const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1718  std::cout << "flat_td_ptr = " << std::endl;
1719  for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1720  for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1721  std::cout << host_flat_td_ptr(i,j) << " ";
1722  }
1723  std::cout << std::endl;
1724  }
1725  std::cout << "flat_td_ptr end" << std::endl;
1726 
1727  const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1728 
1729  std::cout << "pack_td_ptr = " << std::endl;
1730  for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1731  for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1732  std::cout << host_pack_td_ptr(i,j) << " ";
1733  }
1734  std::cout << std::endl;
1735  }
1736  std::cout << "pack_td_ptr end" << std::endl;
1737 
1738 
1739  std::cout << "pack_td_ptr_schur = " << std::endl;
1740  for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1741  for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1742  std::cout << host_pack_td_ptr_schur(i,j) << " ";
1743  }
1744  std::cout << std::endl;
1745  }
1746  std::cout << "pack_td_ptr_schur end" << std::endl;
1747 #endif
1748 
1749  // values and A_colindsub are created in the symbolic phase
1750  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1751 
1752  return btdm;
1753  }
1754 
1755  // Set the tridiags to be I to the full pack block size. That way, if a
1756  // tridiag within a pack is shorter than the longest one, the extra blocks are
1757  // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1758  // in the packed multvector are 0, and the tridiag LU reflects the extra I
1759  // blocks, then the solve proceeds as though the extra blocks aren't
1760  // present. Since this extra work is part of the SIMD calls, it's not actually
1761  // extra work. Instead, it means we don't have to put checks or masks in, or
1762  // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1763  // since the numeric phase fills in only the used entries, leaving these I
1764  // blocks intact.
1765  template<typename MatrixType>
1766  void
1767  setTridiagsToIdentity
1768  (const BlockTridiags<MatrixType>& btdm,
1769  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1770  {
1771  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1772  using execution_space = typename impl_type::execution_space;
1773  using local_ordinal_type = typename impl_type::local_ordinal_type;
1774  using size_type_2d_view = typename impl_type::size_type_2d_view;
1775 
1776  const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1777  const local_ordinal_type blocksize = btdm.values.extent(1);
1778 
1779  {
1780  const int vector_length = impl_type::vector_length;
1781  const int internal_vector_length = impl_type::internal_vector_length;
1782 
1783  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1784  using internal_vector_type = typename impl_type::internal_vector_type;
1785  using internal_vector_type_4d_view =
1786  typename impl_type::internal_vector_type_4d_view;
1787 
1788  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1789  const internal_vector_type_4d_view values
1790  (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1791  btdm.values.extent(0),
1792  btdm.values.extent(1),
1793  btdm.values.extent(2),
1794  vector_length/internal_vector_length);
1795  const local_ordinal_type vector_loop_size = values.extent(3);
1796 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1797  local_ordinal_type total_team_size(0);
1798  if (blocksize <= 5) total_team_size = 32;
1799  else if (blocksize <= 9) total_team_size = 64;
1800  else if (blocksize <= 12) total_team_size = 96;
1801  else if (blocksize <= 16) total_team_size = 128;
1802  else if (blocksize <= 20) total_team_size = 160;
1803  else total_team_size = 160;
1804  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1805  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1806 #elif defined(KOKKOS_ENABLE_HIP)
1807  // FIXME: HIP
1808  // These settings might be completely wrong
1809  // will have to do some experiments to decide
1810  // what makes sense on AMD GPUs
1811  local_ordinal_type total_team_size(0);
1812  if (blocksize <= 5) total_team_size = 32;
1813  else if (blocksize <= 9) total_team_size = 64;
1814  else if (blocksize <= 12) total_team_size = 96;
1815  else if (blocksize <= 16) total_team_size = 128;
1816  else if (blocksize <= 20) total_team_size = 160;
1817  else total_team_size = 160;
1818  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1819  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1820 #elif defined(KOKKOS_ENABLE_SYCL)
1821  // SYCL: FIXME
1822  local_ordinal_type total_team_size(0);
1823  if (blocksize <= 5) total_team_size = 32;
1824  else if (blocksize <= 9) total_team_size = 64;
1825  else if (blocksize <= 12) total_team_size = 96;
1826  else if (blocksize <= 16) total_team_size = 128;
1827  else if (blocksize <= 20) total_team_size = 160;
1828  else total_team_size = 160;
1829  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1830  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1831 #else
1832  // Host architecture: team size is always one
1833  const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1834 #endif
1835  Kokkos::parallel_for
1836  ("setTridiagsToIdentity::TeamPolicy",
1837  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1838  const local_ordinal_type k = member.league_rank();
1839  const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0);
1840  const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1);
1841 
1842  const local_ordinal_type diff = iend - ibeg;
1843  const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1844  const btdm_scalar_type one(1);
1845  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1846  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1847  const local_ordinal_type i = ibeg + ii*3;
1848  for (local_ordinal_type j=0;j<blocksize;++j) {
1849  values(i,j,j,v) = one;
1850  }
1851  });
1852  });
1853  });
1854  }
1855  }
1856 
1860  template<typename MatrixType>
1861  void
1862  performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1863  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1864  const BlockHelperDetails::PartInterface<MatrixType> &interf,
1867  const bool overlap_communication_and_computation,
1868  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
1869  bool useSeqMethod) {
1870  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);
1871 
1872  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1873 
1874  using execution_space = typename impl_type::execution_space;
1875  using host_execution_space = typename impl_type::host_execution_space;
1876 
1877  using local_ordinal_type = typename impl_type::local_ordinal_type;
1878  using global_ordinal_type = typename impl_type::global_ordinal_type;
1879  using size_type = typename impl_type::size_type;
1880  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1881  using size_type_1d_view = typename impl_type::size_type_1d_view;
1882  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1883  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1884  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
1885  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1886 
1887  constexpr int vector_length = impl_type::vector_length;
1888 
1889  const auto comm = A->getRowMap()->getComm();
1890 
1891  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1892  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1893 
1894  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
1895  TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1896  const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows()/g->getLocalNumRows();
1897 
1898  // mirroring to host
1899  const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1900  const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1901  const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1902  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1903  const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1904 
1905  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1906 
1907  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());
1908 
1909  // find column to row map on host
1910 
1911  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1912  {
1913  const auto rowmap = g->getRowMap();
1914  const auto colmap = g->getColMap();
1915  const auto dommap = g->getDomainMap();
1916  TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1917 
1918 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1919  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1920  Kokkos::parallel_for
1921  ("performSymbolicPhase::RangePolicy::col2row",
1922  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1923  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1925  if (dommap->isNodeGlobalElement(gid)) {
1926  const local_ordinal_type lc = colmap->getLocalElement(gid);
1927 # if defined(BLOCKTRIDICONTAINER_DEBUG)
1929  BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
1930  << " gives an invalid local column.");
1931 # endif
1932  col2row(lc) = lr;
1933  }
1934  });
1935 #endif
1936  }
1937 
1938  // construct the D and R graphs in A = D + R.
1939  {
1940  const auto local_graph = g->getLocalGraphHost();
1941  const auto local_graph_rowptr = local_graph.row_map;
1942  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1943  const auto local_graph_colidx = local_graph.entries;
1944 
1945  //assume no overlap.
1946 
1947  Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1948  {
1949  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1950  Kokkos::parallel_for
1951  ("performSymbolicPhase::RangePolicy::lclrow2idx",
1952  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1953  lclrow2idx[lclrow(i)] = i;
1954  });
1955  }
1956 
1957  // count (block) nnzs in D and R.
1959  typename sum_reducer_type::value_type sum_reducer_value;
1960  {
1961  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1962  Kokkos::parallel_reduce
1963  // profiling interface does not work
1964  (//"performSymbolicPhase::RangePolicy::count_nnz",
1965  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1966  // LID -> index.
1967  const local_ordinal_type ri0 = lclrow2idx[lr];
1968  const local_ordinal_type pi0 = rowidx2part(ri0);
1969  for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1970  const local_ordinal_type lc = local_graph_colidx(j);
1971  const local_ordinal_type lc2r = col2row[lc];
1972  bool incr_R = false;
1973  do { // breakable
1974  if (lc2r == (local_ordinal_type) -1) {
1975  incr_R = true;
1976  break;
1977  }
1978  const local_ordinal_type ri = lclrow2idx[lc2r];
1979  const local_ordinal_type pi = rowidx2part(ri);
1980  if (pi != pi0) {
1981  incr_R = true;
1982  break;
1983  }
1984  // Test for being in the tridiag. This is done in index space. In
1985  // LID space, tridiag LIDs in a row are not necessarily related by
1986  // {-1, 0, 1}.
1987  if (ri0 + 1 >= ri && ri0 <= ri + 1)
1988  ++update.v[0]; // D_nnz
1989  else
1990  incr_R = true;
1991  } while (0);
1992  if (incr_R) {
1993  if (lc < nrows) ++update.v[1]; // R_nnz_owned
1994  else ++update.v[2]; // R_nnz_remote
1995  }
1996  }
1997  }, sum_reducer_type(sum_reducer_value));
1998  }
1999  size_type D_nnz = sum_reducer_value.v[0];
2000  size_type R_nnz_owned = sum_reducer_value.v[1];
2001  size_type R_nnz_remote = sum_reducer_value.v[2];
2002 
2003  if (!overlap_communication_and_computation) {
2004  R_nnz_owned += R_nnz_remote;
2005  R_nnz_remote = 0;
2006  }
2007 
2008  // construct the D_00 graph.
2009  {
2010  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2011 
2012  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
2013  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2014 
2015 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2016  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2017 #endif
2018 
2019  const local_ordinal_type nparts = partptr.extent(0) - 1;
2020 
2021  {
2022  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2023  Kokkos::parallel_for
2024  ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2025  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
2026  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2027  local_ordinal_type offset = 0;
2028  for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
2029  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2030  offset = 1;
2031  const local_ordinal_type lr0 = lclrow(ri0);
2032  const size_type j0 = local_graph_rowptr(lr0);
2033  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
2034  const local_ordinal_type lc = local_graph_colidx(j);
2035  const local_ordinal_type lc2r = col2row[lc];
2036  if (lc2r == (local_ordinal_type) -1) continue;
2037  const local_ordinal_type ri = lclrow2idx[lc2r];
2038  const local_ordinal_type pi = rowidx2part(ri);
2039  if (pi != pi0) continue;
2040  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
2041  const local_ordinal_type row_entry = j - j0;
2042  D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry;
2043  }
2044  }
2045  });
2046  }
2047 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2048  for (size_t i=0;i<D_A_colindsub.extent(0);++i)
2050 #endif
2051  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2052 
2053  // Allocate values.
2054  {
2055  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1);
2056  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2057  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2058 
2059  if (interf.n_subparts_per_part > 1) {
2060  const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1);
2061  const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2062  btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2063  }
2064 
2065  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2066  }
2067  }
2068 
2069  // Construct the R graph.
2070  {
2071  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2072  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2073 
2074  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2075  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2076 
2077  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2078  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2079 
2080  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2081  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2082 
2083  {
2084  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
2085  Kokkos::parallel_for
2086  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2087  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2088  const local_ordinal_type ri0 = lclrow2idx[lr];
2089  const local_ordinal_type pi0 = rowidx2part(ri0);
2090  const size_type j0 = local_graph_rowptr(lr);
2091  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2092  const local_ordinal_type lc = local_graph_colidx(j);
2093  const local_ordinal_type lc2r = col2row[lc];
2094  if (lc2r != (local_ordinal_type) -1) {
2095  const local_ordinal_type ri = lclrow2idx[lc2r];
2096  const local_ordinal_type pi = rowidx2part(ri);
2097  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2098  continue;
2099  }
2100  }
2101  // exclusive scan will be performed later
2102  if (!overlap_communication_and_computation || lc < nrows) {
2103  ++R_rowptr(lr);
2104  } else {
2105  ++R_rowptr_remote(lr);
2106  }
2107  }
2108  });
2109  }
2110 
2111  // exclusive scan
2113  {
2114  Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
2115  Kokkos::parallel_scan
2116  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2117  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
2118  update_type &update,
2119  const bool &final) {
2120  update_type val;
2121  val.v[0] = R_rowptr(lr);
2122  if (overlap_communication_and_computation)
2123  val.v[1] = R_rowptr_remote(lr);
2124 
2125  if (final) {
2126  R_rowptr(lr) = update.v[0];
2127  if (overlap_communication_and_computation)
2128  R_rowptr_remote(lr) = update.v[1];
2129 
2130  if (lr < nrows) {
2131  const local_ordinal_type ri0 = lclrow2idx[lr];
2132  const local_ordinal_type pi0 = rowidx2part(ri0);
2133 
2134  size_type cnt_rowptr = R_rowptr(lr);
2135  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
2136 
2137  const size_type j0 = local_graph_rowptr(lr);
2138  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2139  const local_ordinal_type lc = local_graph_colidx(j);
2140  const local_ordinal_type lc2r = col2row[lc];
2141  if (lc2r != (local_ordinal_type) -1) {
2142  const local_ordinal_type ri = lclrow2idx[lc2r];
2143  const local_ordinal_type pi = rowidx2part(ri);
2144  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2145  continue;
2146  }
2147  const local_ordinal_type row_entry = j - j0;
2148  if (!overlap_communication_and_computation || lc < nrows)
2149  R_A_colindsub(cnt_rowptr++) = row_entry;
2150  else
2151  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2152  }
2153  }
2154  }
2155  update += val;
2156  });
2157  }
2158  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
2159  Kokkos::deep_copy(amd.rowptr, R_rowptr);
2160  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2161  if (overlap_communication_and_computation) {
2162  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
2163  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2164  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2165  }
2166 
2167  // Allocate or view values.
2168  if (hasBlockCrsMatrix)
2169  amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst());
2170  else {
2171  amd.tpetra_values = (const_cast<crs_matrix_type*>(A_crs.get()))->getLocalValuesDevice (Tpetra::Access::ReadWrite);
2172  }
2173  }
2174 
2175  // Allocate view for E and initialize the values with B:
2176 
2177  if (interf.n_subparts_per_part > 1)
2178  btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2179  }
2180  // Precompute offsets of each A and x entry to speed up residual.
2181  // Applies if all of these are true:
2182  // - hasBlockCrsMatrix
2183  // - execution_space is a GPU
2184  // - !useSeqMethod (since this uses a different scheme for indexing A,x)
2185  //
2186  // Reading A, x take up to 4 and 6 levels of indirection respectively,
2187  // but precomputing the offsets reduces it to 2 for both (get index, then value)
2188  if(BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix)
2189  {
2190  bool is_async_importer_active = !async_importer.is_null();
2191  local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2192  bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2193  BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2194  }
2195 
2196  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2197  }
2198 
2199 
2203  template<typename ArgActiveExecutionMemorySpace>
2205 
2206  template<>
2207  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2208  typedef KB::Mode::Serial mode_type;
2209 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2210  typedef KB::Algo::Level3::CompactMKL algo_type;
2211 #else
2212  typedef KB::Algo::Level3::Blocked algo_type;
2213 #endif
2214  static int recommended_team_size(const int /* blksize */,
2215  const int /* vector_length */,
2216  const int /* internal_vector_length */) {
2217  return 1;
2218  }
2219 
2220  };
2221 
2222 #if defined(KOKKOS_ENABLE_CUDA)
2223  static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
2224  const int vector_length,
2225  const int internal_vector_length) {
2226  const int vector_size = vector_length/internal_vector_length;
2227  int total_team_size(0);
2228  if (blksize <= 5) total_team_size = 32;
2229  else if (blksize <= 9) total_team_size = 32; // 64
2230  else if (blksize <= 12) total_team_size = 96;
2231  else if (blksize <= 16) total_team_size = 128;
2232  else if (blksize <= 20) total_team_size = 160;
2233  else total_team_size = 160;
2234  return 2*total_team_size/vector_size;
2235  }
2236  template<>
2237  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2238  typedef KB::Mode::Team mode_type;
2239  typedef KB::Algo::Level3::Unblocked algo_type;
2240  static int recommended_team_size(const int blksize,
2241  const int vector_length,
2242  const int internal_vector_length) {
2243  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2244  }
2245  };
2246  template<>
2247  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2248  typedef KB::Mode::Team mode_type;
2249  typedef KB::Algo::Level3::Unblocked algo_type;
2250  static int recommended_team_size(const int blksize,
2251  const int vector_length,
2252  const int internal_vector_length) {
2253  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2254  }
2255  };
2256 #endif
2257 
2258 #if defined(KOKKOS_ENABLE_HIP)
2259  static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
2260  const int vector_length,
2261  const int internal_vector_length) {
2262  const int vector_size = vector_length/internal_vector_length;
2263  int total_team_size(0);
2264  if (blksize <= 5) total_team_size = 32;
2265  else if (blksize <= 9) total_team_size = 32; // 64
2266  else if (blksize <= 12) total_team_size = 96;
2267  else if (blksize <= 16) total_team_size = 128;
2268  else if (blksize <= 20) total_team_size = 160;
2269  else total_team_size = 160;
2270  return 2*total_team_size/vector_size;
2271  }
2272  template<>
2273  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2274  typedef KB::Mode::Team mode_type;
2275  typedef KB::Algo::Level3::Unblocked algo_type;
2276  static int recommended_team_size(const int blksize,
2277  const int vector_length,
2278  const int internal_vector_length) {
2279  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2280  }
2281  };
2282  template<>
2283  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2284  typedef KB::Mode::Team mode_type;
2285  typedef KB::Algo::Level3::Unblocked algo_type;
2286  static int recommended_team_size(const int blksize,
2287  const int vector_length,
2288  const int internal_vector_length) {
2289  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2290  }
2291  };
2292 #endif
2293 
2294 #if defined(KOKKOS_ENABLE_SYCL)
2295  static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
2296  const int vector_length,
2297  const int internal_vector_length) {
2298  const int vector_size = vector_length/internal_vector_length;
2299  int total_team_size(0);
2300  if (blksize <= 5) total_team_size = 32;
2301  else if (blksize <= 9) total_team_size = 32; // 64
2302  else if (blksize <= 12) total_team_size = 96;
2303  else if (blksize <= 16) total_team_size = 128;
2304  else if (blksize <= 20) total_team_size = 160;
2305  else total_team_size = 160;
2306  return 2*total_team_size/vector_size;
2307  }
2308  template<>
2309  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2310  typedef KB::Mode::Team mode_type;
2311  typedef KB::Algo::Level3::Unblocked algo_type;
2312  static int recommended_team_size(const int blksize,
2313  const int vector_length,
2314  const int internal_vector_length) {
2315  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2316  }
2317  };
2318  template<>
2319  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2320  typedef KB::Mode::Team mode_type;
2321  typedef KB::Algo::Level3::Unblocked algo_type;
2322  static int recommended_team_size(const int blksize,
2323  const int vector_length,
2324  const int internal_vector_length) {
2325  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2326  }
2327  };
2328 #endif
2329 
2330  template<typename impl_type, typename WWViewType>
2331  KOKKOS_INLINE_FUNCTION
2332  void
2333  solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2334  const typename impl_type::local_ordinal_type &/* blocksize */,
2335  const typename impl_type::local_ordinal_type &i0,
2336  const typename impl_type::local_ordinal_type &r0,
2337  const typename impl_type::local_ordinal_type &nrows,
2338  const typename impl_type::local_ordinal_type &v,
2339  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2340  const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2341  const WWViewType &WW,
2342  const bool skip_first_pass=false) {
2343  using execution_space = typename impl_type::execution_space;
2344  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2345  using member_type = typename team_policy_type::member_type;
2346  using local_ordinal_type = typename impl_type::local_ordinal_type;
2347 
2348  typedef SolveTridiagsDefaultModeAndAlgo
2349  <typename execution_space::memory_space> default_mode_and_algo_type;
2350 
2351  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2352  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2353 
2354  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2355 
2356  // constant
2357  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2358  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2359 
2360  // subview pattern
2361  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2362  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2363  auto X2 = X1;
2364 
2365  local_ordinal_type i = i0, r = r0;
2366 
2367 
2368  if (nrows > 1) {
2369  // solve Lx = x
2370  if (skip_first_pass) {
2371  i += (nrows-2) * 3;
2372  r += (nrows-2);
2373  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2374  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2375  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2376  KB::Trsm<member_type,
2377  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2378  default_mode_type,default_algo_type>
2379  ::invoke(member, one, A, X2);
2380  X1.assign_data( X2.data() );
2381  i+=3;
2382  }
2383  else {
2384  KB::Trsm<member_type,
2385  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2386  default_mode_type,default_algo_type>
2387  ::invoke(member, one, A, X1);
2388  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2389  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2390  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2391  member.team_barrier();
2392  KB::Gemm<member_type,
2393  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2394  default_mode_type,default_algo_type>
2395  ::invoke(member, -one, A, X1, one, X2);
2396  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2397  KB::Trsm<member_type,
2398  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2399  default_mode_type,default_algo_type>
2400  ::invoke(member, one, A, X2);
2401  X1.assign_data( X2.data() );
2402  }
2403  }
2404 
2405  // solve Ux = x
2406  KB::Trsm<member_type,
2407  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2408  default_mode_type,default_algo_type>
2409  ::invoke(member, one, A, X1);
2410  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2411  i -= 3;
2412  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2413  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2414  member.team_barrier();
2415  KB::Gemm<member_type,
2416  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2417  default_mode_type,default_algo_type>
2418  ::invoke(member, -one, A, X1, one, X2);
2419 
2420  A.assign_data( &D_internal_vector_values(i,0,0,v) );
2421  KB::Trsm<member_type,
2422  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2423  default_mode_type,default_algo_type>
2424  ::invoke(member, one, A, X2);
2425  X1.assign_data( X2.data() );
2426  }
2427  } else {
2428  // matrix is already inverted
2429  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2430  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2431  ::invoke(member, X1, W);
2432  member.team_barrier();
2433  KB::Gemm<member_type,
2434  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2435  default_mode_type,default_algo_type>
2436  ::invoke(member, one, A, W, zero, X1);
2437  }
2438 
2439  }
2440 
2441  template<typename impl_type, typename WWViewType, typename XViewType>
2442  KOKKOS_INLINE_FUNCTION
2443  void
2444  solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2445  const typename impl_type::local_ordinal_type &blocksize,
2446  const typename impl_type::local_ordinal_type &i0,
2447  const typename impl_type::local_ordinal_type &r0,
2448  const typename impl_type::local_ordinal_type &nrows,
2449  const typename impl_type::local_ordinal_type &v,
2450  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2451  const XViewType &X_internal_vector_values, //Unmanaged<typename impl_type::internal_vector_type_4d_view>
2452  const WWViewType &WW) {
2453  using execution_space = typename impl_type::execution_space;
2454  //using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2455  //using member_type = typename team_policy_type::member_type;
2456  using local_ordinal_type = typename impl_type::local_ordinal_type;
2457 
2458  typedef SolveTridiagsDefaultModeAndAlgo
2459  <typename execution_space::memory_space> default_mode_and_algo_type;
2460 
2461  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2462  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2463 
2464  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2465 
2466  // base pointers
2467  auto A = D_internal_vector_values.data();
2468  auto X = X_internal_vector_values.data();
2469 
2470  // constant
2471  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2472  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2473  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2474 
2475  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2476  const local_ordinal_type astep = D_internal_vector_values.stride_0();
2477  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2478  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2479  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2480  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2481 
2482  // move to starting point
2483  A += i0*astep + v;
2484  X += r0*xstep + v;
2485 
2486  //for (local_ordinal_type col=0;col<num_vectors;++col)
2487  if (nrows > 1) {
2488  // solve Lx = x
2489  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2490  (default_mode_type,default_algo_type,
2491  member,
2492  KB::Diag::Unit,
2493  blocksize,blocksize,
2494  one,
2495  A, as0, as1,
2496  X, xs0);
2497 
2498  for (local_ordinal_type tr=1;tr<nrows;++tr) {
2499  member.team_barrier();
2500  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2501  (default_mode_type,default_algo_type,
2502  member,
2503  blocksize, blocksize,
2504  -one,
2505  A+2*astep, as0, as1,
2506  X, xs0,
2507  one,
2508  X+1*xstep, xs0);
2509  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2510  (default_mode_type,default_algo_type,
2511  member,
2512  KB::Diag::Unit,
2513  blocksize,blocksize,
2514  one,
2515  A+3*astep, as0, as1,
2516  X+1*xstep, xs0);
2517 
2518  A += 3*astep;
2519  X += 1*xstep;
2520  }
2521 
2522  // solve Ux = x
2523  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2524  (default_mode_type,default_algo_type,
2525  member,
2526  KB::Diag::NonUnit,
2527  blocksize, blocksize,
2528  one,
2529  A, as0, as1,
2530  X, xs0);
2531 
2532  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2533  A -= 3*astep;
2534  member.team_barrier();
2535  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2536  (default_mode_type,default_algo_type,
2537  member,
2538  blocksize, blocksize,
2539  -one,
2540  A+1*astep, as0, as1,
2541  X, xs0,
2542  one,
2543  X-1*xstep, xs0);
2544  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2545  (default_mode_type,default_algo_type,
2546  member,
2547  KB::Diag::NonUnit,
2548  blocksize, blocksize,
2549  one,
2550  A, as0, as1,
2551  X-1*xstep,xs0);
2552  X -= 1*xstep;
2553  }
2554  // for multiple rhs
2555  //X += xs1;
2556  } else {
2557  const local_ordinal_type ws0 = WW.stride_0();
2558  auto W = WW.data() + v;
2559  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2560  (default_mode_type,
2561  member, blocksize, X, xs0, W, ws0);
2562  member.team_barrier();
2563  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2564  (default_mode_type,default_algo_type,
2565  member,
2566  blocksize, blocksize,
2567  one,
2568  A, as0, as1,
2569  W, xs0,
2570  zero,
2571  X, xs0);
2572  }
2573  }
2574 
2575  template<typename local_ordinal_type, typename ViewType>
2576  void writeBTDValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2577 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2578  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2579  std::ofstream myfile;
2580  myfile.open (fileName);
2581 
2582  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2583  local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2584  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2585  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2586 
2587  const local_ordinal_type block_size = scalar_values.extent(1);
2588 
2589  const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size;
2590  const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2591 
2592  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2593 
2594  myfile << "%%MatrixMarket matrix coordinate real general"<< std::endl;
2595  myfile << "%%nnz = " << nnz;
2596  myfile << " block size = " << block_size;
2597  myfile << " number of blocks = " << n_blocks;
2598  myfile << " number of parts = " << n_parts;
2599  myfile << " number of blocks per part = " << n_blocks_per_part;
2600  myfile << " number of rows = " << n_rows ;
2601  myfile << " number of cols = " << n_rows;
2602  myfile << " number of packs = " << n_packs << std::endl;
2603 
2604  myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2605 
2606  local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2607  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2608  for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2609  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2610  for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2611  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2612  if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2613  continue;
2614  if (i_block_in_part % 3 == 0) {
2615  current_row_offset = i_block_in_part/3 * block_size;
2616  current_col_offset = i_block_in_part/3 * block_size;
2617  }
2618  else if (i_block_in_part % 3 == 1) {
2619  current_row_offset = (i_block_in_part-1)/3 * block_size;
2620  current_col_offset = ((i_block_in_part-1)/3+1) * block_size;
2621  }
2622  else if (i_block_in_part % 3 == 2) {
2623  current_row_offset = ((i_block_in_part-2)/3+1) * block_size;
2624  current_col_offset = (i_block_in_part-2)/3 * block_size;
2625  }
2626  current_row_offset += current_part_idx * n_rows_per_part;
2627  current_col_offset += current_part_idx * n_rows_per_part;
2628  for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2629  for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2630  current_row = current_row_offset + i_in_block + 1;
2631  current_col = current_col_offset + j_in_block + 1;
2632  myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2633  }
2634  }
2635  }
2636  }
2637  }
2638 
2639  myfile.close();
2640 #endif
2641  }
2642 
2643  template<typename local_ordinal_type, typename ViewType>
2644  void write4DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2645 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2646  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2647  std::ofstream myfile;
2648  myfile.open (fileName);
2649 
2650  const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2651  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2652  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2653 
2654  const local_ordinal_type block_size = scalar_values.extent(1);
2655  const local_ordinal_type n_cols = scalar_values.extent(2);
2656 
2657  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2658  const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2659 
2660  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2661 
2662 
2663  myfile << "%%MatrixMarket matrix array real general"<< std::endl;
2664  myfile << "%%block size = " << block_size;
2665  myfile << " number of blocks = " << n_blocks;
2666  myfile << " number of parts = " << n_parts;
2667  myfile << " number of blocks per part = " << n_blocks_per_part;
2668  myfile << " number of rows = " << n_rows ;
2669  myfile << " number of cols = " << n_cols;
2670  myfile << " number of packs = " << n_packs << std::endl;
2671 
2672  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2673 
2674  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2675  (void) current_row_offset;
2676  (void) current_part_idx;
2677  for (local_ordinal_type j_in_block=0;j_in_block<n_cols;++j_in_block) {
2678  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2679  for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2680  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2681  for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2682  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2683 
2684  if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2685  continue;
2686  for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2687  myfile << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2688  }
2689  }
2690  }
2691  }
2692  }
2693  myfile.close();
2694 #endif
2695  }
2696 
2697  template<typename local_ordinal_type, typename ViewType>
2698  void write5DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2699 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2700  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2701  std::ofstream myfile;
2702  myfile.open (fileName);
2703 
2704  const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2705  const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack;
2706  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2707 
2708  const local_ordinal_type block_size = scalar_values.extent(2);
2709  const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2710  const local_ordinal_type n_cols = n_blocks_cols * block_size;
2711 
2712  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2713  const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2714 
2715  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2716 
2717  myfile << "%%MatrixMarket matrix array real general"<< std::endl;
2718  myfile << "%%block size = " << block_size;
2719  myfile << " number of blocks = " << n_blocks;
2720  myfile << " number of parts = " << n_parts;
2721  myfile << " number of blocks per part = " << n_blocks_per_part;
2722  myfile << " number of rows = " << n_rows ;
2723  myfile << " number of cols = " << n_cols;
2724  myfile << " number of packs = " << n_packs << std::endl;
2725 
2726  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2727 
2728  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2729  (void) current_row_offset;
2730  (void) current_part_idx;
2731  for (local_ordinal_type i_block_col=0;i_block_col<n_blocks_cols;++i_block_col) {
2732  for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2733  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2734  for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2735  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2736  for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2737  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2738 
2739  if (current_block_idx >= (local_ordinal_type) scalar_values.extent(1))
2740  continue;
2741  for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2742  myfile << scalar_values(i_block_col,current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2743  }
2744  }
2745  }
2746  }
2747  }
2748  }
2749  myfile.close();
2750 #endif
2751  }
2752 
2753  template<typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2754  KOKKOS_INLINE_FUNCTION
2755  void
2756  copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2757 /*
2758  // Kokkos::Experimental::local_deep_copy
2759  auto teamVectorRange =
2760  Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2761  member, view1.extent(0), view1.extent(1), view1.extent(2));
2762 
2763  Kokkos::parallel_for
2764  (teamVectorRange,
2765  [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2766  view1(i,j,k) = view2(i,j,k);
2767  });
2768 */
2769  Kokkos::Experimental::local_deep_copy(member, view1, view2);
2770  }
2771  template<typename MatrixType, int ScratchLevel>
2772  struct ExtractAndFactorizeTridiags {
2773  public:
2774  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2775  // a functor cannot have both device_type and execution_space; specialization error in kokkos
2776  using execution_space = typename impl_type::execution_space;
2777  using memory_space = typename impl_type::memory_space;
2779  using local_ordinal_type = typename impl_type::local_ordinal_type;
2780  using size_type = typename impl_type::size_type;
2781  using impl_scalar_type = typename impl_type::impl_scalar_type;
2782  using magnitude_type = typename impl_type::magnitude_type;
2784  using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
2785  using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
2787  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2788  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2789  using size_type_2d_view = typename impl_type::size_type_2d_view;
2790  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2792  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2793  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2794  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2795  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2796  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2797  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2798  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2799  using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2800  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2801  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2802 
2803  using internal_vector_type = typename impl_type::internal_vector_type;
2804  static constexpr int vector_length = impl_type::vector_length;
2805  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2806  static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2807  static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2808 
2810  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2811  using member_type = typename team_policy_type::member_type;
2812 
2813  private:
2814  // part interface
2815  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2816  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2817  const local_ordinal_type max_partsz;
2818  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
2819  using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
2820  ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2821  ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2822  ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2823  // block tridiags
2824  const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2825  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2826  const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2827  const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2828  const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2829  const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2830  // shared information
2831  const local_ordinal_type blocksize, blocksize_square;
2832  // diagonal safety
2833  const magnitude_type tiny;
2834  const local_ordinal_type vector_loop_size;
2835 
2836  bool hasBlockCrsMatrix;
2837 
2838  public:
2839  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2840  const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2843  const magnitude_type& tiny_) :
2844  // interface
2845  partptr(interf_.partptr),
2846  lclrow(interf_.lclrow),
2847  packptr(interf_.packptr),
2848  packindices_sub(interf_.packindices_sub),
2849  packptr_sub(interf_.packptr_sub),
2850  partptr_sub(interf_.partptr_sub),
2851  part2packrowidx0_sub(interf_.part2packrowidx0_sub),
2852  packindices_schur(interf_.packindices_schur),
2853  max_partsz(interf_.max_partsz),
2854  // block tridiags
2855  pack_td_ptr(btdm_.pack_td_ptr),
2856  flat_td_ptr(btdm_.flat_td_ptr),
2857  pack_td_ptr_schur(btdm_.pack_td_ptr_schur),
2858  A_colindsub(btdm_.A_colindsub),
2859  internal_vector_values((internal_vector_type*)btdm_.values.data(),
2860  btdm_.values.extent(0),
2861  btdm_.values.extent(1),
2862  btdm_.values.extent(2),
2863  vector_length/internal_vector_length),
2864  internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(),
2865  btdm_.values_schur.extent(0),
2866  btdm_.values_schur.extent(1),
2867  btdm_.values_schur.extent(2),
2868  vector_length/internal_vector_length),
2869  e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(),
2870  btdm_.e_values.extent(0),
2871  btdm_.e_values.extent(1),
2872  btdm_.e_values.extent(2),
2873  btdm_.e_values.extent(3),
2874  vector_length/internal_vector_length),
2875  scalar_values((btdm_scalar_type*)btdm_.values.data(),
2876  btdm_.values.extent(0),
2877  btdm_.values.extent(1),
2878  btdm_.values.extent(2),
2879  vector_length),
2880  scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(),
2881  btdm_.values_schur.extent(0),
2882  btdm_.values_schur.extent(1),
2883  btdm_.values_schur.extent(2),
2884  vector_length),
2885  e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(),
2886  btdm_.e_values.extent(0),
2887  btdm_.e_values.extent(1),
2888  btdm_.e_values.extent(2),
2889  btdm_.e_values.extent(3),
2890  vector_length),
2891  blocksize(btdm_.values.extent(1)),
2892  blocksize_square(blocksize*blocksize),
2893  // diagonal weight to avoid zero pivots
2894  tiny(tiny_),
2895  vector_loop_size(vector_length/internal_vector_length) {
2896  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
2897  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
2898 
2899  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
2900  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
2901 
2902  hasBlockCrsMatrix = ! A_bcrs.is_null ();
2903 
2904  A_block_rowptr = G_->getLocalGraphDevice().row_map;
2905  if (hasBlockCrsMatrix) {
2906  A_values = const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst();
2907  }
2908  else {
2909  A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
2910  A_values = A_crs->getLocalValuesDevice (Tpetra::Access::ReadOnly);
2911  }
2912  }
2913 
2914  private:
2915 
2916  KOKKOS_INLINE_FUNCTION
2917  void
2918  extract(local_ordinal_type partidx,
2919  local_ordinal_type local_subpartidx,
2920  local_ordinal_type npacks) const {
2921 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2922  printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
2923 #endif
2924  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2925  const size_type kps = pack_td_ptr(partidx, local_subpartidx);
2926  local_ordinal_type kfs[vector_length] = {};
2927  local_ordinal_type ri0[vector_length] = {};
2928  local_ordinal_type nrows[vector_length] = {};
2929 
2930  for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2931  kfs[vi] = flat_td_ptr(partidx,local_subpartidx);
2932  ri0[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,0);
2933  nrows[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,1) - ri0[vi];
2934 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2935  printf("kfs[%d] = %d;\n", vi, kfs[vi]);
2936  printf("ri0[%d] = %d;\n", vi, ri0[vi]);
2937  printf("nrows[%d] = %d;\n", vi, nrows[vi]);
2938 #endif
2939  }
2940  local_ordinal_type tr_min = 0;
2941  local_ordinal_type tr_max = nrows[0];
2942  if (local_subpartidx % 2 == 1) {
2943  tr_min -= 1;
2944  tr_max += 1;
2945  }
2946 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2947  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2948 #endif
2949  for (local_ordinal_type tr=tr_min,j=0;tr<tr_max;++tr) {
2950  for (local_ordinal_type e=0;e<3;++e) {
2951  if (hasBlockCrsMatrix) {
2952  const impl_scalar_type* block[vector_length] = {};
2953  for (local_ordinal_type vi=0;vi<npacks;++vi) {
2954  const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2955 
2956  block[vi] = &A_values(Aj*blocksize_square);
2957  }
2958  const size_type pi = kps + j;
2959 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2960  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
2961 #endif
2962  ++j;
2963  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2964  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2965  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2966  auto& v = internal_vector_values(pi, ii, jj, 0);
2967  for (local_ordinal_type vi=0;vi<npacks;++vi) {
2968  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2969  }
2970  }
2971  }
2972  }
2973  else {
2974  const size_type pi = kps + j;
2975 
2976  for (local_ordinal_type vi=0;vi<npacks;++vi) {
2977  const size_type Aj_c = A_colindsub(kfs[vi] + j);
2978 
2979  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2980  auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr)*blocksize + ii);
2981 
2982  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2983  scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c*blocksize + jj);
2984  }
2985  }
2986  }
2987  ++j;
2988  }
2989  if (nrows[0] == 1) break;
2990  if (local_subpartidx % 2 == 0) {
2991  if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
2992  for (local_ordinal_type vi=1;vi<npacks;++vi) {
2993  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
2994  npacks = vi;
2995  break;
2996  }
2997  }
2998  }
2999  else {
3000  if (e == 0 && (tr == -1 || tr == nrows[0])) break;
3001  for (local_ordinal_type vi=1;vi<npacks;++vi) {
3002  if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3003  npacks = vi;
3004  break;
3005  }
3006  }
3007  }
3008  }
3009  }
3010  }
3011 
3012  KOKKOS_INLINE_FUNCTION
3013  void
3014  extract(const member_type &member,
3015  const local_ordinal_type &partidxbeg,
3016  local_ordinal_type local_subpartidx,
3017  const local_ordinal_type &npacks,
3018  const local_ordinal_type &vbeg) const {
3019 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3020  printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3021 #endif
3022  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3023  local_ordinal_type kfs_vals[internal_vector_length] = {};
3024  local_ordinal_type ri0_vals[internal_vector_length] = {};
3025  local_ordinal_type nrows_vals[internal_vector_length] = {};
3026 
3027  const size_type kps = pack_td_ptr(partidxbeg,local_subpartidx);
3028  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3029  kfs_vals[vi] = flat_td_ptr(partidxbeg+vi,local_subpartidx);
3030  ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,0);
3031  nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,1) - ri0_vals[vi];
3032 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3033  printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3034  printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3035  printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3036 #endif
3037  }
3038 
3039  local_ordinal_type j_vals[internal_vector_length] = {};
3040 
3041  local_ordinal_type tr_min = 0;
3042  local_ordinal_type tr_max = nrows_vals[0];
3043  if (local_subpartidx % 2 == 1) {
3044  tr_min -= 1;
3045  tr_max += 1;
3046  }
3047 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3048  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3049 #endif
3050  for (local_ordinal_type tr=tr_min;tr<tr_max;++tr) {
3051  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3052  const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
3053  if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows+1)) {
3054  auto &j = j_vals[vi];
3055  const local_ordinal_type kfs = kfs_vals[vi];
3056  const local_ordinal_type ri0 = ri0_vals[vi];
3057  local_ordinal_type lbeg, lend;
3058  if (local_subpartidx % 2 == 0) {
3059  lbeg = (tr == tr_min ? 1 : 0);
3060  lend = (tr == nrows - 1 ? 2 : 3);
3061  }
3062  else {
3063  lbeg = 0;
3064  lend = 3;
3065  if (tr == tr_min) {
3066  lbeg = 1;
3067  lend = 2;
3068  }
3069  else if (tr == nrows) {
3070  lbeg = 0;
3071  lend = 1;
3072  }
3073  }
3074  if (hasBlockCrsMatrix) {
3075  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3076  const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3077  const impl_scalar_type* block = &A_values(Aj*blocksize_square);
3078  const size_type pi = kps + j;
3079 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3080  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3081 #endif
3082  Kokkos::parallel_for
3083  (Kokkos::TeamThreadRange(member,blocksize),
3084  [&](const local_ordinal_type &ii) {
3085  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3086  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
3087  }
3088  });
3089  }
3090  }
3091  else {
3092  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3093  const size_type Aj_c = A_colindsub(kfs + j);
3094  const size_type pi = kps + j;
3095  Kokkos::parallel_for
3096  (Kokkos::TeamThreadRange(member,blocksize),
3097  [&](const local_ordinal_type &ii) {
3098  auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr)*blocksize + ii);
3099  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3100  scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c*blocksize + jj);
3101  }
3102  });
3103  }
3104  }
3105  }
3106  }
3107  }
3108  }
3109 
3110  template<typename AAViewType,
3111  typename WWViewType>
3112  KOKKOS_INLINE_FUNCTION
3113  void
3114  factorize_subline(const member_type &member,
3115  const local_ordinal_type &i0,
3116  const local_ordinal_type &nrows,
3117  const local_ordinal_type &v,
3118  const AAViewType &AA,
3119  const WWViewType &WW) const {
3120 
3121  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3122  <typename execution_space::memory_space> default_mode_and_algo_type;
3123 
3124  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3125  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3126 
3127  // constant
3128  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3129 
3130 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3131  printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3132 #endif
3133 
3134  // subview pattern
3135  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3136  KB::LU<member_type,
3137  default_mode_type,KB::Algo::LU::Unblocked>
3138  ::invoke(member, A , tiny);
3139 
3140  if (nrows > 1) {
3141  auto B = A;
3142  auto C = A;
3143  local_ordinal_type i = i0;
3144  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
3145 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3146  printf("tr = %d, i = %d;\n", tr, i);
3147 #endif
3148  B.assign_data( &AA(i+1,0,0,v) );
3149  KB::Trsm<member_type,
3150  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3151  default_mode_type,default_algo_type>
3152  ::invoke(member, one, A, B);
3153  C.assign_data( &AA(i+2,0,0,v) );
3154  KB::Trsm<member_type,
3155  KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3156  default_mode_type,default_algo_type>
3157  ::invoke(member, one, A, C);
3158  A.assign_data( &AA(i+3,0,0,v) );
3159 
3160  member.team_barrier();
3161  KB::Gemm<member_type,
3162  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3163  default_mode_type,default_algo_type>
3164  ::invoke(member, -one, C, B, one, A);
3165  KB::LU<member_type,
3166  default_mode_type,KB::Algo::LU::Unblocked>
3167  ::invoke(member, A, tiny);
3168  }
3169  } else {
3170  // for block jacobi invert a matrix here
3171  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3172  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
3173  ::invoke(member, A, W);
3174  KB::SetIdentity<member_type,default_mode_type>
3175  ::invoke(member, A);
3176  member.team_barrier();
3177  KB::Trsm<member_type,
3178  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3179  default_mode_type,default_algo_type>
3180  ::invoke(member, one, W, A);
3181  KB::Trsm<member_type,
3182  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3183  default_mode_type,default_algo_type>
3184  ::invoke(member, one, W, A);
3185  }
3186  }
3187 
3188  public:
3189 
3190  struct ExtractAndFactorizeSubLineTag {};
3191  struct ExtractBCDTag {};
3192  struct ComputeETag {};
3193  struct ComputeSchurTag {};
3194  struct FactorizeSchurTag {};
3195 
3196  KOKKOS_INLINE_FUNCTION
3197  void
3198  operator() (const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3199  // btdm is packed and sorted from largest one
3200  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3201 
3202  const local_ordinal_type subpartidx = packptr_sub(packidx);
3203  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3204  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3205  const local_ordinal_type partidx = subpartidx%n_parts;
3206 
3207  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3208  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3209  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3210 
3211  internal_vector_scratch_type_3d_view
3212  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3213 
3214 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3215  printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3216  printf("vector_loop_size = %d\n", vector_loop_size);
3217 #endif
3218 
3219  if (vector_loop_size == 1) {
3220  extract(partidx, local_subpartidx, npacks);
3221  factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3222  } else {
3223  Kokkos::parallel_for
3224  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3225  [&](const local_ordinal_type &v) {
3226  const local_ordinal_type vbeg = v*internal_vector_length;
3227 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3228  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3229 #endif
3230  if (vbeg < npacks)
3231  extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3232  // this is not safe if vector loop size is different from vector size of
3233  // the team policy. we always make sure this when constructing the team policy
3234  member.team_barrier();
3235  factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3236  });
3237  }
3238  }
3239 
3240  KOKKOS_INLINE_FUNCTION
3241  void
3242  operator() (const ExtractBCDTag &, const member_type &member) const {
3243  // btdm is packed and sorted from largest one
3244  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3245  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3246  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3247 
3248  const local_ordinal_type subpartidx = packptr_sub(packidx);
3249  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3250  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3251  const local_ordinal_type partidx = subpartidx%n_parts;
3252 
3253  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3254  //const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3255  //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3256 
3257  if (vector_loop_size == 1) {
3258  extract(partidx, local_subpartidx, npacks);
3259  }
3260  else {
3261  Kokkos::parallel_for
3262  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3263  [&](const local_ordinal_type &v) {
3264  const local_ordinal_type vbeg = v*internal_vector_length;
3265 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3266  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3267  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3268 #endif
3269  if (vbeg < npacks)
3270  extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3271  });
3272  }
3273 
3274  member.team_barrier();
3275 
3276  const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3277  const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1;
3278 
3279  const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3280  const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3281 
3282 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3283  printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3284 #endif
3285 
3286  // Need to copy D to e_internal_vector_values.
3287  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3288  Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3289 
3290  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3291  Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3292 
3293  }
3294 
3295  KOKKOS_INLINE_FUNCTION
3296  void
3297  operator() (const ComputeETag &, const member_type &member) const {
3298  // btdm is packed and sorted from largest one
3299  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3300 
3301  const local_ordinal_type subpartidx = packptr_sub(packidx);
3302  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3303  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3304  const local_ordinal_type partidx = subpartidx%n_parts;
3305 
3306  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3307  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3308  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3309  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3310  const local_ordinal_type num_vectors = blocksize;
3311 
3312  (void) npacks;
3313 
3314  internal_vector_scratch_type_3d_view
3315  WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3316  if (local_subpartidx == 0) {
3317  Kokkos::parallel_for
3318  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3319  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3320  });
3321  }
3322  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
3323  Kokkos::parallel_for
3324  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3325  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3326  });
3327  }
3328  else {
3329  Kokkos::parallel_for
3330  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3331  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3332  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3333  });
3334  }
3335  }
3336 
3337  KOKKOS_INLINE_FUNCTION
3338  void
3339  operator() (const ComputeSchurTag &, const member_type &member) const {
3340  // btdm is packed and sorted from largest one
3341  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3342  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3343  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3344 
3345  const local_ordinal_type subpartidx = packptr_sub(packidx);
3346  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3347  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3348  const local_ordinal_type partidx = subpartidx%n_parts;
3349 
3350  //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3351  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3352  //const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3353  //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3354 
3355  // Compute S = D - C E
3356 
3357  const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
3358  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
3359  const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
3360 
3361  for (local_ordinal_type i = 0; i < 4; ++i) { //pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
3362  copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3363  Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3364  }
3365 
3366  member.team_barrier();
3367 
3368  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3369 
3370  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1;
3371  const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2;
3372 
3373  const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3374  const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3375 
3376  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3377  <typename execution_space::memory_space> default_mode_and_algo_type;
3378 
3379  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3380  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3381 
3382  Kokkos::parallel_for
3383  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3384  for (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) {
3385  local_ordinal_type e_r, e_c, c_kps;
3386 
3387  if ( local_subpartidx_schur == 0 ) {
3388  if ( i == 0 ) {
3389  e_r = e_r1;
3390  e_c = 0;
3391  c_kps = c_kps1;
3392  }
3393  else if ( i == 3 ) {
3394  e_r = e_r2;
3395  e_c = 1;
3396  c_kps = c_kps2;
3397  }
3398  else if ( i == 4 ) {
3399  e_r = e_r2;
3400  e_c = 0;
3401  c_kps = c_kps2;
3402  }
3403  else {
3404  continue;
3405  }
3406  }
3407  else {
3408  if ( i == 0 ) {
3409  e_r = e_r1;
3410  e_c = 1;
3411  c_kps = c_kps1;
3412  }
3413  else if ( i == 1 ) {
3414  e_r = e_r1;
3415  e_c = 0;
3416  c_kps = c_kps1;
3417  }
3418  else if ( i == 4 ) {
3419  e_r = e_r2;
3420  e_c = 1;
3421  c_kps = c_kps2;
3422  }
3423  else if ( i == 5 ) {
3424  e_r = e_r2;
3425  e_c = 0;
3426  c_kps = c_kps2;
3427  }
3428  else {
3429  continue;
3430  }
3431  }
3432 
3433  auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v);
3434  auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3435  auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3436  KB::Gemm<member_type,
3437  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3438  default_mode_type,default_algo_type>
3439  ::invoke(member, -one, C, E, one, S);
3440  }
3441  });
3442  }
3443 
3444  KOKKOS_INLINE_FUNCTION
3445  void
3446  operator() (const FactorizeSchurTag &, const member_type &member) const {
3447  const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3448 
3449  const local_ordinal_type subpartidx = packptr_sub(packidx);
3450 
3451  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3452  const local_ordinal_type partidx = subpartidx%n_parts;
3453 
3454  const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0);
3455  const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);
3456 
3457  internal_vector_scratch_type_3d_view
3458  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3459 
3460 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3461  printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3462 #endif
3463 
3464  if (vector_loop_size == 1) {
3465  factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3466  } else {
3467  Kokkos::parallel_for
3468  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3469  [&](const local_ordinal_type &v) {
3470  factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3471  });
3472  }
3473  }
3474 
3475  void run() {
3476  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3477  const local_ordinal_type team_size =
3478  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3479  recommended_team_size(blocksize, vector_length, internal_vector_length);
3480  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3481  shmem_size(blocksize, blocksize, vector_loop_size);
3482 
3483  {
3484 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3485  printf("Start ExtractAndFactorizeSubLineTag\n");
3486 #endif
3487  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3488  Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeSubLineTag>
3489  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3490 
3491 
3492  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3493  writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3494 
3495  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3496  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3497  policy, *this);
3498  execution_space().fence();
3499 
3500  writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3501 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3502  printf("End ExtractAndFactorizeSubLineTag\n");
3503 #endif
3504  }
3505 
3506  if (packindices_schur.extent(1) > 0)
3507  {
3508  {
3509 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3510  printf("Start ExtractBCDTag\n");
3511 #endif
3512  Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3513  Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3514 
3515  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3516 
3517  {
3518  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3519  Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
3520  policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3521 
3522  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3523  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3524  policy, *this);
3525  execution_space().fence();
3526  }
3527 
3528 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3529  printf("End ExtractBCDTag\n");
3530 #endif
3531  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3532 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3533  printf("Start ComputeETag\n");
3534 #endif
3535  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3536  {
3537  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3538  Kokkos::TeamPolicy<execution_space,ComputeETag>
3539  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3540 
3541  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3542  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3543  policy, *this);
3544  execution_space().fence();
3545  }
3546  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3547 
3548 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3549  printf("End ComputeETag\n");
3550 #endif
3551  }
3552 
3553  {
3554 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3555  printf("Start ComputeSchurTag\n");
3556 #endif
3557  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3558  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3559  Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
3560  policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3561 
3562  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3563  policy, *this);
3564  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3565  execution_space().fence();
3566 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3567  printf("End ComputeSchurTag\n");
3568 #endif
3569  }
3570 
3571  {
3572 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3573  printf("Start FactorizeSchurTag\n");
3574 #endif
3575  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3576  Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
3577  policy(packindices_schur.extent(0), team_size, vector_loop_size);
3578  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3579  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3580  policy, *this);
3581  execution_space().fence();
3582  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3583 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3584  printf("End FactorizeSchurTag\n");
3585 #endif
3586  }
3587  }
3588 
3589  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3590  }
3591 
3592  };
3593 
3597  template<typename MatrixType>
3598  void
3599  performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3600  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3601  const BlockHelperDetails::PartInterface<MatrixType> &interf,
3603  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny) {
3604  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3605  using execution_space = typename impl_type::execution_space;
3606  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3607  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3608 
3609  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3610 
3611  int blocksize = btdm.values.extent(1);
3612  // Both Kokkos policy vector length and SIMD type vector length are hardcoded in KokkosBatched.
3613  // For large block sizes, have to fall back to level 1 scratch.
3614  int scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3615  int max_scratch = team_policy_type::scratch_size_max(0);
3616 
3617  if(scratch_required < max_scratch) {
3618  // Can use level 0 scratch
3619  ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3620  function.run();
3621  }
3622  else {
3623  // Not enough level 0 scratch, so fall back to level 1
3624  ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3625  function.run();
3626  }
3627  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3628  }
3629 
3633  template<typename MatrixType>
3635  public:
3637  using execution_space = typename impl_type::execution_space;
3638  using memory_space = typename impl_type::memory_space;
3639 
3640  using local_ordinal_type = typename impl_type::local_ordinal_type;
3641  using impl_scalar_type = typename impl_type::impl_scalar_type;
3642  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3643  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3644  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3645  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3646  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3647  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3648  static constexpr int vector_length = impl_type::vector_length;
3649 
3650  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3651 
3652  private:
3653  // part interface
3654  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3655  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3656  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3657  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3658  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3659  const local_ordinal_type blocksize;
3660  const local_ordinal_type num_vectors;
3661 
3662  // packed multivector output (or input)
3663  vector_type_3d_view packed_multivector;
3664  const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3665 
3666  template<typename TagType>
3667  KOKKOS_INLINE_FUNCTION
3668  void copy_multivectors(const local_ordinal_type &j,
3669  const local_ordinal_type &vi,
3670  const local_ordinal_type &pri,
3671  const local_ordinal_type &ri0) const {
3672  for (local_ordinal_type col=0;col<num_vectors;++col)
3673  for (local_ordinal_type i=0;i<blocksize;++i)
3674  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3675  }
3676 
3677  public:
3678 
3679  MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3680  const vector_type_3d_view &pmv)
3681  : partptr(interf.partptr),
3682  packptr(interf.packptr),
3683  part2packrowidx0(interf.part2packrowidx0),
3684  part2rowidx0(interf.part2rowidx0),
3685  lclrow(interf.lclrow),
3686  blocksize(pmv.extent(1)),
3687  num_vectors(pmv.extent(2)),
3688  packed_multivector(pmv) {}
3689 
3690  // TODO:: modify this routine similar to the team level functions
3691  KOKKOS_INLINE_FUNCTION
3692  void
3693  operator() (const local_ordinal_type &packidx) const {
3694  local_ordinal_type partidx = packptr(packidx);
3695  local_ordinal_type npacks = packptr(packidx+1) - partidx;
3696  const local_ordinal_type pri0 = part2packrowidx0(partidx);
3697 
3698  local_ordinal_type ri0[vector_length] = {};
3699  local_ordinal_type nrows[vector_length] = {};
3700  for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
3701  ri0[v] = part2rowidx0(partidx);
3702  nrows[v] = part2rowidx0(partidx+1) - ri0[v];
3703  }
3704  for (local_ordinal_type j=0;j<nrows[0];++j) {
3705  local_ordinal_type cnt = 1;
3706  for (;cnt<npacks && j!= nrows[cnt];++cnt);
3707  npacks = cnt;
3708  const local_ordinal_type pri = pri0 + j;
3709  for (local_ordinal_type col=0;col<num_vectors;++col)
3710  for (local_ordinal_type i=0;i<blocksize;++i)
3711  for (local_ordinal_type v=0;v<npacks;++v)
3712  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
3713  }
3714  }
3715 
3716  KOKKOS_INLINE_FUNCTION
3717  void
3718  operator() (const member_type &member) const {
3719  const local_ordinal_type packidx = member.league_rank();
3720  const local_ordinal_type partidx_begin = packptr(packidx);
3721  const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
3722  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3723  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3724  const local_ordinal_type partidx = partidx_begin + v;
3725  const local_ordinal_type ri0 = part2rowidx0(partidx);
3726  const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
3727 
3728  if (nrows == 1) {
3729  const local_ordinal_type pri = pri0;
3730  for (local_ordinal_type col=0;col<num_vectors;++col) {
3731  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3732  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
3733  });
3734  }
3735  } else {
3736  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3737  const local_ordinal_type pri = pri0 + j;
3738  for (local_ordinal_type col=0;col<num_vectors;++col)
3739  for (local_ordinal_type i=0;i<blocksize;++i)
3740  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3741  });
3742  }
3743  });
3744  }
3745 
3746  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3747  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3748  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3749 
3750  scalar_multivector = scalar_multivector_;
3751  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3752  const local_ordinal_type vl = vector_length;
3753  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3754  Kokkos::parallel_for
3755  ("MultiVectorConverter::TeamPolicy", policy, *this);
3756  } else {
3757  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3758  Kokkos::parallel_for
3759  ("MultiVectorConverter::RangePolicy", policy, *this);
3760  }
3761  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3762  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3763  }
3764  };
3765 
3769 
3770  template<>
3771  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3772  typedef KB::Mode::Serial mode_type;
3773  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3774 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3775  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3776 #else
3777  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3778 #endif
3779  static int recommended_team_size(const int /* blksize */,
3780  const int /* vector_length */,
3781  const int /* internal_vector_length */) {
3782  return 1;
3783  }
3784  };
3785 
3786 #if defined(KOKKOS_ENABLE_CUDA)
3787  static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
3788  const int vector_length,
3789  const int internal_vector_length) {
3790  const int vector_size = vector_length/internal_vector_length;
3791  int total_team_size(0);
3792  if (blksize <= 5) total_team_size = 32;
3793  else if (blksize <= 9) total_team_size = 32; // 64
3794  else if (blksize <= 12) total_team_size = 96;
3795  else if (blksize <= 16) total_team_size = 128;
3796  else if (blksize <= 20) total_team_size = 160;
3797  else total_team_size = 160;
3798  return total_team_size/vector_size;
3799  }
3800 
3801  template<>
3802  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3803  typedef KB::Mode::Team mode_type;
3804  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3805  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3806  static int recommended_team_size(const int blksize,
3807  const int vector_length,
3808  const int internal_vector_length) {
3809  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3810  }
3811  };
3812  template<>
3813  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3814  typedef KB::Mode::Team mode_type;
3815  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3816  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3817  static int recommended_team_size(const int blksize,
3818  const int vector_length,
3819  const int internal_vector_length) {
3820  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3821  }
3822  };
3823 #endif
3824 
3825 #if defined(KOKKOS_ENABLE_HIP)
3826  static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
3827  const int vector_length,
3828  const int internal_vector_length) {
3829  const int vector_size = vector_length/internal_vector_length;
3830  int total_team_size(0);
3831  if (blksize <= 5) total_team_size = 32;
3832  else if (blksize <= 9) total_team_size = 32; // 64
3833  else if (blksize <= 12) total_team_size = 96;
3834  else if (blksize <= 16) total_team_size = 128;
3835  else if (blksize <= 20) total_team_size = 160;
3836  else total_team_size = 160;
3837  return total_team_size/vector_size;
3838  }
3839 
3840  template<>
3841  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3842  typedef KB::Mode::Team mode_type;
3843  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3844  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3845  static int recommended_team_size(const int blksize,
3846  const int vector_length,
3847  const int internal_vector_length) {
3848  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3849  }
3850  };
3851  template<>
3852  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3853  typedef KB::Mode::Team mode_type;
3854  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3855  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3856  static int recommended_team_size(const int blksize,
3857  const int vector_length,
3858  const int internal_vector_length) {
3859  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3860  }
3861  };
3862 #endif
3863 
3864 #if defined(KOKKOS_ENABLE_SYCL)
3865  static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
3866  const int vector_length,
3867  const int internal_vector_length) {
3868  const int vector_size = vector_length/internal_vector_length;
3869  int total_team_size(0);
3870  if (blksize <= 5) total_team_size = 32;
3871  else if (blksize <= 9) total_team_size = 32; // 64
3872  else if (blksize <= 12) total_team_size = 96;
3873  else if (blksize <= 16) total_team_size = 128;
3874  else if (blksize <= 20) total_team_size = 160;
3875  else total_team_size = 160;
3876  return total_team_size/vector_size;
3877  }
3878 
3879  template<>
3880  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
3881  typedef KB::Mode::Team mode_type;
3882  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3883  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3884  static int recommended_team_size(const int blksize,
3885  const int vector_length,
3886  const int internal_vector_length) {
3887  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3888  }
3889  };
3890  template<>
3891  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
3892  typedef KB::Mode::Team mode_type;
3893  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3894  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3895  static int recommended_team_size(const int blksize,
3896  const int vector_length,
3897  const int internal_vector_length) {
3898  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3899  }
3900  };
3901 #endif
3902 
3903 
3904 
3905 
3906  template<typename MatrixType>
3907  struct SolveTridiags {
3908  public:
3909  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3910  using execution_space = typename impl_type::execution_space;
3911 
3912  using local_ordinal_type = typename impl_type::local_ordinal_type;
3913  using size_type = typename impl_type::size_type;
3914  using impl_scalar_type = typename impl_type::impl_scalar_type;
3915  using magnitude_type = typename impl_type::magnitude_type;
3916  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3917  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
3919  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3920  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
3921  using size_type_2d_view = typename impl_type::size_type_2d_view;
3923  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3924  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
3925  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
3926  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
3927 
3928  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3929 
3930  using internal_vector_type =typename impl_type::internal_vector_type;
3931  static constexpr int vector_length = impl_type::vector_length;
3932  static constexpr int internal_vector_length = impl_type::internal_vector_length;
3933 
3935  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3936  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3937 
3939  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3940  using member_type = typename team_policy_type::member_type;
3941 
3942  private:
3943  // part interface
3944  local_ordinal_type n_subparts_per_part;
3945  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3946  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3947  const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
3948  const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
3949  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3950  const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
3951  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3952  const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
3953 
3954  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
3955  const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
3956 
3957  // block tridiags
3958  const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
3959 
3960  // block tridiags values
3961  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
3962  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
3963  const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
3964 
3965  internal_vector_type_4d_view X_internal_vector_values_schur;
3966 
3967  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
3968  const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
3969 
3970 
3971  const local_ordinal_type vector_loop_size;
3972 
3973  // copy to multivectors : damping factor and Y_scalar_multivector
3974  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
3975 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
3976  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3977 #else
3978  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3979 #endif
3980  const impl_scalar_type df;
3981  const bool compute_diff;
3982 
3983  public:
3984  SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3985  const BlockTridiags<MatrixType> &btdm,
3986  const vector_type_3d_view &pmv,
3987  const impl_scalar_type damping_factor,
3988  const bool is_norm_manager_active)
3989  :
3990  // interface
3991  n_subparts_per_part(interf.n_subparts_per_part),
3992  partptr(interf.partptr),
3993  packptr(interf.packptr),
3994  packindices_sub(interf.packindices_sub),
3995  packindices_schur(interf.packindices_schur),
3996  part2packrowidx0(interf.part2packrowidx0),
3997  part2packrowidx0_sub(interf.part2packrowidx0_sub),
3998  lclrow(interf.lclrow),
3999  packptr_sub(interf.packptr_sub),
4000  partptr_sub(interf.partptr_sub),
4001  pack_td_ptr_schur(btdm.pack_td_ptr_schur),
4002  // block tridiags and multivector
4003  pack_td_ptr(btdm.pack_td_ptr),
4004  D_internal_vector_values((internal_vector_type*)btdm.values.data(),
4005  btdm.values.extent(0),
4006  btdm.values.extent(1),
4007  btdm.values.extent(2),
4008  vector_length/internal_vector_length),
4009  X_internal_vector_values((internal_vector_type*)pmv.data(),
4010  pmv.extent(0),
4011  pmv.extent(1),
4012  pmv.extent(2),
4013  vector_length/internal_vector_length),
4014  X_internal_scalar_values((btdm_scalar_type*)pmv.data(),
4015  pmv.extent(0),
4016  pmv.extent(1),
4017  pmv.extent(2),
4018  vector_length),
4019  X_internal_vector_values_schur(do_not_initialize_tag("X_internal_vector_values_schur"),
4020  2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0),
4021  pmv.extent(1),
4022  pmv.extent(2),
4023  vector_length/internal_vector_length),
4024  D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(),
4025  btdm.values_schur.extent(0),
4026  btdm.values_schur.extent(1),
4027  btdm.values_schur.extent(2),
4028  vector_length/internal_vector_length),
4029  e_internal_vector_values((internal_vector_type*)btdm.e_values.data(),
4030  btdm.e_values.extent(0),
4031  btdm.e_values.extent(1),
4032  btdm.e_values.extent(2),
4033  btdm.e_values.extent(3),
4034  vector_length/internal_vector_length),
4035  vector_loop_size(vector_length/internal_vector_length),
4036  Y_scalar_multivector(),
4037  Z_scalar_vector(),
4038  df(damping_factor),
4039  compute_diff(is_norm_manager_active)
4040  {}
4041 
4042  public:
4043 
4045  KOKKOS_INLINE_FUNCTION
4046  void
4047  copyToFlatMultiVector(const member_type &member,
4048  const local_ordinal_type partidxbeg, // partidx for v = 0
4049  const local_ordinal_type npacks,
4050  const local_ordinal_type pri0,
4051  const local_ordinal_type v, // index with a loop of vector_loop_size
4052  const local_ordinal_type blocksize,
4053  const local_ordinal_type num_vectors) const {
4054  const local_ordinal_type vbeg = v*internal_vector_length;
4055  if (vbeg < npacks) {
4056  local_ordinal_type ri0_vals[internal_vector_length] = {};
4057  local_ordinal_type nrows_vals[internal_vector_length] = {};
4058  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4059  const local_ordinal_type partidx = partidxbeg+vv;
4060  ri0_vals[vi] = partptr(partidx);
4061  nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
4062  }
4063 
4064  impl_scalar_type z_partial_sum(0);
4065  if (nrows_vals[0] == 1) {
4066  const local_ordinal_type j=0, pri=pri0;
4067  {
4068  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4069  const local_ordinal_type ri0 = ri0_vals[vi];
4070  const local_ordinal_type nrows = nrows_vals[vi];
4071  if (j < nrows) {
4072  Kokkos::parallel_for
4073  (Kokkos::TeamThreadRange(member, blocksize),
4074  [&](const local_ordinal_type &i) {
4075  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4076  for (local_ordinal_type col=0;col<num_vectors;++col) {
4077  impl_scalar_type &y = Y_scalar_multivector(row,col);
4078  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4079  y += df*yd;
4080 
4081  {//if (compute_diff) {
4082  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4083  z_partial_sum += yd_abs*yd_abs;
4084  }
4085  }
4086  });
4087  }
4088  }
4089  }
4090  } else {
4091  Kokkos::parallel_for
4092  (Kokkos::TeamThreadRange(member, nrows_vals[0]),
4093  [&](const local_ordinal_type &j) {
4094  const local_ordinal_type pri = pri0 + j;
4095  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4096  const local_ordinal_type ri0 = ri0_vals[vi];
4097  const local_ordinal_type nrows = nrows_vals[vi];
4098  if (j < nrows) {
4099  for (local_ordinal_type col=0;col<num_vectors;++col) {
4100  for (local_ordinal_type i=0;i<blocksize;++i) {
4101  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4102  impl_scalar_type &y = Y_scalar_multivector(row,col);
4103  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4104  y += df*yd;
4105 
4106  {//if (compute_diff) {
4107  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4108  z_partial_sum += yd_abs*yd_abs;
4109  }
4110  }
4111  }
4112  }
4113  }
4114  });
4115  }
4116  //if (compute_diff)
4117  Z_scalar_vector(member.league_rank()) += z_partial_sum;
4118  }
4119  }
4120 
4124  template<typename WWViewType>
4125  KOKKOS_INLINE_FUNCTION
4126  void
4127  solveSingleVector(const member_type &member,
4128  const local_ordinal_type &blocksize,
4129  const local_ordinal_type &i0,
4130  const local_ordinal_type &r0,
4131  const local_ordinal_type &nrows,
4132  const local_ordinal_type &v,
4133  const WWViewType &WW) const {
4134 
4135  typedef SolveTridiagsDefaultModeAndAlgo
4136  <typename execution_space::memory_space> default_mode_and_algo_type;
4137 
4138  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4139  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4140 
4141  // base pointers
4142  auto A = D_internal_vector_values.data();
4143  auto X = X_internal_vector_values.data();
4144 
4145  // constant
4146  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4147  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4148  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4149 
4150  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4151  const local_ordinal_type astep = D_internal_vector_values.stride_0();
4152  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
4153  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
4154  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
4155  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
4156 
4157  // move to starting point
4158  A += i0*astep + v;
4159  X += r0*xstep + v;
4160 
4161  //for (local_ordinal_type col=0;col<num_vectors;++col)
4162  if (nrows > 1) {
4163  // solve Lx = x
4164  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4165  (default_mode_type,default_algo_type,
4166  member,
4167  KB::Diag::Unit,
4168  blocksize,blocksize,
4169  one,
4170  A, as0, as1,
4171  X, xs0);
4172 
4173  for (local_ordinal_type tr=1;tr<nrows;++tr) {
4174  member.team_barrier();
4175  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4176  (default_mode_type,default_algo_type,
4177  member,
4178  blocksize, blocksize,
4179  -one,
4180  A+2*astep, as0, as1,
4181  X, xs0,
4182  one,
4183  X+1*xstep, xs0);
4184  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4185  (default_mode_type,default_algo_type,
4186  member,
4187  KB::Diag::Unit,
4188  blocksize,blocksize,
4189  one,
4190  A+3*astep, as0, as1,
4191  X+1*xstep, xs0);
4192 
4193  A += 3*astep;
4194  X += 1*xstep;
4195  }
4196 
4197  // solve Ux = x
4198  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4199  (default_mode_type,default_algo_type,
4200  member,
4201  KB::Diag::NonUnit,
4202  blocksize, blocksize,
4203  one,
4204  A, as0, as1,
4205  X, xs0);
4206 
4207  for (local_ordinal_type tr=nrows;tr>1;--tr) {
4208  A -= 3*astep;
4209  member.team_barrier();
4210  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4211  (default_mode_type,default_algo_type,
4212  member,
4213  blocksize, blocksize,
4214  -one,
4215  A+1*astep, as0, as1,
4216  X, xs0,
4217  one,
4218  X-1*xstep, xs0);
4219  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4220  (default_mode_type,default_algo_type,
4221  member,
4222  KB::Diag::NonUnit,
4223  blocksize, blocksize,
4224  one,
4225  A, as0, as1,
4226  X-1*xstep,xs0);
4227  X -= 1*xstep;
4228  }
4229  // for multiple rhs
4230  //X += xs1;
4231  } else {
4232  const local_ordinal_type ws0 = WW.stride_0();
4233  auto W = WW.data() + v;
4234  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
4235  (default_mode_type,
4236  member, blocksize, X, xs0, W, ws0);
4237  member.team_barrier();
4238  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4239  (default_mode_type,default_algo_type,
4240  member,
4241  blocksize, blocksize,
4242  one,
4243  A, as0, as1,
4244  W, xs0,
4245  zero,
4246  X, xs0);
4247  }
4248  }
4249 
4250  template<typename WWViewType>
4251  KOKKOS_INLINE_FUNCTION
4252  void
4253  solveMultiVector(const member_type &member,
4254  const local_ordinal_type &/* blocksize */,
4255  const local_ordinal_type &i0,
4256  const local_ordinal_type &r0,
4257  const local_ordinal_type &nrows,
4258  const local_ordinal_type &v,
4259  const WWViewType &WW) const {
4260 
4261  typedef SolveTridiagsDefaultModeAndAlgo
4262  <typename execution_space::memory_space> default_mode_and_algo_type;
4263 
4264  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4265  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4266 
4267  // constant
4268  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4269  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4270 
4271  // subview pattern
4272  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4273  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4274  auto X2 = X1;
4275 
4276  local_ordinal_type i = i0, r = r0;
4277 
4278 
4279  if (nrows > 1) {
4280  // solve Lx = x
4281  KB::Trsm<member_type,
4282  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4283  default_mode_type,default_algo_type>
4284  ::invoke(member, one, A, X1);
4285  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
4286  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
4287  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
4288  member.team_barrier();
4289  KB::Gemm<member_type,
4290  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4291  default_mode_type,default_algo_type>
4292  ::invoke(member, -one, A, X1, one, X2);
4293  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
4294  KB::Trsm<member_type,
4295  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4296  default_mode_type,default_algo_type>
4297  ::invoke(member, one, A, X2);
4298  X1.assign_data( X2.data() );
4299  }
4300 
4301  // solve Ux = x
4302  KB::Trsm<member_type,
4303  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4304  default_mode_type,default_algo_type>
4305  ::invoke(member, one, A, X1);
4306  for (local_ordinal_type tr=nrows;tr>1;--tr) {
4307  i -= 3;
4308  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
4309  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
4310  member.team_barrier();
4311  KB::Gemm<member_type,
4312  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4313  default_mode_type,default_algo_type>
4314  ::invoke(member, -one, A, X1, one, X2);
4315 
4316  A.assign_data( &D_internal_vector_values(i,0,0,v) );
4317  KB::Trsm<member_type,
4318  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4319  default_mode_type,default_algo_type>
4320  ::invoke(member, one, A, X2);
4321  X1.assign_data( X2.data() );
4322  }
4323  } else {
4324  // matrix is already inverted
4325  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4326  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
4327  ::invoke(member, X1, W);
4328  member.team_barrier();
4329  KB::Gemm<member_type,
4330  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4331  default_mode_type,default_algo_type>
4332  ::invoke(member, one, A, W, zero, X1);
4333  }
4334  }
4335 
4336  template<int B> struct SingleVectorTag {};
4337  template<int B> struct MultiVectorTag {};
4338 
4339  template<int B> struct SingleVectorSubLineTag {};
4340  template<int B> struct MultiVectorSubLineTag {};
4341  template<int B> struct SingleVectorApplyCTag {};
4342  template<int B> struct MultiVectorApplyCTag {};
4343  template<int B> struct SingleVectorSchurTag {};
4344  template<int B> struct MultiVectorSchurTag {};
4345  template<int B> struct SingleVectorApplyETag {};
4346  template<int B> struct MultiVectorApplyETag {};
4347  template<int B> struct SingleVectorCopyToFlatTag {};
4348  template<int B> struct SingleZeroingTag {};
4349 
4350  template<int B>
4351  KOKKOS_INLINE_FUNCTION
4352  void
4353  operator() (const SingleVectorTag<B> &, const member_type &member) const {
4354  const local_ordinal_type packidx = member.league_rank();
4355  const local_ordinal_type partidx = packptr(packidx);
4356  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4357  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4358  const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4359  const local_ordinal_type r0 = part2packrowidx0(partidx);
4360  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4361  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4362  const local_ordinal_type num_vectors = 1;
4363  internal_vector_scratch_type_3d_view
4364  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4365  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4366  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4367  });
4368  Kokkos::parallel_for
4369  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4370  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4371  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4372  });
4373  }
4374 
4375  template<int B>
4376  KOKKOS_INLINE_FUNCTION
4377  void
4378  operator() (const MultiVectorTag<B> &, const member_type &member) const {
4379  const local_ordinal_type packidx = member.league_rank();
4380  const local_ordinal_type partidx = packptr(packidx);
4381  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4382  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4383  const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4384  const local_ordinal_type r0 = part2packrowidx0(partidx);
4385  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4386  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4387  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4388 
4389  internal_vector_scratch_type_3d_view
4390  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4391  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4392  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4393  });
4394  Kokkos::parallel_for
4395  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4396  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4397  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4398  });
4399  }
4400 
4401  template<int B>
4402  KOKKOS_INLINE_FUNCTION
4403  void
4404  operator() (const SingleVectorSubLineTag<B> &, const member_type &member) const {
4405  // btdm is packed and sorted from largest one
4406  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4407 
4408  const local_ordinal_type subpartidx = packptr_sub(packidx);
4409  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4410  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4411  const local_ordinal_type partidx = subpartidx%n_parts;
4412 
4413  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4414  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4415  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4416  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4417  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4418 
4419  //(void) i0;
4420  //(void) nrows;
4421  (void) npacks;
4422 
4423  internal_vector_scratch_type_3d_view
4424  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4425 
4426  Kokkos::parallel_for
4427  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4428  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4429  });
4430  }
4431 
4432  template<int B>
4433  KOKKOS_INLINE_FUNCTION
4434  void
4435  operator() (const SingleVectorApplyCTag<B> &, const member_type &member) const {
4436  // btdm is packed and sorted from largest one
4437  //const local_ordinal_type packidx = packindices_schur(member.league_rank());
4438  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4439 
4440  const local_ordinal_type subpartidx = packptr_sub(packidx);
4441  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4442  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4443  const local_ordinal_type partidx = subpartidx%n_parts;
4444  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4445 
4446  //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4447  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4448  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4449  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4450 
4451  internal_vector_scratch_type_3d_view
4452  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4453 
4454  // Compute v_2 = v_2 - C v_1
4455 
4456  const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
4457  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
4458  const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
4459 
4460  (void) i0_schur;
4461  (void) i0_offset;
4462 
4463  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4464 
4465  const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0;
4466  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1;
4467 
4468  typedef SolveTridiagsDefaultModeAndAlgo
4469  <typename execution_space::memory_space> default_mode_and_algo_type;
4470 
4471  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4472  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4473 
4474  if (local_subpartidx == 0) {
4475  Kokkos::parallel_for
4476  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4477  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4478  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4479  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4480 
4481  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4482  (default_mode_type,default_algo_type,
4483  member,
4484  blocksize, blocksize,
4485  -one,
4486  C.data(), C.stride_0(), C.stride_1(),
4487  v_1.data(), v_1.stride_0(),
4488  one,
4489  v_2.data(), v_2.stride_0());
4490  });
4491  }
4492  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4493  Kokkos::parallel_for
4494  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4495  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4496  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4497  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4498 
4499  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4500  (default_mode_type,default_algo_type,
4501  member,
4502  blocksize, blocksize,
4503  -one,
4504  C.data(), C.stride_0(), C.stride_1(),
4505  v_1.data(), v_1.stride_0(),
4506  one,
4507  v_2.data(), v_2.stride_0());
4508  });
4509  }
4510  else {
4511  Kokkos::parallel_for
4512  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4513  {
4514  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4515  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4516  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4517 
4518  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4519  (default_mode_type,default_algo_type,
4520  member,
4521  blocksize, blocksize,
4522  -one,
4523  C.data(), C.stride_0(), C.stride_1(),
4524  v_1.data(), v_1.stride_0(),
4525  one,
4526  v_2.data(), v_2.stride_0());
4527  }
4528  {
4529  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4530  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4531  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4532 
4533  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4534  (default_mode_type,default_algo_type,
4535  member,
4536  blocksize, blocksize,
4537  -one,
4538  C.data(), C.stride_0(), C.stride_1(),
4539  v_1.data(), v_1.stride_0(),
4540  one,
4541  v_2.data(), v_2.stride_0());
4542  }
4543  });
4544  }
4545  }
4546 
4547  template<int B>
4548  KOKKOS_INLINE_FUNCTION
4549  void
4550  operator() (const SingleVectorSchurTag<B> &, const member_type &member) const {
4551  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4552 
4553  const local_ordinal_type partidx = packptr_sub(packidx);
4554 
4555  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4556 
4557  const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0);
4558  const local_ordinal_type nrows = 2*(n_subparts_per_part-1);
4559 
4560  const local_ordinal_type r0_schur = nrows * member.league_rank();
4561 
4562  internal_vector_scratch_type_3d_view
4563  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4564 
4565  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4566  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4567  for (local_ordinal_type i = 0; i < 2; ++i) {
4568  copy3DView<local_ordinal_type>(member,
4569  Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4570  Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4571  }
4572  }
4573 
4574  Kokkos::parallel_for
4575  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4576  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4577  });
4578 
4579  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4580  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4581  for (local_ordinal_type i = 0; i < 2; ++i) {
4582  copy3DView<local_ordinal_type>(member,
4583  Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4584  Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4585  }
4586  }
4587  }
4588 
4589  template<int B>
4590  KOKKOS_INLINE_FUNCTION
4591  void
4592  operator() (const SingleVectorApplyETag<B> &, const member_type &member) const {
4593  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4594 
4595  const local_ordinal_type subpartidx = packptr_sub(packidx);
4596  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4597  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4598  const local_ordinal_type partidx = subpartidx%n_parts;
4599  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4600 
4601  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4602  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4603 
4604  internal_vector_scratch_type_3d_view
4605  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4606 
4607  // Compute v_2 = v_2 - C v_1
4608 
4609  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4610 
4611  typedef SolveTridiagsDefaultModeAndAlgo
4612  <typename execution_space::memory_space> default_mode_and_algo_type;
4613 
4614  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4615  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4616 
4617  if (local_subpartidx == 0) {
4618  Kokkos::parallel_for
4619  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4620 
4621  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4622 
4623  for (local_ordinal_type row = 0; row < nrows; ++row) {
4624  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4625  auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4626 
4627  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4628  (default_mode_type,default_algo_type,
4629  member,
4630  blocksize, blocksize,
4631  -one,
4632  E.data(), E.stride_0(), E.stride_1(),
4633  v_2.data(), v_2.stride_0(),
4634  one,
4635  v_1.data(), v_1.stride_0());
4636  }
4637  });
4638  }
4639  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4640  Kokkos::parallel_for
4641  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4642  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4643 
4644  for (local_ordinal_type row = 0; row < nrows; ++row) {
4645  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4646  auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4647 
4648  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4649  (default_mode_type,default_algo_type,
4650  member,
4651  blocksize, blocksize,
4652  -one,
4653  E.data(), E.stride_0(), E.stride_1(),
4654  v_2.data(), v_2.stride_0(),
4655  one,
4656  v_1.data(), v_1.stride_0());
4657  }
4658  });
4659  }
4660  else {
4661  Kokkos::parallel_for
4662  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4663  {
4664  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4665 
4666  for (local_ordinal_type row = 0; row < nrows; ++row) {
4667  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4668  auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4669 
4670  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4671  (default_mode_type,default_algo_type,
4672  member,
4673  blocksize, blocksize,
4674  -one,
4675  E.data(), E.stride_0(), E.stride_1(),
4676  v_2.data(), v_2.stride_0(),
4677  one,
4678  v_1.data(), v_1.stride_0());
4679  }
4680  }
4681  {
4682  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4683 
4684  for (local_ordinal_type row = 0; row < nrows; ++row) {
4685  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4686  auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4687 
4688  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4689  (default_mode_type,default_algo_type,
4690  member,
4691  blocksize, blocksize,
4692  -one,
4693  E.data(), E.stride_0(), E.stride_1(),
4694  v_2.data(), v_2.stride_0(),
4695  one,
4696  v_1.data(), v_1.stride_0());
4697  }
4698  }
4699  });
4700  }
4701  }
4702 
4703  template<int B>
4704  KOKKOS_INLINE_FUNCTION
4705  void
4706  operator() (const SingleVectorCopyToFlatTag<B> &, const member_type &member) const {
4707  const local_ordinal_type packidx = member.league_rank();
4708  const local_ordinal_type partidx = packptr(packidx);
4709  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4710  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4711  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4712  const local_ordinal_type num_vectors = 1;
4713 
4714  Kokkos::parallel_for
4715  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4716  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4717  });
4718  }
4719 
4720  template<int B>
4721  KOKKOS_INLINE_FUNCTION
4722  void
4723  operator() (const SingleZeroingTag<B> &, const member_type &member) const {
4724  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4725  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4726  });
4727  }
4728 
4729  void run(const impl_scalar_type_2d_view_tpetra &Y,
4730  const impl_scalar_type_1d_view &Z) {
4731  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4732  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4733 
4735  this->Y_scalar_multivector = Y;
4736  this->Z_scalar_vector = Z;
4737 
4738  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4739  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4740 
4741  const local_ordinal_type team_size =
4742  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4743  recommended_team_size(blocksize, vector_length, internal_vector_length);
4744  const int per_team_scratch = internal_vector_scratch_type_3d_view
4745  ::shmem_size(blocksize, num_vectors, vector_loop_size);
4746 
4747 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4748 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4749  if (num_vectors == 1) { \
4750  const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4751  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4752  Kokkos::parallel_for \
4753  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4754  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4755  } else { \
4756  const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4757  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4758  Kokkos::parallel_for \
4759  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4760  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4761  } break
4762 #else
4763 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4764  if (num_vectors == 1) { \
4765  if (packindices_schur.extent(1) <= 0) { \
4766  Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4767  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4768  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4769  Kokkos::parallel_for \
4770  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4771  policy, *this); \
4772  } \
4773  else { \
4774  { \
4775  \
4776  Kokkos::TeamPolicy<execution_space,SingleZeroingTag<B> > \
4777  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4778  Kokkos::parallel_for \
4779  ("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4780  policy, *this); \
4781  } \
4782  { \
4783  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4784  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4785  Kokkos::TeamPolicy<execution_space,SingleVectorSubLineTag<B> > \
4786  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4787  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4788  Kokkos::parallel_for \
4789  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4790  policy, *this); \
4791  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4792  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4793  } \
4794  { \
4795  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4796  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4797  Kokkos::TeamPolicy<execution_space,SingleVectorApplyCTag<B> > \
4798  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4799  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4800  Kokkos::parallel_for \
4801  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4802  policy, *this); \
4803  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4804  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4805  } \
4806  { \
4807  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4808  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4809  Kokkos::TeamPolicy<execution_space,SingleVectorSchurTag<B> > \
4810  policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4811  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4812  Kokkos::parallel_for \
4813  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4814  policy, *this); \
4815  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4816  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4817  } \
4818  { \
4819  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4820  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4821  Kokkos::TeamPolicy<execution_space,SingleVectorApplyETag<B> > \
4822  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4823  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4824  Kokkos::parallel_for \
4825  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4826  policy, *this); \
4827  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4828  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4829  } \
4830  { \
4831  \
4832  Kokkos::TeamPolicy<execution_space,SingleVectorCopyToFlatTag<B> > \
4833  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4834  Kokkos::parallel_for \
4835  ("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4836  policy, *this); \
4837  } \
4838  } \
4839  } else { \
4840  Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4841  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4842  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4843  Kokkos::parallel_for \
4844  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4845  policy, *this); \
4846  } break
4847 #endif
4848  switch (blocksize) {
4849  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
4850  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
4851  case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 6);
4852  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
4853  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4854  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4855  case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4856  case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4857  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4858  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4859  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4860  case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4861  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
4862  }
4863 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
4864 
4865  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4866  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4867  }
4868  };
4869 
4873  template<typename MatrixType>
4874  int
4876  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
4877  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
4878  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
4879  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
4880  const bool overlap_communication_and_computation,
4881  // tpetra interface
4882  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
4883  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
4884  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
4885  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
4886  // local object interface
4887  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
4888  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
4889  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
4890  /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
4891  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
4892  // preconditioner parameters
4893  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
4894  /* */ bool is_y_zero,
4895  const int max_num_sweeps,
4896  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
4897  const int check_tol_every) {
4898  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
4899 
4900  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4901  using node_memory_space = typename impl_type::node_memory_space;
4902  using local_ordinal_type = typename impl_type::local_ordinal_type;
4903  using size_type = typename impl_type::size_type;
4904  using impl_scalar_type = typename impl_type::impl_scalar_type;
4905  using magnitude_type = typename impl_type::magnitude_type;
4906  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4907  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
4908  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4909  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
4910 
4911  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4912 
4913  // either tpetra importer or async importer must be active
4914  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
4915  "Neither Tpetra importer nor Async importer is null.");
4916  // max number of sweeps should be positive number
4917  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
4918  "Maximum number of sweeps must be >= 1.");
4919 
4920  // const parameters
4921  const bool is_seq_method_requested = !tpetra_importer.is_null();
4922  const bool is_async_importer_active = !async_importer.is_null();
4923  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4924  const magnitude_type tolerance = tol*tol;
4925  const local_ordinal_type blocksize = btdm.values.extent(1);
4926  const local_ordinal_type num_vectors = Y.getNumVectors();
4927  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4928 
4929  const impl_scalar_type zero(0.0);
4930 
4931  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4932  "The seq method for applyInverseJacobi, " <<
4933  "which in any case is for developer use only, " <<
4934  "does not support norm-based termination.");
4935  const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4936  Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4937  TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4938  std::invalid_argument,
4939  "The seq method for applyInverseJacobi, " <<
4940  "which in any case is for developer use only, " <<
4941  "only supports memory spaces accessible from host.");
4942 
4943  // if workspace is needed more, resize it
4944  const size_type work_span_required = num_blockrows*num_vectors*blocksize;
4945  if (work.span() < work_span_required)
4946  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
4947 
4948  // construct W
4949  const local_ordinal_type W_size = interf.packptr.extent(0)-1;
4950  if (local_ordinal_type(W.extent(0)) < W_size)
4951  W = impl_scalar_type_1d_view("W", W_size);
4952 
4953  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4954  {
4955  if (is_seq_method_requested) {
4956  if (Z.getNumVectors() != Y.getNumVectors())
4957  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
4958  } else {
4959  if (is_async_importer_active) {
4960  // create comm data buffer and keep it here
4961  async_importer->createDataBuffer(num_vectors);
4962  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4963  }
4964  }
4965  }
4966 
4967  // wrap the workspace with 3d view
4968  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4969  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4970  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4971  const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4972  if (is_y_zero) Kokkos::deep_copy(YY, zero);
4973 
4974  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
4975  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4976  damping_factor, is_norm_manager_active);
4977 
4978  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4979 
4980 
4981  auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
4982  auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
4983 
4984  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
4985 
4986  // This is OK here to use the graph of the A_crs matrix and a block size of 1
4987  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
4988 
4989  BlockHelperDetails::ComputeResidualVector<MatrixType>
4990  compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
4991  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
4992  hasBlockCrsMatrix);
4993 
4994  // norm manager workspace resize
4995  if (is_norm_manager_active)
4996  norm_manager.setCheckFrequency(check_tol_every);
4997 
4998  // iterate
4999  int sweep = 0;
5000  for (;sweep<max_num_sweeps;++sweep) {
5001  {
5002  if (is_y_zero) {
5003  // pmv := x(lclrow)
5004  multivector_converter.run(XX);
5005  } else {
5006  if (is_seq_method_requested) {
5007  // SEQ METHOD IS TESTING ONLY
5008 
5009  // y := x - R y
5010  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5011  compute_residual_vector.run(YY, XX, ZZ);
5012 
5013  // pmv := y(lclrow).
5014  multivector_converter.run(YY);
5015  } else {
5016  // fused y := x - R y and pmv := y(lclrow);
5017  // real use case does not use overlap comp and comm
5018  if (overlap_communication_and_computation || !is_async_importer_active) {
5019  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5020  // OverlapTag, compute_owned = true
5021  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
5022  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5023  if (is_async_importer_active) async_importer->cancel();
5024  break;
5025  }
5026  if (is_async_importer_active) {
5027  async_importer->syncRecv();
5028  // OverlapTag, compute_owned = false
5029  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
5030  }
5031  } else {
5032  if (is_async_importer_active)
5033  async_importer->syncExchange(YY);
5034  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5035  // AsyncTag
5036  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5037  }
5038  }
5039  }
5040  }
5041 
5042  // pmv := inv(D) pmv.
5043  {
5044  solve_tridiags.run(YY, W);
5045  }
5046  {
5047  if (is_norm_manager_active) {
5048  // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
5049  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5050  if (sweep + 1 == max_num_sweeps) {
5051  norm_manager.ireduce(sweep, true);
5052  norm_manager.checkDone(sweep + 1, tolerance, true);
5053  } else {
5054  norm_manager.ireduce(sweep);
5055  }
5056  }
5057  }
5058  is_y_zero = false;
5059  }
5060 
5061  //sqrt the norms for the caller's use.
5062  if (is_norm_manager_active) norm_manager.finalize();
5063 
5064  return sweep;
5065  }
5066 
5067 
5068  template<typename MatrixType>
5069  struct ImplObject {
5070  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5071  using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5072  using block_tridiags_type = BlockTridiags<MatrixType>;
5073  using amd_type = BlockHelperDetails::AmD<MatrixType>;
5074  using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
5075  using async_import_type = AsyncableImport<MatrixType>;
5076 
5077  // distructed objects
5081  Teuchos::RCP<async_import_type> async_importer;
5082  bool overlap_communication_and_computation;
5083 
5084  // copy of Y (mutable to penentrate const)
5085  mutable typename impl_type::tpetra_multivector_type Z;
5086  mutable typename impl_type::impl_scalar_type_1d_view W;
5087 
5088  // local objects
5089  part_interface_type part_interface;
5090  block_tridiags_type block_tridiags; // D
5091  amd_type a_minus_d; // R = A - D
5092  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
5093  mutable norm_manager_type norm_manager;
5094  };
5095 
5096  } // namespace BlockTriDiContainerDetails
5097 
5098 } // namespace Ifpack2
5099 
5100 #endif
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:140
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
size_type size() const
size_t size_type
Definition: Ifpack2_BlockHelper.hpp:253
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:884
int applyInverseJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Z, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::vector_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:4875
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
BlockTridiags< MatrixType > createBlockTridiags(const BlockHelperDetails::PartInterface< MatrixType > &interf)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1620
Definition: Ifpack2_BlockHelper.hpp:351
void performSymbolicPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &g, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, BlockHelperDetails::AmD< MatrixType > &amd, const bool overlap_communication_and_computation, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, bool useSeqMethod)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1862
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:96
void performNumericPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tiny)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3599
void send(const Packet sendBuffer[], const Ordinal count, const int destRank, const int tag, const Comm< Ordinal > &comm)
T * getRawPtr() const
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockHelper.hpp:262
Definition: Ifpack2_BlockHelper.hpp:188
BlockHelperDetails::PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::Array< Teuchos::Array< typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type > > &partitions, const typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type n_subparts_per_part_in)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1043
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockHelper.hpp:320
Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:163
RCP< CommRequest< Ordinal > > isend(const ArrayRCP< const Packet > &sendBuffer, const int destRank, const int tag, const Comm< Ordinal > &comm)
#define TEUCHOS_ASSERT(assertion_test)
Definition: Ifpack2_BlockHelper.hpp:215
Definition: Ifpack2_BlockHelper.hpp:249
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1565
Definition: Ifpack2_BlockComputeResidualVector.hpp:23
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3634