Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_BlockTriDiContainer_impl.hpp
1 /*@HEADER
2 // ***********************************************************************
3 //
4 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 //@HEADER
41 */
42 
43 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
44 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
45 
46 //#define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
47 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
48 
50 
51 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
52 #include <Tpetra_Distributor.hpp>
53 #include <Tpetra_BlockMultiVector.hpp>
54 
55 #include <Kokkos_ArithTraits.hpp>
56 #include <KokkosBatched_Util.hpp>
57 #include <KokkosBatched_Vector.hpp>
58 #include <KokkosBatched_Copy_Decl.hpp>
59 #include <KokkosBatched_Copy_Impl.hpp>
60 #include <KokkosBatched_AddRadial_Decl.hpp>
61 #include <KokkosBatched_AddRadial_Impl.hpp>
62 #include <KokkosBatched_SetIdentity_Decl.hpp>
63 #include <KokkosBatched_SetIdentity_Impl.hpp>
64 #include <KokkosBatched_Gemm_Decl.hpp>
65 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
66 #include <KokkosBatched_Gemm_Team_Impl.hpp>
67 #include <KokkosBatched_Gemv_Decl.hpp>
68 #include <KokkosBatched_Gemv_Team_Impl.hpp>
69 #include <KokkosBatched_Trsm_Decl.hpp>
70 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
71 #include <KokkosBatched_Trsm_Team_Impl.hpp>
72 #include <KokkosBatched_Trsv_Decl.hpp>
73 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
74 #include <KokkosBatched_Trsv_Team_Impl.hpp>
75 #include <KokkosBatched_LU_Decl.hpp>
76 #include <KokkosBatched_LU_Serial_Impl.hpp>
77 #include <KokkosBatched_LU_Team_Impl.hpp>
78 
79 #include <KokkosBlas1_nrm1.hpp>
80 #include <KokkosBlas1_nrm2.hpp>
81 
82 #include <memory>
83 
84 #include "Ifpack2_BlockHelper.hpp"
85 #include "Ifpack2_BlockComputeResidualVector.hpp"
86 
87 //#include <KokkosBlas2_gemv.hpp>
88 
89 // need to interface this into cmake variable (or only use this flag when it is necessary)
90 //#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
91 //#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
92 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
93 #include "cuda_profiler_api.h"
94 #endif
95 
96 // I am not 100% sure about the mpi 3 on cuda
97 #if MPI_VERSION >= 3
98 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
99 #endif
100 
101 // ::: Experiments :::
102 // define either pinned memory or cudamemory for mpi
103 // if both macros are disabled, it will use tpetra memory space which is uvm space for cuda
104 // if defined, this use pinned memory instead of device pointer
105 // by default, we enable pinned memory
106 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
107 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
108 
109 // if defined, all views are allocated on cuda space intead of cuda uvm space
110 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
111 
112 // if defined, btdm_scalar_type is used (if impl_scala_type is double, btdm_scalar_type is float)
113 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
114 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
115 #endif
116 
117 // if defined, it uses multiple execution spaces
118 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
119 
120 namespace Ifpack2 {
121 
122  namespace BlockTriDiContainerDetails {
123 
124  namespace KB = KokkosBatched;
125 
129  using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
130 
131  template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
132  using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
133  MemoryTraitsType::is_random_access |
134  flag>;
135 
136  template <typename ViewType>
137  using Unmanaged = Kokkos::View<typename ViewType::data_type,
138  typename ViewType::array_layout,
139  typename ViewType::device_type,
140  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
141  template <typename ViewType>
142  using Atomic = Kokkos::View<typename ViewType::data_type,
143  typename ViewType::array_layout,
144  typename ViewType::device_type,
145  MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
146  template <typename ViewType>
147  using Const = Kokkos::View<typename ViewType::const_data_type,
148  typename ViewType::array_layout,
149  typename ViewType::device_type,
150  typename ViewType::memory_traits>;
151  template <typename ViewType>
152  using ConstUnmanaged = Const<Unmanaged<ViewType> >;
153 
154  template <typename ViewType>
155  using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
156 
157  template <typename ViewType>
158  using Unmanaged = Kokkos::View<typename ViewType::data_type,
159  typename ViewType::array_layout,
160  typename ViewType::device_type,
161  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
162 
163 
164  template <typename ViewType>
165  using Scratch = Kokkos::View<typename ViewType::data_type,
166  typename ViewType::array_layout,
167  typename ViewType::execution_space::scratch_memory_space,
168  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
169 
173  template<typename T> struct BlockTridiagScalarType { typedef T type; };
174 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
175  template<> struct BlockTridiagScalarType<double> { typedef float type; };
176  //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
177 #endif
178 
179 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
180 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
181  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
182 
183 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
184  { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
185 #else
186 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
188 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
189 #endif
190 
194  template<typename MatrixType>
196  createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
197  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
199  using tpetra_map_type = typename impl_type::tpetra_map_type;
200  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
201  using tpetra_import_type = typename impl_type::tpetra_import_type;
202 
203  const auto g = A->getCrsGraph(); // tpetra crs graph object
204  const auto blocksize = A->getBlockSize();
205  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
206  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
207  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
208  return Teuchos::rcp(new tpetra_import_type(src, tgt));
209  }
210 
211  // Partial replacement for forward-mode MultiVector::doImport.
212  // Permits overlapped communication and computation, but also supports sync'ed.
213  // I'm finding that overlapped comm/comp can give quite poor performance on some
214  // platforms, so we can't just use it straightforwardly always.
215 
216  template<typename MatrixType>
217  struct AsyncableImport {
218  public:
220 
221  private:
225 #if !defined(HAVE_IFPACK2_MPI)
226  typedef int MPI_Request;
227  typedef int MPI_Comm;
228 #endif
229  using scalar_type = typename impl_type::scalar_type;
232 
233  static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
234 #ifdef HAVE_IFPACK2_MPI
235  MPI_Request ureq;
236  int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
237  if (ireq == NULL) MPI_Request_free(&ureq);
238  return ret;
239 #else
240  return 0;
241 #endif
242  }
243 
244  static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
245 #ifdef HAVE_IFPACK2_MPI
246  MPI_Request ureq;
247  int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
248  if (ireq == NULL) MPI_Request_free(&ureq);
249  return ret;
250 #else
251  return 0;
252 #endif
253  }
254 
255  static int waitany(int count, MPI_Request* reqs, int* index) {
256 #ifdef HAVE_IFPACK2_MPI
257  return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
258 #else
259  return 0;
260 #endif
261  }
262 
263  static int waitall(int count, MPI_Request* reqs) {
264 #ifdef HAVE_IFPACK2_MPI
265  return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
266 #else
267  return 0;
268 #endif
269  }
270 
271  public:
272  using tpetra_map_type = typename impl_type::tpetra_map_type;
273  using tpetra_import_type = typename impl_type::tpetra_import_type;
274 
275  using local_ordinal_type = typename impl_type::local_ordinal_type;
276  using global_ordinal_type = typename impl_type::global_ordinal_type;
277  using size_type = typename impl_type::size_type;
278  using impl_scalar_type = typename impl_type::impl_scalar_type;
279 
280  using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
281  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
282 
283  using execution_space = typename impl_type::execution_space;
284  using memory_space = typename impl_type::memory_space;
285  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
286  using size_type_1d_view = typename impl_type::size_type_1d_view;
287  using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
288 
289 #if defined(KOKKOS_ENABLE_CUDA)
290  using impl_scalar_type_1d_view =
291  typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
292 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
293  Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
294 # elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
295  Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
296 # else // no experimental macros are defined
297  typename impl_type::impl_scalar_type_1d_view,
298 # endif
299  typename impl_type::impl_scalar_type_1d_view>::type;
300 #else
301  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
302 #endif
303  using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type*,Kokkos::HostSpace>;
304  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
305  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
306 
307 #ifdef HAVE_IFPACK2_MPI
308  MPI_Comm comm;
309 #endif
310 
311  impl_scalar_type_2d_view_tpetra remote_multivector;
312  local_ordinal_type blocksize;
313 
314  template<typename T>
315  struct SendRecvPair {
316  T send, recv;
317  };
318 
319  // (s)end and (r)eceive data:
320  SendRecvPair<int_1d_view_host> pids; // mpi ranks
321  SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is pointer, cannot use kokkos view
322  SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
323  SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
324  SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
325  SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
326  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host; // data buffer
327 
328  local_ordinal_type_1d_view dm2cm; // permutation
329 
330 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
331  using exec_instance_1d_std_vector = std::vector<execution_space>;
332  exec_instance_1d_std_vector exec_instances;
333 #endif
334 
335  // for cuda
336  public:
337  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
338  const size_type_1d_view &offs) {
339  // wrap lens to kokkos view and deep copy to device
340  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
341  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
342 
343  // exclusive scan
344  const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
345  const local_ordinal_type lens_size = lens_device.extent(0);
346  Kokkos::parallel_scan
347  ("AsyncableImport::RangePolicy::setOffsetValues",
348  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
349  if (final)
350  offs(i) = update;
351  update += (i < lens_size ? lens_device[i] : 0);
352  });
353  }
354 
355  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
356  const size_type_1d_view_host &offs) {
357  // wrap lens to kokkos view and deep copy to device
358  Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
359  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
360 
361  // exclusive scan
362  offs(0) = 0;
363  for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
364  offs(i) = offs(i-1) + lens[i-1];
365  }
366  }
367 
368  private:
369  void createMpiRequests(const tpetra_import_type &import) {
370  Tpetra::Distributor &distributor = import.getDistributor();
371 
372  // copy pids from distributor
373  const auto pids_from = distributor.getProcsFrom();
374  pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
375  memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
376 
377  const auto pids_to = distributor.getProcsTo();
378  pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
379  memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
380 
381  // mpi requests
382  reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
383  reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
384 
385  // construct offsets
386 #if 0
387  const auto lengths_to = distributor.getLengthsTo();
388  offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
389 
390  const auto lengths_from = distributor.getLengthsFrom();
391  offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
392 
393  setOffsetValues(lengths_to, offset.send);
394  offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
395 
396  setOffsetValues(lengths_from, offset.recv);
397  offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
398 #else
399  const auto lengths_to = distributor.getLengthsTo();
400  offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
401 
402  const auto lengths_from = distributor.getLengthsFrom();
403  offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
404 
405  setOffsetValuesHost(lengths_to, offset_host.send);
406  //offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
407 
408  setOffsetValuesHost(lengths_from, offset_host.recv);
409  //offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
410 #endif
411  }
412 
413  void createSendRecvIDs(const tpetra_import_type &import) {
414  // For each remote PID, the list of LIDs to receive.
415  const auto remote_lids = import.getRemoteLIDs();
416  const local_ordinal_type_1d_view_host
417  remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
418  lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
419  Kokkos::deep_copy(lids.recv, remote_lids_view_host);
420 
421  // For each export PID, the list of LIDs to send.
422  auto epids = import.getExportPIDs();
423  auto elids = import.getExportLIDs();
424  TEUCHOS_ASSERT(epids.size() == elids.size());
425  lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
426  auto lids_send_host = Kokkos::create_mirror_view(lids.send);
427 
428  // naive search (not sure if pids or epids are sorted)
429  for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
430  const auto pid_send_value = pids.send[i];
431  for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
432  if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
433  TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
434  }
435  Kokkos::deep_copy(lids.send, lids_send_host);
436  }
437 
438  void createExecutionSpaceInstances() {
439 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
440  //The following line creates 8 streams:
441  exec_instances =
442  Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
443 #endif
444  }
445 
446  public:
447  // for cuda, all tag types are public
448  struct ToBuffer {};
449  struct ToMultiVector {};
450 
451  AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
453  const local_ordinal_type blocksize_,
454  const local_ordinal_type_1d_view dm2cm_) {
455  blocksize = blocksize_;
456  dm2cm = dm2cm_;
457 
458 #ifdef HAVE_IFPACK2_MPI
459  comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
460 #endif
461  const tpetra_import_type import(src_map, tgt_map);
462 
463  createMpiRequests(import);
464  createSendRecvIDs(import);
465  createExecutionSpaceInstances();
466  }
467 
468  void createDataBuffer(const local_ordinal_type &num_vectors) {
469  const size_type extent_0 = lids.recv.extent(0)*blocksize;
470  const size_type extent_1 = num_vectors;
471  if (remote_multivector.extent(0) == extent_0 &&
472  remote_multivector.extent(1) == extent_1) {
473  // skip
474  } else {
475  remote_multivector =
476  impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
477 
478  const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
479  const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
480 
481  buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
482  buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
483 
484  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
485  buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
486  buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
487  }
488  }
489  }
490 
491  void cancel () {
492 #ifdef HAVE_IFPACK2_MPI
493  waitall(reqs.recv.size(), reqs.recv.data());
494  waitall(reqs.send.size(), reqs.send.data());
495 #endif
496  }
497 
498  // ======================================================================
499  // Async version using execution space instances
500  // ======================================================================
501 
502 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
503  template<typename PackTag>
504  static
505  void copy(const local_ordinal_type_1d_view &lids_,
506  const impl_scalar_type_1d_view &buffer_,
507  const local_ordinal_type ibeg_,
508  const local_ordinal_type iend_,
509  const impl_scalar_type_2d_view_tpetra &multivector_,
510  const local_ordinal_type blocksize_,
511  const execution_space &exec_instance_) {
512  const local_ordinal_type num_vectors = multivector_.extent(1);
513  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
514  const local_ordinal_type idiff = iend_ - ibeg_;
515  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
516 
517  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
518  local_ordinal_type vector_size(0);
519  if (blocksize_ <= 4) vector_size = 4;
520  else if (blocksize_ <= 8) vector_size = 8;
521  else if (blocksize_ <= 16) vector_size = 16;
522  else vector_size = 32;
523 
524  const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
525  const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
526  Kokkos::parallel_for
527  (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
528  Kokkos::Experimental::require(policy, work_item_property),
529  KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
530  const local_ordinal_type i = member.league_rank();
531  Kokkos::parallel_for
532  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
533  auto aptr = abase + blocksize_*(i + idiff*j);
534  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
535  if (std::is_same<PackTag,ToBuffer>::value)
536  Kokkos::parallel_for
537  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
538  aptr[k] = bptr[k];
539  });
540  else
541  Kokkos::parallel_for
542  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
543  bptr[k] = aptr[k];
544  });
545  });
546  });
547  }
548 
549  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
550  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
551 
552 #ifdef HAVE_IFPACK2_MPI
553  // constants and reallocate data buffers if necessary
554  const local_ordinal_type num_vectors = mv.extent(1);
555  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
556 
557  // 0. post receive async
558  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
559  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
560  irecv(comm,
561  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
562  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
563  pids.recv[i],
564  42,
565  &reqs.recv[i]);
566  }
567  else {
568  irecv(comm,
569  reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
570  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
571  pids.recv[i],
572  42,
573  &reqs.recv[i]);
574  }
575  }
576 
578  execution_space().fence();
579 
580  // 1. async memcpy
581  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
582  // 1.0. enqueue pack buffer
583  if (i<8) exec_instances[i%8].fence();
584  copy<ToBuffer>(lids.send, buffer.send,
585  offset_host.send(i), offset_host.send(i+1),
586  mv, blocksize,
587  //execution_space());
588  exec_instances[i%8]);
589  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
590  //if (i<8) exec_instances[i%8].fence();
591  const local_ordinal_type num_vectors = mv.extent(1);
592  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
593 
594  Kokkos::deep_copy(exec_instances[i%8],
595  Kokkos::subview(buffer_host.send,
596  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
597  offset_host.send(i)*mv_blocksize,
598  offset_host.send(i+1)*mv_blocksize)),
599  Kokkos::subview(buffer.send,
600  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
601  offset_host.send(i)*mv_blocksize,
602  offset_host.send(i+1)*mv_blocksize)));
603  }
604  }
606  //execution_space().fence();
607  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
608  // 1.1. sync the stream and isend
609  if (i<8) exec_instances[i%8].fence();
610  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
611  isend(comm,
612  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
613  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
614  pids.send[i],
615  42,
616  &reqs.send[i]);
617  }
618  else {
619  isend(comm,
620  reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
621  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
622  pids.send[i],
623  42,
624  &reqs.send[i]);
625  }
626  }
627 
628  // 2. poke communication
629  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
630  int flag;
631  MPI_Status stat;
632  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
633  }
634 #endif // HAVE_IFPACK2_MPI
635  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
636  }
637 
638  void syncRecvVar1() {
639  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
640 #ifdef HAVE_IFPACK2_MPI
641  // 0. wait for receive async.
642  for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
643  local_ordinal_type idx = i;
644 
645  // 0.0. wait any
646  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
647 
648  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
649  const local_ordinal_type num_vectors = remote_multivector.extent(1);
650  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
651 
652  Kokkos::deep_copy(
653  Kokkos::subview(buffer.recv,
654  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
655  offset_host.recv(idx)*mv_blocksize,
656  offset_host.recv(idx+1)*mv_blocksize)),
657  Kokkos::subview(buffer_host.recv,
658  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
659  offset_host.recv(idx)*mv_blocksize,
660  offset_host.recv(idx+1)*mv_blocksize)));
661  }
662 
663  // 0.1. unpack data after data is moved into a device
664  copy<ToMultiVector>(lids.recv, buffer.recv,
665  offset_host.recv(idx), offset_host.recv(idx+1),
666  remote_multivector, blocksize,
667  exec_instances[idx%8]);
668  }
669 
670  // 1. fire up all cuda events
671  Kokkos::fence();
672 
673  // 2. cleanup all open comm
674  waitall(reqs.send.size(), reqs.send.data());
675 #endif // HAVE_IFPACK2_MPI
676  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
677  }
678 #endif //defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
679 
680  // ======================================================================
681  // Generic version without using execution space instances
682  // - only difference between device and host architecture is on using team
683  // or range policies.
684  // ======================================================================
685  template<typename PackTag>
686  static
687  void copy(const local_ordinal_type_1d_view &lids_,
688  const impl_scalar_type_1d_view &buffer_,
689  const local_ordinal_type &ibeg_,
690  const local_ordinal_type &iend_,
691  const impl_scalar_type_2d_view_tpetra &multivector_,
692  const local_ordinal_type blocksize_) {
693  const local_ordinal_type num_vectors = multivector_.extent(1);
694  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
695  const local_ordinal_type idiff = iend_ - ibeg_;
696  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
697  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
698  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
699  local_ordinal_type vector_size(0);
700  if (blocksize_ <= 4) vector_size = 4;
701  else if (blocksize_ <= 8) vector_size = 8;
702  else if (blocksize_ <= 16) vector_size = 16;
703  else vector_size = 32;
704  const team_policy_type policy(idiff, 1, vector_size);
705  Kokkos::parallel_for
706  ("AsyncableImport::TeamPolicy::copy",
707  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
708  const local_ordinal_type i = member.league_rank();
709  Kokkos::parallel_for
710  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
711  auto aptr = abase + blocksize_*(i + idiff*j);
712  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
713  if (std::is_same<PackTag,ToBuffer>::value)
714  Kokkos::parallel_for
715  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
716  aptr[k] = bptr[k];
717  });
718  else
719  Kokkos::parallel_for
720  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
721  bptr[k] = aptr[k];
722  });
723  });
724  });
725  } else {
726  const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
727  Kokkos::parallel_for
728  ("AsyncableImport::RangePolicy::copy",
729  policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
730  const local_ordinal_type i = ij%idiff;
731  const local_ordinal_type j = ij/idiff;
732  auto aptr = abase + blocksize_*(i + idiff*j);
733  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
734  auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
735  auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
736  memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
737  });
738  }
739  }
740 
741 
745  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
746  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
747 
748 #ifdef HAVE_IFPACK2_MPI
749  // constants and reallocate data buffers if necessary
750  const local_ordinal_type num_vectors = mv.extent(1);
751  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
752 
753  // receive async
754  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
755  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
756  irecv(comm,
757  reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
758  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
759  pids.recv[i],
760  42,
761  &reqs.recv[i]);
762  }
763  else {
764  irecv(comm,
765  reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
766  (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
767  pids.recv[i],
768  42,
769  &reqs.recv[i]);
770  }
771  }
772 
773  // send async
774  for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
775  copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
776  mv, blocksize);
777  Kokkos::fence();
778  if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
779  isend(comm,
780  reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
781  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
782  pids.send[i],
783  42,
784  &reqs.send[i]);
785  }
786  else {
787  Kokkos::deep_copy(
788  Kokkos::subview(buffer_host.send,
789  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
790  offset_host.send(i)*mv_blocksize,
791  offset_host.send(i+1)*mv_blocksize)),
792  Kokkos::subview(buffer.send,
793  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
794  offset_host.send(i)*mv_blocksize,
795  offset_host.send(i+1)*mv_blocksize)));
796  isend(comm,
797  reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
798  (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
799  pids.send[i],
800  42,
801  &reqs.send[i]);
802  }
803  }
804 
805  // I find that issuing an Iprobe seems to nudge some MPIs into action,
806  // which helps with overlapped comm/comp performance.
807  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
808  int flag;
809  MPI_Status stat;
810  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
811  }
812 #endif
813  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
814  }
815 
816  void syncRecvVar0() {
817  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
818 #ifdef HAVE_IFPACK2_MPI
819  // receive async.
820  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
821  local_ordinal_type idx = i;
822  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
823  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
824  const local_ordinal_type num_vectors = remote_multivector.extent(1);
825  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
826  Kokkos::deep_copy(
827  Kokkos::subview(buffer.recv,
828  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
829  offset_host.recv(idx)*mv_blocksize,
830  offset_host.recv(idx+1)*mv_blocksize)),
831  Kokkos::subview(buffer_host.recv,
832  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
833  offset_host.recv(idx)*mv_blocksize,
834  offset_host.recv(idx+1)*mv_blocksize)));
835  }
836  copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
837  remote_multivector, blocksize);
838  }
839  // wait on the sends to match all Isends with a cleanup operation.
840  waitall(reqs.send.size(), reqs.send.data());
841 #endif
842  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
843  }
844 
848  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
849 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
850 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
851  asyncSendRecvVar1(mv);
852 #else
853  asyncSendRecvVar0(mv);
854 #endif
855 #else
856  asyncSendRecvVar0(mv);
857 #endif
858  }
859  void syncRecv() {
860 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
861 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
862  syncRecvVar1();
863 #else
864  syncRecvVar0();
865 #endif
866 #else
867  syncRecvVar0();
868 #endif
869  }
870 
871  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
872  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
873  asyncSendRecv(mv);
874  syncRecv();
875  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
876  }
877 
878  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
879  };
880 
884  template<typename MatrixType>
886  createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
888  using tpetra_map_type = typename impl_type::tpetra_map_type;
889  using local_ordinal_type = typename impl_type::local_ordinal_type;
890  using global_ordinal_type = typename impl_type::global_ordinal_type;
891  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
892 
893  const auto g = A->getCrsGraph(); // tpetra crs graph object
894  const auto blocksize = A->getBlockSize();
895  const auto domain_map = g.getDomainMap();
896  const auto column_map = g.getColMap();
897 
898  std::vector<global_ordinal_type> gids;
899  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
900  for (size_t i=0;i<column_map->getLocalNumElements();++i) {
901  const global_ordinal_type gid = column_map->getGlobalElement(i);
902  if (!domain_map->isNodeGlobalElement(gid)) {
903  found_first = true;
904  gids.push_back(gid);
905  } else if (found_first) {
906  separate_remotes = false;
907  break;
908  }
909  if (!need_owned_permutation &&
910  domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
911  // The owned part of the domain and column maps are different
912  // orderings. We *could* do a super efficient impl of this case in the
913  // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
914  // really, if a caller cares about speed, they wouldn't make different
915  // local permutations like this. So we punt on the best impl and go for
916  // a pretty good one: the permutation is done in place in
917  // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
918  // is the presumably worse memory access pattern of the input vector.
919  need_owned_permutation = true;
920  }
921  }
922 
923  if (separate_remotes) {
925  const auto parsimonious_col_map
926  = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
927  if (parsimonious_col_map->getGlobalNumElements() > 0) {
928  // make the importer only if needed.
929  local_ordinal_type_1d_view dm2cm;
930  if (need_owned_permutation) {
931  dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
932  const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
933  for (size_t i=0;i<domain_map->getLocalNumElements();++i)
934  dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
935  Kokkos::deep_copy(dm2cm, dm2cm_host);
936  }
937  return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
938  }
939  }
940  return Teuchos::null;
941  }
942 
943  template<typename local_ordinal_type>
944  local_ordinal_type costTRSM(const local_ordinal_type block_size) {
945  return block_size*block_size;
946  }
947 
948  template<typename local_ordinal_type>
949  local_ordinal_type costGEMV(const local_ordinal_type block_size) {
950  return 2*block_size*block_size;
951  }
952 
953  template<typename local_ordinal_type>
954  local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
955  return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length-1) * costGEMV(block_size);
956  }
957 
958  template<typename local_ordinal_type>
959  local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
960  const local_ordinal_type num_teams,
961  const local_ordinal_type line_length,
962  const local_ordinal_type block_size,
963  const local_ordinal_type n_subparts_per_part) {
964  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part-1) * 2) / n_subparts_per_part);
965  if (subline_length < 1) {
966  return INT_MAX;
967  }
968 
969  const local_ordinal_type p_n_lines = ceil(double(num_parts)/num_teams);
970  const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part)*num_parts/num_teams);
971  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part-1)*num_parts/num_teams);
972 
973  const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
974  const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part-1)*2,block_size);
975  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length,block_size);
976  const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
977 
978  if (n_subparts_per_part == 1) {
979  return p_costApplyAinv;
980  }
981  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
982  }
983 
984  template<typename local_ordinal_type>
985  local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
986  const local_ordinal_type num_teams,
987  const local_ordinal_type line_length,
988  const local_ordinal_type block_size) {
989  local_ordinal_type n_subparts_per_part_0 = 1;
990  local_ordinal_type flop_0 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0);
991  local_ordinal_type flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0+1);
992  while (flop_0 > flop_1) {
993  flop_0 = flop_1;
994  flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, (++n_subparts_per_part_0)+1);
995  }
996  return n_subparts_per_part_0;
997  }
998 
999  template<typename ArgActiveExecutionMemorySpace>
1000  struct SolveTridiagsDefaultModeAndAlgo;
1001 
1005  template<typename MatrixType>
1006  BlockHelperDetails::PartInterface<MatrixType>
1007  createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1008  const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type> > &partitions,
1009  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1010  IFPACK2_BLOCKHELPER_TIMER("createPartInterface");
1011  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1012  using local_ordinal_type = typename impl_type::local_ordinal_type;
1013  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1014  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
1015  using size_type = typename impl_type::size_type;
1016 
1017  const auto blocksize = A->getBlockSize();
1018  constexpr int vector_length = impl_type::vector_length;
1019  constexpr int internal_vector_length = impl_type::internal_vector_length;
1020 
1021  const auto comm = A->getRowMap()->getComm();
1022 
1023  BlockHelperDetails::PartInterface<MatrixType> interf;
1024 
1025  const bool jacobi = partitions.size() == 0;
1026  const local_ordinal_type A_n_lclrows = A->getLocalNumRows();
1027  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1028 
1029  typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1030  std::vector<size_idx_pair_type> partsz(nparts);
1031 
1032  if (!jacobi) {
1033  for (local_ordinal_type i=0;i<nparts;++i)
1034  partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1035  std::sort(partsz.begin(), partsz.end(),
1036  [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
1037  return x.first > y.first;
1038  });
1039  }
1040 
1041  local_ordinal_type n_subparts_per_part;
1042  if (n_subparts_per_part_in == -1) {
1043  // If the number of subparts is set to -1, the user let the algorithm
1044  // decides the value automatically
1045  using execution_space = typename impl_type::execution_space;
1046 
1047  const int line_length = partsz[0].first;
1048 
1049  const local_ordinal_type team_size =
1050  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1051  recommended_team_size(blocksize, vector_length, internal_vector_length);
1052 
1053  const local_ordinal_type num_teams = execution_space().concurrency() / (team_size * vector_length);
1054 
1055  n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1056 
1057  printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1058  }
1059  else {
1060  n_subparts_per_part = n_subparts_per_part_in;
1061  }
1062 
1063  // Total number of sub lines:
1064  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1065  // Total number of sub lines + the Schur complement blocks.
1066  // For a given live 2 sub lines implies one Schur complement, 3 sub lines implies two Schur complements etc.
1067  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1);
1068 
1069 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1070  local_ordinal_type nrows = 0;
1071  if (jacobi)
1072  nrows = nparts;
1073  else
1074  for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1075 
1077  (nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1078  << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
1079 #endif
1080 
1081  // permutation vector
1082  std::vector<local_ordinal_type> p;
1083  if (jacobi) {
1084  interf.max_partsz = 1;
1085  interf.max_subpartsz = 0;
1086  interf.n_subparts_per_part = 1;
1087  interf.nparts = nparts;
1088  } else {
1089  // reorder parts to maximize simd packing efficiency
1090  p.resize(nparts);
1091 
1092  for (local_ordinal_type i=0;i<nparts;++i)
1093  p[i] = partsz[i].second;
1094 
1095  interf.max_partsz = partsz[0].first;
1096 
1097  constexpr local_ordinal_type connection_length = 2;
1098  const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1099  const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1100 
1101  interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1102  interf.n_subparts_per_part = n_subparts_per_part;
1103  interf.nparts = nparts;
1104  }
1105 
1106  // allocate parts
1107  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1108  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1109  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1110  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1111  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1112 
1113  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1114  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1115  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1116 
1117  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);
1118 
1119  // mirror to host and compute on host execution space
1120  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1121  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1122 
1123  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1124  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1125  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1126  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1127 
1128  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1129  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1130  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1131 
1132  // Determine parts.
1133  interf.row_contiguous = true;
1134  partptr(0) = 0;
1135  part2rowidx0(0) = 0;
1136  part2packrowidx0(0) = 0;
1137  local_ordinal_type pack_nrows = 0;
1138  local_ordinal_type pack_nrows_sub = 0;
1139  if (jacobi) {
1140  IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)");
1141  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1142  constexpr local_ordinal_type ipnrows = 1;
1143  //assume No overlap.
1144  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1145  // Since parts are ordered in decreasing size, the size of the first
1146  // part in a pack is the size for all parts in the pack.
1147  if (ip % vector_length == 0) pack_nrows = ipnrows;
1148  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1149  const local_ordinal_type offset = partptr(ip);
1150  for (local_ordinal_type i=0;i<ipnrows;++i) {
1151  const auto lcl_row = ip;
1152  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1153  BlockHelperDetails::get_msg_prefix(comm)
1154  << "partitions[" << p[ip] << "]["
1155  << i << "] = " << lcl_row
1156  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1157  << "].");
1158  lclrow(offset+i) = lcl_row;
1159  rowidx2part(offset+i) = ip;
1160  if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1161  interf.row_contiguous = false;
1162  }
1163  partptr(ip+1) = offset + ipnrows;
1164  }
1165  part2rowidx0_sub(0) = 0;
1166  partptr_sub(0, 0) = 0;
1167 
1168  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1169  constexpr local_ordinal_type ipnrows = 1;
1170  const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1171 
1173  (full_line_length != ipnrows, std::logic_error,
1174  "In the part " << ip );
1175 
1176  constexpr local_ordinal_type connection_length = 2;
1177 
1178  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1180  (true, std::logic_error,
1181  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1182 
1183  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1184  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1185 
1186  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1187 
1188  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1189  const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1190  const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1191  if (local_sub_ip != n_subparts_per_part-1) {
1192  if (local_sub_ip != 0) {
1193  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1194  }
1195  else if (ip != 0) {
1196  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1197  }
1198  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1199  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1200  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1201 
1202  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1203  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1204 
1205 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1206  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1207  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1208 #endif
1209  }
1210  else {
1211  if (local_sub_ip != 0) {
1212  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1213  }
1214  else if (ip != 0) {
1215  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1216  }
1217  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1218 
1219  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1220 
1221 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1222  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1223 #endif
1224  }
1225  }
1226  }
1227 
1228 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1229  std::cout << "partptr_sub = " << std::endl;
1230  for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1231  for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1232  std::cout << partptr_sub(i,j) << " ";
1233  }
1234  std::cout << std::endl;
1235  }
1236  std::cout << "partptr_sub end" << std::endl;
1237 #endif
1238 
1239  {
1240  local_ordinal_type npacks = ceil(float(nparts)/vector_length);
1241 
1242  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1243  for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1244  part2packrowidx0_sub(ip, 0) = 0;
1245  }
1246  for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1247  if (ipack != 0) {
1248  local_ordinal_type ip_min = ipack*vector_length;
1249  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1250  for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1251  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1252  }
1253  }
1254 
1255  for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1256  local_ordinal_type ip_min = ipack*vector_length;
1257  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1258 
1259  const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1260 
1261  constexpr local_ordinal_type connection_length = 2;
1262 
1263  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1264  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1265 
1266  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1267  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1268  if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1269 
1270  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1271 
1272  for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1273  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1274  }
1275  }
1276  }
1277 
1278  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1279  }
1280  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1281  } else {
1282  IFPACK2_BLOCKHELPER_TIMER("compute part indices");
1283  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1284  const auto* part = &partitions[p[ip]];
1285  const local_ordinal_type ipnrows = part->size();
1286  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1287  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1288  BlockHelperDetails::get_msg_prefix(comm)
1289  << "partition " << p[ip]
1290  << " is empty, which is not allowed.");
1291  //assume No overlap.
1292  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1293  // Since parts are ordered in decreasing size, the size of the first
1294  // part in a pack is the size for all parts in the pack.
1295  if (ip % vector_length == 0) pack_nrows = ipnrows;
1296  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1297  const local_ordinal_type offset = partptr(ip);
1298  for (local_ordinal_type i=0;i<ipnrows;++i) {
1299  const auto lcl_row = (*part)[i];
1300  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1301  BlockHelperDetails::get_msg_prefix(comm)
1302  << "partitions[" << p[ip] << "]["
1303  << i << "] = " << lcl_row
1304  << " but input matrix implies limits of [0, " << A_n_lclrows-1
1305  << "].");
1306  lclrow(offset+i) = lcl_row;
1307  rowidx2part(offset+i) = ip;
1308  if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1309  interf.row_contiguous = false;
1310  }
1311  partptr(ip+1) = offset + ipnrows;
1312 
1313 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1314  printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1315  printf("partptr(%d+1) = %d\n", ip, partptr(ip+1));
1316 #endif
1317  }
1318 
1319  part2rowidx0_sub(0) = 0;
1320  partptr_sub(0, 0) = 0;
1321  //const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1322 
1323  for (local_ordinal_type ip=0;ip<nparts;++ip) {
1324  const auto* part = &partitions[p[ip]];
1325  const local_ordinal_type ipnrows = part->size();
1326  const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1327 
1329  (full_line_length != ipnrows, std::logic_error,
1330  "In the part " << ip );
1331 
1332  constexpr local_ordinal_type connection_length = 2;
1333 
1334  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1336  (true, std::logic_error,
1337  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1338 
1339  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1340  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1341 
1342  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1343 
1344  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1345  const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1346  const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1347  if (local_sub_ip != n_subparts_per_part-1) {
1348  if (local_sub_ip != 0) {
1349  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1350  }
1351  else if (ip != 0) {
1352  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1353  }
1354  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1355  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1356  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1357 
1358  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1359  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1360 
1361 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1362  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1363  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1364 #endif
1365  }
1366  else {
1367  if (local_sub_ip != 0) {
1368  partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1369  }
1370  else if (ip != 0) {
1371  partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1372  }
1373  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1374 
1375  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1376 
1377 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1378  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1379 #endif
1380  }
1381  }
1382  }
1383 
1384  {
1385  local_ordinal_type npacks = ceil(float(nparts)/vector_length);
1386 
1387  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1388  for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1389  part2packrowidx0_sub(ip, 0) = 0;
1390  }
1391  for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1392  if (ipack != 0) {
1393  local_ordinal_type ip_min = ipack*vector_length;
1394  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1395  for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1396  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1397  }
1398  }
1399 
1400  for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1401  local_ordinal_type ip_min = ipack*vector_length;
1402  ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1403 
1404  const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1405 
1406  constexpr local_ordinal_type connection_length = 2;
1407 
1408  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1409  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1410 
1411  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1412  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1413  if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1414 
1415  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1416 
1417  for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1418  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1419  }
1420  }
1421  }
1422 
1423  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1424  }
1425  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1426  }
1427 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1428  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1429 #endif
1430  if (lclrow(0) != 0) interf.row_contiguous = false;
1431 
1432  Kokkos::deep_copy(interf.partptr, partptr);
1433  Kokkos::deep_copy(interf.lclrow, lclrow);
1434 
1435  Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1436 
1437  //assume No overlap. Thus:
1438  interf.part2rowidx0 = interf.partptr;
1439  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1440 
1441  interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1442  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1443 
1444  { // Fill packptr.
1445  IFPACK2_BLOCKHELPER_TIMER("Fill packptr");
1446  local_ordinal_type npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1447  npacks = 0;
1448  for (local_ordinal_type ip=1;ip<=nparts;++ip) //n_sub_parts_and_schur
1449  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1450  ++npacks;
1451 
1452  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1453  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1454  packptr(0) = 0;
1455  for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1456  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1457  packptr(k++) = ip;
1458 
1459  Kokkos::deep_copy(interf.packptr, packptr);
1460 
1461  local_ordinal_type npacks_per_subpart = ceil(float(nparts)/vector_length);
1462  npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1463 
1464  interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart*n_subparts_per_part);
1465  interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart,n_subparts_per_part-1);
1466 
1467  const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1468  const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1469 
1470 
1471  // Fill packindices_sub and packindices_schur
1472  for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part-1;++local_sub_ip) {
1473  for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1474  packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1475  packindices_schur(local_pack_ip,local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1476  }
1477  }
1478 
1479  for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1480  packindices_sub((n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip;
1481  }
1482 
1483 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1484  std::cout << "packindices_sub = " << std::endl;
1485  for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1486  std::cout << packindices_sub(i) << " ";
1487  }
1488  std::cout << std::endl;
1489  std::cout << "packindices_sub end" << std::endl;
1490 
1491  std::cout << "packindices_schur = " << std::endl;
1492  for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1493  for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1494  std::cout << packindices_schur(i,j) << " ";
1495  }
1496  std::cout << std::endl;
1497  }
1498 
1499  std::cout << "packindices_schur end" << std::endl;
1500 #endif
1501 
1502  Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1503  Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1504 
1505  interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1506  const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1507  packptr_sub(0) = 0;
1508  for (local_ordinal_type k=0;k<npacks + 1;++k)
1509  packptr_sub(k) = packptr(k%npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1510 
1511  Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1512  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1513  }
1514  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1515 
1516  return interf;
1517  }
1518 
1522  template <typename MatrixType>
1523  struct BlockTridiags {
1525  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1526  using size_type_1d_view = typename impl_type::size_type_1d_view;
1527  using size_type_2d_view = typename impl_type::size_type_2d_view;
1528  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1529  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1530 
1531  // flat_td_ptr(i) is the index into flat-array values of the start of the
1532  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1533  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1534  // vector_length is the position in the pack.
1535  size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1536  // List of local column indices into A from which to grab
1537  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1538  local_ordinal_type_1d_view A_colindsub;
1539  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1540  // tridiag's pack, and i % vector_length gives the position in the pack.
1541  vector_type_3d_view values;
1542  // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1543  // Schur's pack, and i % vector_length gives the position in the pack.
1544  vector_type_3d_view values_schur;
1545  // inv(A_00)*A_01 block values.
1546  vector_type_4d_view e_values;
1547 
1548  bool is_diagonal_only;
1549 
1550  BlockTridiags() = default;
1551  BlockTridiags(const BlockTridiags &b) = default;
1552 
1553  // Index into row-major block of a tridiag.
1554  template <typename idx_type>
1555  static KOKKOS_FORCEINLINE_FUNCTION
1556  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1557  // Given a row of a row-major tridiag, return the index of the first block
1558  // in that row.
1559  template <typename idx_type>
1560  static KOKKOS_FORCEINLINE_FUNCTION
1561  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1562  // Number of blocks in a tridiag having a given number of rows.
1563  template <typename idx_type>
1564  static KOKKOS_FORCEINLINE_FUNCTION
1565  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
1566  // Number of blocks associated to a Schur complement having a given number of rows.
1567  template <typename idx_type>
1568  static KOKKOS_FORCEINLINE_FUNCTION
1569  idx_type NumBlocksSchur (const idx_type& nrows) { return nrows > 0 ? 3*nrows + 2 : 0; }
1570  };
1571 
1572 
1576  template<typename MatrixType>
1578  createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1579  IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags");
1580  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1581  using execution_space = typename impl_type::execution_space;
1582  using local_ordinal_type = typename impl_type::local_ordinal_type;
1583  using size_type = typename impl_type::size_type;
1584  using size_type_2d_view = typename impl_type::size_type_2d_view;
1585 
1586  constexpr int vector_length = impl_type::vector_length;
1587 
1589 
1590  const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1591 
1592  { // construct the flat index pointers into the tridiag values array.
1593  btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1594  const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part );
1595  Kokkos::parallel_scan
1596  ("createBlockTridiags::RangePolicy::flat_td_ptr",
1597  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1598  const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part);
1599  const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1600 
1601  if (final) {
1602  btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1603  }
1604  if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) {
1605  const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0);
1606  if (local_subpartidx % 2 == 0)
1607  update += btdm.NumBlocks(nrows);
1608  else
1609  update += btdm.NumBlocksSchur(nrows);
1610  }
1611  });
1612 
1613  const auto nblocks = Kokkos::create_mirror_view_and_copy
1614  (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1));
1615  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1616  }
1617 
1618  // And the packed index pointers.
1619  if (vector_length == 1) {
1620  btdm.pack_td_ptr = btdm.flat_td_ptr;
1621  } else {
1622  //const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1623 
1624  local_ordinal_type npacks_per_subpart = 0;
1625  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1626  Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1627  for (local_ordinal_type ip=1;ip<=interf.nparts;++ip) //n_sub_parts_and_schur
1628  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1629  ++npacks_per_subpart;
1630 
1631  btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1632  const Kokkos::RangePolicy<execution_space> policy(0,npacks_per_subpart);
1633 
1634  Kokkos::parallel_for
1635  ("createBlockTridiags::RangePolicy::pack_td_ptr",
1636  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1637  for (local_ordinal_type j = 0; j < 2*interf.n_subparts_per_part; ++j) {
1638  const local_ordinal_type pack_id = ( j == 2*interf.n_subparts_per_part-1 ) ? i+(j-1)*npacks_per_subpart : i+j*npacks_per_subpart;
1639  const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id+1) - interf.packptr_sub(pack_id);
1640 
1641  const local_ordinal_type parti = interf.packptr_sub(pack_id);
1642  const local_ordinal_type partidx = parti%interf.nparts;
1643 
1644  for (local_ordinal_type pti=0;pti<nparts_in_pack;++pti) {
1645  btdm.pack_td_ptr(partidx+pti, j) = btdm.flat_td_ptr(i, j);
1646  }
1647  }
1648  });
1649  }
1650 
1651  btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1652 
1653  const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1654  constexpr local_ordinal_type connection_length = 2;
1655 
1656  host_pack_td_ptr_schur(0,0) = 0;
1657  for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1658  if (i % vector_length == 0) {
1659  if (i != 0)
1660  host_pack_td_ptr_schur(i,0) = host_pack_td_ptr_schur(i-1,host_pack_td_ptr_schur.extent(1)-1);
1661  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part-1; ++j) {
1662  host_pack_td_ptr_schur(i,j+1) = host_pack_td_ptr_schur(i,j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part-2 ? 1 : 0);
1663  }
1664  }
1665  else {
1666  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1667  host_pack_td_ptr_schur(i,j) = host_pack_td_ptr_schur(i-1,j);
1668  }
1669  }
1670  }
1671 
1672  Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1673 
1674 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1675  const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1676  std::cout << "flat_td_ptr = " << std::endl;
1677  for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1678  for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1679  std::cout << host_flat_td_ptr(i,j) << " ";
1680  }
1681  std::cout << std::endl;
1682  }
1683  std::cout << "flat_td_ptr end" << std::endl;
1684 
1685  const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1686 
1687  std::cout << "pack_td_ptr = " << std::endl;
1688  for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1689  for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1690  std::cout << host_pack_td_ptr(i,j) << " ";
1691  }
1692  std::cout << std::endl;
1693  }
1694  std::cout << "pack_td_ptr end" << std::endl;
1695 
1696 
1697  std::cout << "pack_td_ptr_schur = " << std::endl;
1698  for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1699  for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1700  std::cout << host_pack_td_ptr_schur(i,j) << " ";
1701  }
1702  std::cout << std::endl;
1703  }
1704  std::cout << "pack_td_ptr_schur end" << std::endl;
1705 #endif
1706 
1707  // values and A_colindsub are created in the symbolic phase
1708  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1709 
1710  return btdm;
1711  }
1712 
1713  // Set the tridiags to be I to the full pack block size. That way, if a
1714  // tridiag within a pack is shorter than the longest one, the extra blocks are
1715  // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1716  // in the packed multvector are 0, and the tridiag LU reflects the extra I
1717  // blocks, then the solve proceeds as though the extra blocks aren't
1718  // present. Since this extra work is part of the SIMD calls, it's not actually
1719  // extra work. Instead, it means we don't have to put checks or masks in, or
1720  // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1721  // since the numeric phase fills in only the used entries, leaving these I
1722  // blocks intact.
1723  template<typename MatrixType>
1724  void
1725  setTridiagsToIdentity
1726  (const BlockTridiags<MatrixType>& btdm,
1727  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1728  {
1729  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1730  using execution_space = typename impl_type::execution_space;
1731  using local_ordinal_type = typename impl_type::local_ordinal_type;
1732  using size_type_2d_view = typename impl_type::size_type_2d_view;
1733 
1734  const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1735  const local_ordinal_type blocksize = btdm.values.extent(1);
1736 
1737  {
1738  const int vector_length = impl_type::vector_length;
1739  const int internal_vector_length = impl_type::internal_vector_length;
1740 
1741  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1742  using internal_vector_type = typename impl_type::internal_vector_type;
1743  using internal_vector_type_4d_view =
1744  typename impl_type::internal_vector_type_4d_view;
1745 
1746  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1747  const internal_vector_type_4d_view values
1748  (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1749  btdm.values.extent(0),
1750  btdm.values.extent(1),
1751  btdm.values.extent(2),
1752  vector_length/internal_vector_length);
1753  const local_ordinal_type vector_loop_size = values.extent(3);
1754 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1755  local_ordinal_type total_team_size(0);
1756  if (blocksize <= 5) total_team_size = 32;
1757  else if (blocksize <= 9) total_team_size = 64;
1758  else if (blocksize <= 12) total_team_size = 96;
1759  else if (blocksize <= 16) total_team_size = 128;
1760  else if (blocksize <= 20) total_team_size = 160;
1761  else total_team_size = 160;
1762  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1763  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1764 #elif defined(KOKKOS_ENABLE_HIP)
1765  // FIXME: HIP
1766  // These settings might be completely wrong
1767  // will have to do some experiments to decide
1768  // what makes sense on AMD GPUs
1769  local_ordinal_type total_team_size(0);
1770  if (blocksize <= 5) total_team_size = 32;
1771  else if (blocksize <= 9) total_team_size = 64;
1772  else if (blocksize <= 12) total_team_size = 96;
1773  else if (blocksize <= 16) total_team_size = 128;
1774  else if (blocksize <= 20) total_team_size = 160;
1775  else total_team_size = 160;
1776  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1777  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1778 #elif defined(KOKKOS_ENABLE_SYCL)
1779  // SYCL: FIXME
1780  local_ordinal_type total_team_size(0);
1781  if (blocksize <= 5) total_team_size = 32;
1782  else if (blocksize <= 9) total_team_size = 64;
1783  else if (blocksize <= 12) total_team_size = 96;
1784  else if (blocksize <= 16) total_team_size = 128;
1785  else if (blocksize <= 20) total_team_size = 160;
1786  else total_team_size = 160;
1787  const local_ordinal_type team_size = total_team_size/vector_loop_size;
1788  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1789 #else
1790  // Host architecture: team size is always one
1791  const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1792 #endif
1793  Kokkos::parallel_for
1794  ("setTridiagsToIdentity::TeamPolicy",
1795  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1796  const local_ordinal_type k = member.league_rank();
1797  const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0);
1798  const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1);
1799 
1800  const local_ordinal_type diff = iend - ibeg;
1801  const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1802  const btdm_scalar_type one(1);
1803  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1804  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1805  const local_ordinal_type i = ibeg + ii*3;
1806  for (local_ordinal_type j=0;j<blocksize;++j) {
1807  values(i,j,j,v) = one;
1808  }
1809  });
1810  });
1811  });
1812  }
1813  }
1814 
1818  template<typename MatrixType>
1819  void
1820  performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1821  const BlockHelperDetails::PartInterface<MatrixType> &interf,
1824  const bool overlap_communication_and_computation) {
1825  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase");
1826 
1827  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1828 
1829  // using node_memory_space = typename impl_type::node_memory_space;
1830  using host_execution_space = typename impl_type::host_execution_space;
1831 
1832  using local_ordinal_type = typename impl_type::local_ordinal_type;
1833  using global_ordinal_type = typename impl_type::global_ordinal_type;
1834  using size_type = typename impl_type::size_type;
1835  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1836  using size_type_1d_view = typename impl_type::size_type_1d_view;
1837  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1838  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1839  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1840 
1841  constexpr int vector_length = impl_type::vector_length;
1842 
1843  const auto comm = A->getRowMap()->getComm();
1844 
1845  const auto& g = A->getCrsGraph();
1846 
1847  const auto blocksize = A->getBlockSize();
1848 
1849  // mirroring to host
1850  const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1851  const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1852  const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1853  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1854  const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1855 
1856  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1857 
1858  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());
1859 
1860  // find column to row map on host
1861 
1862  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1863  {
1864  const auto rowmap = g.getRowMap();
1865  const auto colmap = g.getColMap();
1866  const auto dommap = g.getDomainMap();
1867  TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1868 
1869 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1870  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1871  Kokkos::parallel_for
1872  ("performSymbolicPhase::RangePolicy::col2row",
1873  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1874  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1876  if (dommap->isNodeGlobalElement(gid)) {
1877  const local_ordinal_type lc = colmap->getLocalElement(gid);
1878 # if defined(BLOCKTRIDICONTAINER_DEBUG)
1880  BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
1881  << " gives an invalid local column.");
1882 # endif
1883  col2row(lc) = lr;
1884  }
1885  });
1886 #endif
1887  }
1888 
1889  // construct the D and R graphs in A = D + R.
1890  {
1891  const auto local_graph = g.getLocalGraphHost();
1892  const auto local_graph_rowptr = local_graph.row_map;
1893  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1894  const auto local_graph_colidx = local_graph.entries;
1895 
1896  //assume no overlap.
1897 
1898  Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1899  {
1900  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1901  Kokkos::parallel_for
1902  ("performSymbolicPhase::RangePolicy::lclrow2idx",
1903  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1904  lclrow2idx[lclrow(i)] = i;
1905  });
1906  }
1907 
1908  // count (block) nnzs in D and R.
1910  typename sum_reducer_type::value_type sum_reducer_value;
1911  {
1912  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1913  Kokkos::parallel_reduce
1914  // profiling interface does not work
1915  (//"performSymbolicPhase::RangePolicy::count_nnz",
1916  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1917  // LID -> index.
1918  const local_ordinal_type ri0 = lclrow2idx[lr];
1919  const local_ordinal_type pi0 = rowidx2part(ri0);
1920  for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1921  const local_ordinal_type lc = local_graph_colidx(j);
1922  const local_ordinal_type lc2r = col2row[lc];
1923  bool incr_R = false;
1924  do { // breakable
1925  if (lc2r == (local_ordinal_type) -1) {
1926  incr_R = true;
1927  break;
1928  }
1929  const local_ordinal_type ri = lclrow2idx[lc2r];
1930  const local_ordinal_type pi = rowidx2part(ri);
1931  if (pi != pi0) {
1932  incr_R = true;
1933  break;
1934  }
1935  // Test for being in the tridiag. This is done in index space. In
1936  // LID space, tridiag LIDs in a row are not necessarily related by
1937  // {-1, 0, 1}.
1938  if (ri0 + 1 >= ri && ri0 <= ri + 1)
1939  ++update.v[0]; // D_nnz
1940  else
1941  incr_R = true;
1942  } while (0);
1943  if (incr_R) {
1944  if (lc < nrows) ++update.v[1]; // R_nnz_owned
1945  else ++update.v[2]; // R_nnz_remote
1946  }
1947  }
1948  }, sum_reducer_type(sum_reducer_value));
1949  }
1950  size_type D_nnz = sum_reducer_value.v[0];
1951  size_type R_nnz_owned = sum_reducer_value.v[1];
1952  size_type R_nnz_remote = sum_reducer_value.v[2];
1953 
1954  if (!overlap_communication_and_computation) {
1955  R_nnz_owned += R_nnz_remote;
1956  R_nnz_remote = 0;
1957  }
1958 
1959  // construct the D_00 graph.
1960  {
1961  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1962 
1963  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
1964  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
1965 
1966 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1967  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1968 #endif
1969 
1970  const local_ordinal_type nparts = partptr.extent(0) - 1;
1971 
1972  {
1973  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
1974  Kokkos::parallel_for
1975  ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
1976  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
1977  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
1978  local_ordinal_type offset = 0;
1979  for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
1980  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
1981  offset = 1;
1982  const local_ordinal_type lr0 = lclrow(ri0);
1983  const size_type j0 = local_graph_rowptr(lr0);
1984  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
1985  const local_ordinal_type lc = local_graph_colidx(j);
1986  const local_ordinal_type lc2r = col2row[lc];
1987  if (lc2r == (local_ordinal_type) -1) continue;
1988  const local_ordinal_type ri = lclrow2idx[lc2r];
1989  const local_ordinal_type pi = rowidx2part(ri);
1990  if (pi != pi0) continue;
1991  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
1992  const local_ordinal_type row_entry = j - j0;
1993  D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry;
1994  }
1995  }
1996  });
1997  }
1998 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1999  for (size_t i=0;i<D_A_colindsub.extent(0);++i)
2001 #endif
2002  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2003 
2004  // Allocate values.
2005  {
2006  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1);
2007  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2008  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2009 
2010  if (interf.n_subparts_per_part > 1) {
2011  const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1);
2012  const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2013  btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2014  }
2015 
2016  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2017  }
2018  }
2019 
2020  // Construct the R graph.
2021  {
2022  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2023  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2024 
2025  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2026  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2027 
2028  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2029  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2030 
2031  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2032  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2033 
2034  {
2035  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
2036  Kokkos::parallel_for
2037  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2038  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2039  const local_ordinal_type ri0 = lclrow2idx[lr];
2040  const local_ordinal_type pi0 = rowidx2part(ri0);
2041  const size_type j0 = local_graph_rowptr(lr);
2042  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2043  const local_ordinal_type lc = local_graph_colidx(j);
2044  const local_ordinal_type lc2r = col2row[lc];
2045  if (lc2r != (local_ordinal_type) -1) {
2046  const local_ordinal_type ri = lclrow2idx[lc2r];
2047  const local_ordinal_type pi = rowidx2part(ri);
2048  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2049  continue;
2050  }
2051  }
2052  // exclusive scan will be performed later
2053  if (!overlap_communication_and_computation || lc < nrows) {
2054  ++R_rowptr(lr);
2055  } else {
2056  ++R_rowptr_remote(lr);
2057  }
2058  }
2059  });
2060  }
2061 
2062  // exclusive scan
2064  {
2065  Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
2066  Kokkos::parallel_scan
2067  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2068  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
2069  update_type &update,
2070  const bool &final) {
2071  update_type val;
2072  val.v[0] = R_rowptr(lr);
2073  if (overlap_communication_and_computation)
2074  val.v[1] = R_rowptr_remote(lr);
2075 
2076  if (final) {
2077  R_rowptr(lr) = update.v[0];
2078  if (overlap_communication_and_computation)
2079  R_rowptr_remote(lr) = update.v[1];
2080 
2081  if (lr < nrows) {
2082  const local_ordinal_type ri0 = lclrow2idx[lr];
2083  const local_ordinal_type pi0 = rowidx2part(ri0);
2084 
2085  size_type cnt_rowptr = R_rowptr(lr);
2086  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
2087 
2088  const size_type j0 = local_graph_rowptr(lr);
2089  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2090  const local_ordinal_type lc = local_graph_colidx(j);
2091  const local_ordinal_type lc2r = col2row[lc];
2092  if (lc2r != (local_ordinal_type) -1) {
2093  const local_ordinal_type ri = lclrow2idx[lc2r];
2094  const local_ordinal_type pi = rowidx2part(ri);
2095  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2096  continue;
2097  }
2098  const local_ordinal_type row_entry = j - j0;
2099  if (!overlap_communication_and_computation || lc < nrows)
2100  R_A_colindsub(cnt_rowptr++) = row_entry;
2101  else
2102  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2103  }
2104  }
2105  }
2106  update += val;
2107  });
2108  }
2109  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
2110  Kokkos::deep_copy(amd.rowptr, R_rowptr);
2111  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2112  if (overlap_communication_and_computation) {
2113  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
2114  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2115  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2116  }
2117 
2118  // Allocate or view values.
2119  amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->getValuesDeviceNonConst());
2120 
2121  }
2122 
2123  // Allocate view for E and initialize the values with B:
2124 
2125  if (interf.n_subparts_per_part > 1)
2126  btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2127  }
2128  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2129  }
2130 
2131 
2135  template<typename ArgActiveExecutionMemorySpace>
2137 
2138  template<>
2139  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2140  typedef KB::Mode::Serial mode_type;
2141 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2142  typedef KB::Algo::Level3::CompactMKL algo_type;
2143 #else
2144  typedef KB::Algo::Level3::Blocked algo_type;
2145 #endif
2146  static int recommended_team_size(const int /* blksize */,
2147  const int /* vector_length */,
2148  const int /* internal_vector_length */) {
2149  return 1;
2150  }
2151 
2152  };
2153 
2154 #if defined(KOKKOS_ENABLE_CUDA)
2155  static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
2156  const int vector_length,
2157  const int internal_vector_length) {
2158  const int vector_size = vector_length/internal_vector_length;
2159  int total_team_size(0);
2160  if (blksize <= 5) total_team_size = 32;
2161  else if (blksize <= 9) total_team_size = 32; // 64
2162  else if (blksize <= 12) total_team_size = 96;
2163  else if (blksize <= 16) total_team_size = 128;
2164  else if (blksize <= 20) total_team_size = 160;
2165  else total_team_size = 160;
2166  return 2*total_team_size/vector_size;
2167  }
2168  template<>
2169  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2170  typedef KB::Mode::Team mode_type;
2171  typedef KB::Algo::Level3::Unblocked algo_type;
2172  static int recommended_team_size(const int blksize,
2173  const int vector_length,
2174  const int internal_vector_length) {
2175  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2176  }
2177  };
2178  template<>
2179  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2180  typedef KB::Mode::Team mode_type;
2181  typedef KB::Algo::Level3::Unblocked algo_type;
2182  static int recommended_team_size(const int blksize,
2183  const int vector_length,
2184  const int internal_vector_length) {
2185  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2186  }
2187  };
2188 #endif
2189 
2190 #if defined(KOKKOS_ENABLE_HIP)
2191  static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
2192  const int vector_length,
2193  const int internal_vector_length) {
2194  const int vector_size = vector_length/internal_vector_length;
2195  int total_team_size(0);
2196  if (blksize <= 5) total_team_size = 32;
2197  else if (blksize <= 9) total_team_size = 32; // 64
2198  else if (blksize <= 12) total_team_size = 96;
2199  else if (blksize <= 16) total_team_size = 128;
2200  else if (blksize <= 20) total_team_size = 160;
2201  else total_team_size = 160;
2202  return 2*total_team_size/vector_size;
2203  }
2204  template<>
2205  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2206  typedef KB::Mode::Team mode_type;
2207  typedef KB::Algo::Level3::Unblocked algo_type;
2208  static int recommended_team_size(const int blksize,
2209  const int vector_length,
2210  const int internal_vector_length) {
2211  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2212  }
2213  };
2214  template<>
2215  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2216  typedef KB::Mode::Team mode_type;
2217  typedef KB::Algo::Level3::Unblocked algo_type;
2218  static int recommended_team_size(const int blksize,
2219  const int vector_length,
2220  const int internal_vector_length) {
2221  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2222  }
2223  };
2224 #endif
2225 
2226 #if defined(KOKKOS_ENABLE_SYCL)
2227  static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
2228  const int vector_length,
2229  const int internal_vector_length) {
2230  const int vector_size = vector_length/internal_vector_length;
2231  int total_team_size(0);
2232  if (blksize <= 5) total_team_size = 32;
2233  else if (blksize <= 9) total_team_size = 32; // 64
2234  else if (blksize <= 12) total_team_size = 96;
2235  else if (blksize <= 16) total_team_size = 128;
2236  else if (blksize <= 20) total_team_size = 160;
2237  else total_team_size = 160;
2238  return 2*total_team_size/vector_size;
2239  }
2240  template<>
2241  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2242  typedef KB::Mode::Team mode_type;
2243  typedef KB::Algo::Level3::Unblocked algo_type;
2244  static int recommended_team_size(const int blksize,
2245  const int vector_length,
2246  const int internal_vector_length) {
2247  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2248  }
2249  };
2250  template<>
2251  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2252  typedef KB::Mode::Team mode_type;
2253  typedef KB::Algo::Level3::Unblocked algo_type;
2254  static int recommended_team_size(const int blksize,
2255  const int vector_length,
2256  const int internal_vector_length) {
2257  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2258  }
2259  };
2260 #endif
2261 
2262  template<typename impl_type, typename WWViewType>
2263  KOKKOS_INLINE_FUNCTION
2264  void
2265  solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2266  const typename impl_type::local_ordinal_type &/* blocksize */,
2267  const typename impl_type::local_ordinal_type &i0,
2268  const typename impl_type::local_ordinal_type &r0,
2269  const typename impl_type::local_ordinal_type &nrows,
2270  const typename impl_type::local_ordinal_type &v,
2271  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2272  const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2273  const WWViewType &WW,
2274  const bool skip_first_pass=false) {
2275  using execution_space = typename impl_type::execution_space;
2276  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2277  using member_type = typename team_policy_type::member_type;
2278  using local_ordinal_type = typename impl_type::local_ordinal_type;
2279 
2280  typedef SolveTridiagsDefaultModeAndAlgo
2281  <typename execution_space::memory_space> default_mode_and_algo_type;
2282 
2283  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2284  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2285 
2286  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2287 
2288  // constant
2289  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2290  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2291 
2292  // subview pattern
2293  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2294  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2295  auto X2 = X1;
2296 
2297  local_ordinal_type i = i0, r = r0;
2298 
2299 
2300  if (nrows > 1) {
2301  // solve Lx = x
2302  if (skip_first_pass) {
2303  i += (nrows-2) * 3;
2304  r += (nrows-2);
2305  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2306  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2307  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2308  KB::Trsm<member_type,
2309  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2310  default_mode_type,default_algo_type>
2311  ::invoke(member, one, A, X2);
2312  X1.assign_data( X2.data() );
2313  i+=3;
2314  }
2315  else {
2316  KB::Trsm<member_type,
2317  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2318  default_mode_type,default_algo_type>
2319  ::invoke(member, one, A, X1);
2320  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2321  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2322  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2323  member.team_barrier();
2324  KB::Gemm<member_type,
2325  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2326  default_mode_type,default_algo_type>
2327  ::invoke(member, -one, A, X1, one, X2);
2328  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2329  KB::Trsm<member_type,
2330  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2331  default_mode_type,default_algo_type>
2332  ::invoke(member, one, A, X2);
2333  X1.assign_data( X2.data() );
2334  }
2335  }
2336 
2337  // solve Ux = x
2338  KB::Trsm<member_type,
2339  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2340  default_mode_type,default_algo_type>
2341  ::invoke(member, one, A, X1);
2342  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2343  i -= 3;
2344  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2345  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2346  member.team_barrier();
2347  KB::Gemm<member_type,
2348  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2349  default_mode_type,default_algo_type>
2350  ::invoke(member, -one, A, X1, one, X2);
2351 
2352  A.assign_data( &D_internal_vector_values(i,0,0,v) );
2353  KB::Trsm<member_type,
2354  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2355  default_mode_type,default_algo_type>
2356  ::invoke(member, one, A, X2);
2357  X1.assign_data( X2.data() );
2358  }
2359  } else {
2360  // matrix is already inverted
2361  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2362  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2363  ::invoke(member, X1, W);
2364  member.team_barrier();
2365  KB::Gemm<member_type,
2366  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2367  default_mode_type,default_algo_type>
2368  ::invoke(member, one, A, W, zero, X1);
2369  }
2370 
2371  }
2372 
2373  template<typename impl_type, typename WWViewType, typename XViewType>
2374  KOKKOS_INLINE_FUNCTION
2375  void
2376  solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2377  const typename impl_type::local_ordinal_type &blocksize,
2378  const typename impl_type::local_ordinal_type &i0,
2379  const typename impl_type::local_ordinal_type &r0,
2380  const typename impl_type::local_ordinal_type &nrows,
2381  const typename impl_type::local_ordinal_type &v,
2382  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2383  const XViewType &X_internal_vector_values, //Unmanaged<typename impl_type::internal_vector_type_4d_view>
2384  const WWViewType &WW) {
2385  using execution_space = typename impl_type::execution_space;
2386  //using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2387  //using member_type = typename team_policy_type::member_type;
2388  using local_ordinal_type = typename impl_type::local_ordinal_type;
2389 
2390  typedef SolveTridiagsDefaultModeAndAlgo
2391  <typename execution_space::memory_space> default_mode_and_algo_type;
2392 
2393  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2394  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2395 
2396  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2397 
2398  // base pointers
2399  auto A = D_internal_vector_values.data();
2400  auto X = X_internal_vector_values.data();
2401 
2402  // constant
2403  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2404  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2405  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2406 
2407  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2408  const local_ordinal_type astep = D_internal_vector_values.stride_0();
2409  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2410  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2411  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2412  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2413 
2414  // move to starting point
2415  A += i0*astep + v;
2416  X += r0*xstep + v;
2417 
2418  //for (local_ordinal_type col=0;col<num_vectors;++col)
2419  if (nrows > 1) {
2420  // solve Lx = x
2421  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2422  (default_mode_type,default_algo_type,
2423  member,
2424  KB::Diag::Unit,
2425  blocksize,blocksize,
2426  one,
2427  A, as0, as1,
2428  X, xs0);
2429 
2430  for (local_ordinal_type tr=1;tr<nrows;++tr) {
2431  member.team_barrier();
2432  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2433  (default_mode_type,default_algo_type,
2434  member,
2435  blocksize, blocksize,
2436  -one,
2437  A+2*astep, as0, as1,
2438  X, xs0,
2439  one,
2440  X+1*xstep, xs0);
2441  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2442  (default_mode_type,default_algo_type,
2443  member,
2444  KB::Diag::Unit,
2445  blocksize,blocksize,
2446  one,
2447  A+3*astep, as0, as1,
2448  X+1*xstep, xs0);
2449 
2450  A += 3*astep;
2451  X += 1*xstep;
2452  }
2453 
2454  // solve Ux = x
2455  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2456  (default_mode_type,default_algo_type,
2457  member,
2458  KB::Diag::NonUnit,
2459  blocksize, blocksize,
2460  one,
2461  A, as0, as1,
2462  X, xs0);
2463 
2464  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2465  A -= 3*astep;
2466  member.team_barrier();
2467  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2468  (default_mode_type,default_algo_type,
2469  member,
2470  blocksize, blocksize,
2471  -one,
2472  A+1*astep, as0, as1,
2473  X, xs0,
2474  one,
2475  X-1*xstep, xs0);
2476  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2477  (default_mode_type,default_algo_type,
2478  member,
2479  KB::Diag::NonUnit,
2480  blocksize, blocksize,
2481  one,
2482  A, as0, as1,
2483  X-1*xstep,xs0);
2484  X -= 1*xstep;
2485  }
2486  // for multiple rhs
2487  //X += xs1;
2488  } else {
2489  const local_ordinal_type ws0 = WW.stride_0();
2490  auto W = WW.data() + v;
2491  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2492  (default_mode_type,
2493  member, blocksize, X, xs0, W, ws0);
2494  member.team_barrier();
2495  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2496  (default_mode_type,default_algo_type,
2497  member,
2498  blocksize, blocksize,
2499  one,
2500  A, as0, as1,
2501  W, xs0,
2502  zero,
2503  X, xs0);
2504  }
2505  }
2506 
2507  template<typename local_ordinal_type, typename ViewType>
2508  void writeBTDValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2509 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2510  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2511  std::ofstream myfile;
2512  myfile.open (fileName);
2513 
2514  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2515  local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2516  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2517  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2518 
2519  const local_ordinal_type block_size = scalar_values.extent(1);
2520 
2521  const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size;
2522  const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2523 
2524  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2525 
2526  myfile << "%%MatrixMarket matrix coordinate real general"<< std::endl;
2527  myfile << "%%nnz = " << nnz;
2528  myfile << " block size = " << block_size;
2529  myfile << " number of blocks = " << n_blocks;
2530  myfile << " number of parts = " << n_parts;
2531  myfile << " number of blocks per part = " << n_blocks_per_part;
2532  myfile << " number of rows = " << n_rows ;
2533  myfile << " number of cols = " << n_rows;
2534  myfile << " number of packs = " << n_packs << std::endl;
2535 
2536  myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2537 
2538  local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2539  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2540  for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2541  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2542  for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2543  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2544  if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2545  continue;
2546  if (i_block_in_part % 3 == 0) {
2547  current_row_offset = i_block_in_part/3 * block_size;
2548  current_col_offset = i_block_in_part/3 * block_size;
2549  }
2550  else if (i_block_in_part % 3 == 1) {
2551  current_row_offset = (i_block_in_part-1)/3 * block_size;
2552  current_col_offset = ((i_block_in_part-1)/3+1) * block_size;
2553  }
2554  else if (i_block_in_part % 3 == 2) {
2555  current_row_offset = ((i_block_in_part-2)/3+1) * block_size;
2556  current_col_offset = (i_block_in_part-2)/3 * block_size;
2557  }
2558  current_row_offset += current_part_idx * n_rows_per_part;
2559  current_col_offset += current_part_idx * n_rows_per_part;
2560  for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2561  for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2562  current_row = current_row_offset + i_in_block + 1;
2563  current_col = current_col_offset + j_in_block + 1;
2564  myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2565  }
2566  }
2567  }
2568  }
2569  }
2570 
2571  myfile.close();
2572 #endif
2573  }
2574 
2575  template<typename local_ordinal_type, typename ViewType>
2576  void write4DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2577 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2578  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2579  std::ofstream myfile;
2580  myfile.open (fileName);
2581 
2582  const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2583  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2584  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2585 
2586  const local_ordinal_type block_size = scalar_values.extent(1);
2587  const local_ordinal_type n_cols = scalar_values.extent(2);
2588 
2589  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2590  const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2591 
2592  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2593 
2594 
2595  myfile << "%%MatrixMarket matrix array real general"<< std::endl;
2596  myfile << "%%block size = " << block_size;
2597  myfile << " number of blocks = " << n_blocks;
2598  myfile << " number of parts = " << n_parts;
2599  myfile << " number of blocks per part = " << n_blocks_per_part;
2600  myfile << " number of rows = " << n_rows ;
2601  myfile << " number of cols = " << n_cols;
2602  myfile << " number of packs = " << n_packs << std::endl;
2603 
2604  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2605 
2606  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2607  (void) current_row_offset;
2608  (void) current_part_idx;
2609  for (local_ordinal_type j_in_block=0;j_in_block<n_cols;++j_in_block) {
2610  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2611  for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2612  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2613  for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2614  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2615 
2616  if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2617  continue;
2618  for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2619  myfile << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2620  }
2621  }
2622  }
2623  }
2624  }
2625  myfile.close();
2626 #endif
2627  }
2628 
2629  template<typename local_ordinal_type, typename ViewType>
2630  void write5DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2631 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2632  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2633  std::ofstream myfile;
2634  myfile.open (fileName);
2635 
2636  const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2637  const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack;
2638  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2639 
2640  const local_ordinal_type block_size = scalar_values.extent(2);
2641  const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2642  const local_ordinal_type n_cols = n_blocks_cols * block_size;
2643 
2644  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2645  const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2646 
2647  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2648 
2649  myfile << "%%MatrixMarket matrix array real general"<< std::endl;
2650  myfile << "%%block size = " << block_size;
2651  myfile << " number of blocks = " << n_blocks;
2652  myfile << " number of parts = " << n_parts;
2653  myfile << " number of blocks per part = " << n_blocks_per_part;
2654  myfile << " number of rows = " << n_rows ;
2655  myfile << " number of cols = " << n_cols;
2656  myfile << " number of packs = " << n_packs << std::endl;
2657 
2658  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2659 
2660  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2661  (void) current_row_offset;
2662  (void) current_part_idx;
2663  for (local_ordinal_type i_block_col=0;i_block_col<n_blocks_cols;++i_block_col) {
2664  for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2665  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2666  for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2667  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2668  for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2669  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2670 
2671  if (current_block_idx >= (local_ordinal_type) scalar_values.extent(1))
2672  continue;
2673  for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2674  myfile << scalar_values(i_block_col,current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2675  }
2676  }
2677  }
2678  }
2679  }
2680  }
2681  myfile.close();
2682 #endif
2683  }
2684 
2685  template<typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2686  KOKKOS_INLINE_FUNCTION
2687  void
2688  copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2689 /*
2690  // Kokkos::Experimental::local_deep_copy
2691  auto teamVectorRange =
2692  Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2693  member, view1.extent(0), view1.extent(1), view1.extent(2));
2694 
2695  Kokkos::parallel_for
2696  (teamVectorRange,
2697  [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2698  view1(i,j,k) = view2(i,j,k);
2699  });
2700 */
2701  Kokkos::Experimental::local_deep_copy(member, view1, view2);
2702  }
2703  template<typename MatrixType>
2704  struct ExtractAndFactorizeTridiags {
2705  public:
2706  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2707  // a functor cannot have both device_type and execution_space; specialization error in kokkos
2708  using execution_space = typename impl_type::execution_space;
2709  using memory_space = typename impl_type::memory_space;
2711  using local_ordinal_type = typename impl_type::local_ordinal_type;
2712  using size_type = typename impl_type::size_type;
2713  using impl_scalar_type = typename impl_type::impl_scalar_type;
2714  using magnitude_type = typename impl_type::magnitude_type;
2716  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
2718  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2719  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2720  using size_type_2d_view = typename impl_type::size_type_2d_view;
2721  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2723  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2724  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2725  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2726  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2727  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2728  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2729  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2730  using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2731  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2732  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2733 
2734  using internal_vector_type = typename impl_type::internal_vector_type;
2735  static constexpr int vector_length = impl_type::vector_length;
2736  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2737 
2739  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2740  using member_type = typename team_policy_type::member_type;
2741 
2742  private:
2743  // part interface
2744  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2745  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2746  const local_ordinal_type max_partsz;
2747  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
2748  using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
2749  const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
2750  const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2751  // block tridiags
2752  const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2753  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2754  const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2755  const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2756  const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2757  const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2758  // shared information
2759  const local_ordinal_type blocksize, blocksize_square;
2760  // diagonal safety
2761  const magnitude_type tiny;
2762  const local_ordinal_type vector_loop_size;
2763  const local_ordinal_type vector_length_value;
2764 
2765  public:
2766  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2767  const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2769  const magnitude_type& tiny_) :
2770  // interface
2771  partptr(interf_.partptr),
2772  lclrow(interf_.lclrow),
2773  packptr(interf_.packptr),
2774  packindices_sub(interf_.packindices_sub),
2775  packptr_sub(interf_.packptr_sub),
2776  partptr_sub(interf_.partptr_sub),
2777  part2packrowidx0_sub(interf_.part2packrowidx0_sub),
2778  packindices_schur(interf_.packindices_schur),
2779  max_partsz(interf_.max_partsz),
2780  // block crs matrix
2781  A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map),
2782  A_values(const_cast<block_crs_matrix_type*>(A_.get())->getValuesDeviceNonConst()),
2783  // block tridiags
2784  pack_td_ptr(btdm_.pack_td_ptr),
2785  flat_td_ptr(btdm_.flat_td_ptr),
2786  pack_td_ptr_schur(btdm_.pack_td_ptr_schur),
2787  A_colindsub(btdm_.A_colindsub),
2788  internal_vector_values((internal_vector_type*)btdm_.values.data(),
2789  btdm_.values.extent(0),
2790  btdm_.values.extent(1),
2791  btdm_.values.extent(2),
2792  vector_length/internal_vector_length),
2793  internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(),
2794  btdm_.values_schur.extent(0),
2795  btdm_.values_schur.extent(1),
2796  btdm_.values_schur.extent(2),
2797  vector_length/internal_vector_length),
2798  e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(),
2799  btdm_.e_values.extent(0),
2800  btdm_.e_values.extent(1),
2801  btdm_.e_values.extent(2),
2802  btdm_.e_values.extent(3),
2803  vector_length/internal_vector_length),
2804  scalar_values((btdm_scalar_type*)btdm_.values.data(),
2805  btdm_.values.extent(0),
2806  btdm_.values.extent(1),
2807  btdm_.values.extent(2),
2808  vector_length),
2809  scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(),
2810  btdm_.values_schur.extent(0),
2811  btdm_.values_schur.extent(1),
2812  btdm_.values_schur.extent(2),
2813  vector_length),
2814  e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(),
2815  btdm_.e_values.extent(0),
2816  btdm_.e_values.extent(1),
2817  btdm_.e_values.extent(2),
2818  btdm_.e_values.extent(3),
2819  vector_length),
2820  blocksize(btdm_.values.extent(1)),
2821  blocksize_square(blocksize*blocksize),
2822  // diagonal weight to avoid zero pivots
2823  tiny(tiny_),
2824  vector_loop_size(vector_length/internal_vector_length),
2825  vector_length_value(vector_length) {}
2826 
2827  private:
2828 
2829  KOKKOS_INLINE_FUNCTION
2830  void
2831  extract(local_ordinal_type partidx,
2832  local_ordinal_type local_subpartidx,
2833  local_ordinal_type npacks) const {
2834 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2835  printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
2836 #endif
2837  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2838  const size_type kps = pack_td_ptr(partidx, local_subpartidx);
2839  local_ordinal_type kfs[vector_length] = {};
2840  local_ordinal_type ri0[vector_length] = {};
2841  local_ordinal_type nrows[vector_length] = {};
2842 
2843  for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2844  kfs[vi] = flat_td_ptr(partidx,local_subpartidx);
2845  ri0[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,0);
2846  nrows[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,1) - ri0[vi];
2847 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2848  printf("kfs[%d] = %d;\n", vi, kfs[vi]);
2849  printf("ri0[%d] = %d;\n", vi, ri0[vi]);
2850  printf("nrows[%d] = %d;\n", vi, nrows[vi]);
2851 #endif
2852  }
2853  local_ordinal_type tr_min = 0;
2854  local_ordinal_type tr_max = nrows[0];
2855  if (local_subpartidx % 2 == 1) {
2856  tr_min -= 1;
2857  tr_max += 1;
2858  }
2859 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2860  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2861 #endif
2862  for (local_ordinal_type tr=tr_min,j=0;tr<tr_max;++tr) {
2863  for (local_ordinal_type e=0;e<3;++e) {
2864  const impl_scalar_type* block[vector_length] = {};
2865  for (local_ordinal_type vi=0;vi<npacks;++vi) {
2866  const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2867  block[vi] = &A_values(Aj*blocksize_square);
2868  }
2869  const size_type pi = kps + j;
2870 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2871  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
2872 #endif
2873  ++j;
2874  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2875  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2876  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2877  auto& v = internal_vector_values(pi, ii, jj, 0);
2878  for (local_ordinal_type vi=0;vi<npacks;++vi)
2879  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2880  }
2881  }
2882 
2883  if (nrows[0] == 1) break;
2884  if (local_subpartidx % 2 == 0) {
2885  if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
2886  for (local_ordinal_type vi=1;vi<npacks;++vi) {
2887  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
2888  npacks = vi;
2889  break;
2890  }
2891  }
2892  }
2893  else {
2894  if (e == 0 && (tr == -1 || tr == nrows[0])) break;
2895  for (local_ordinal_type vi=1;vi<npacks;++vi) {
2896  if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
2897  npacks = vi;
2898  break;
2899  }
2900  }
2901  }
2902  }
2903  }
2904  }
2905 
2906  KOKKOS_INLINE_FUNCTION
2907  void
2908  extract(const member_type &member,
2909  const local_ordinal_type &partidxbeg,
2910  local_ordinal_type local_subpartidx,
2911  const local_ordinal_type &npacks,
2912  const local_ordinal_type &vbeg) const {
2913 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2914  printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
2915 #endif
2916  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2917  local_ordinal_type kfs_vals[internal_vector_length] = {};
2918  local_ordinal_type ri0_vals[internal_vector_length] = {};
2919  local_ordinal_type nrows_vals[internal_vector_length] = {};
2920 
2921  const size_type kps = pack_td_ptr(partidxbeg,local_subpartidx);
2922  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2923  kfs_vals[vi] = flat_td_ptr(partidxbeg+vi,local_subpartidx);
2924  ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,0);
2925  nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,1) - ri0_vals[vi];
2926 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2927  printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
2928  printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
2929  printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
2930 #endif
2931  }
2932 
2933  local_ordinal_type j_vals[internal_vector_length] = {};
2934 
2935  local_ordinal_type tr_min = 0;
2936  local_ordinal_type tr_max = nrows_vals[0];
2937  if (local_subpartidx % 2 == 1) {
2938  tr_min -= 1;
2939  tr_max += 1;
2940  }
2941 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2942  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2943 #endif
2944  for (local_ordinal_type tr=tr_min;tr<tr_max;++tr) {
2945  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2946  const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
2947  if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows+1)) {
2948  auto &j = j_vals[vi];
2949  const local_ordinal_type kfs = kfs_vals[vi];
2950  const local_ordinal_type ri0 = ri0_vals[vi];
2951  local_ordinal_type lbeg, lend;
2952  if (local_subpartidx % 2 == 0) {
2953  lbeg = (tr == tr_min ? 1 : 0);
2954  lend = (tr == nrows - 1 ? 2 : 3);
2955  }
2956  else {
2957  lbeg = 0;
2958  lend = 3;
2959  if (tr == tr_min) {
2960  lbeg = 1;
2961  lend = 2;
2962  }
2963  else if (tr == nrows) {
2964  lbeg = 0;
2965  lend = 1;
2966  }
2967  }
2968  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
2969  const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
2970  const impl_scalar_type* block = &A_values(Aj*blocksize_square);
2971  const size_type pi = kps + j;
2972 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2973  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
2974 #endif
2975  Kokkos::parallel_for
2976  (Kokkos::TeamThreadRange(member,blocksize),
2977  [&](const local_ordinal_type &ii) {
2978  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2979  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
2980  }
2981  });
2982  }
2983  }
2984  }
2985  }
2986  }
2987 
2988  template<typename AAViewType,
2989  typename WWViewType>
2990  KOKKOS_INLINE_FUNCTION
2991  void
2992  factorize_subline(const member_type &member,
2993  const local_ordinal_type &i0,
2994  const local_ordinal_type &nrows,
2995  const local_ordinal_type &v,
2996  const AAViewType &AA,
2997  const WWViewType &WW) const {
2998 
2999  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3000  <typename execution_space::memory_space> default_mode_and_algo_type;
3001 
3002  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3003  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3004 
3005  // constant
3006  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3007 
3008 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3009  printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3010 #endif
3011 
3012  // subview pattern
3013  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3014  KB::LU<member_type,
3015  default_mode_type,KB::Algo::LU::Unblocked>
3016  ::invoke(member, A , tiny);
3017 
3018  if (nrows > 1) {
3019  auto B = A;
3020  auto C = A;
3021  local_ordinal_type i = i0;
3022  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
3023 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3024  printf("tr = %d, i = %d;\n", tr, i);
3025 #endif
3026  B.assign_data( &AA(i+1,0,0,v) );
3027  KB::Trsm<member_type,
3028  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3029  default_mode_type,default_algo_type>
3030  ::invoke(member, one, A, B);
3031  C.assign_data( &AA(i+2,0,0,v) );
3032  KB::Trsm<member_type,
3033  KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3034  default_mode_type,default_algo_type>
3035  ::invoke(member, one, A, C);
3036  A.assign_data( &AA(i+3,0,0,v) );
3037 
3038  member.team_barrier();
3039  KB::Gemm<member_type,
3040  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3041  default_mode_type,default_algo_type>
3042  ::invoke(member, -one, C, B, one, A);
3043  KB::LU<member_type,
3044  default_mode_type,KB::Algo::LU::Unblocked>
3045  ::invoke(member, A, tiny);
3046  }
3047  } else {
3048  // for block jacobi invert a matrix here
3049  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3050  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
3051  ::invoke(member, A, W);
3052  KB::SetIdentity<member_type,default_mode_type>
3053  ::invoke(member, A);
3054  member.team_barrier();
3055  KB::Trsm<member_type,
3056  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3057  default_mode_type,default_algo_type>
3058  ::invoke(member, one, W, A);
3059  KB::Trsm<member_type,
3060  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3061  default_mode_type,default_algo_type>
3062  ::invoke(member, one, W, A);
3063  }
3064  }
3065 
3066  public:
3067 
3068  struct ExtractAndFactorizeSubLineTag {};
3069  struct ExtractBCDTag {};
3070  struct ComputeETag {};
3071  struct ComputeSchurTag {};
3072  struct FactorizeSchurTag {};
3073 
3074  KOKKOS_INLINE_FUNCTION
3075  void
3076  operator() (const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3077  // btdm is packed and sorted from largest one
3078  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3079 
3080  const local_ordinal_type subpartidx = packptr_sub(packidx);
3081  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3082  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3083  const local_ordinal_type partidx = subpartidx%n_parts;
3084 
3085  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3086  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3087  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3088 
3089  internal_vector_scratch_type_3d_view
3090  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
3091 
3092 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3093  printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3094  printf("vector_loop_size = %d\n", vector_loop_size);
3095 #endif
3096 
3097  if (vector_loop_size == 1) {
3098  extract(partidx, local_subpartidx, npacks);
3099  factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3100  } else {
3101  Kokkos::parallel_for
3102  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3103  [&](const local_ordinal_type &v) {
3104  const local_ordinal_type vbeg = v*internal_vector_length;
3105 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3106  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3107 #endif
3108  if (vbeg < npacks)
3109  extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3110  // this is not safe if vector loop size is different from vector size of
3111  // the team policy. we always make sure this when constructing the team policy
3112  member.team_barrier();
3113  factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3114  });
3115  }
3116  }
3117 
3118  KOKKOS_INLINE_FUNCTION
3119  void
3120  operator() (const ExtractBCDTag &, const member_type &member) const {
3121  // btdm is packed and sorted from largest one
3122  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3123  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3124  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3125 
3126  const local_ordinal_type subpartidx = packptr_sub(packidx);
3127  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3128  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3129  const local_ordinal_type partidx = subpartidx%n_parts;
3130 
3131  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3132  //const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3133  //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3134 
3135  if (vector_loop_size == 1) {
3136  extract(partidx, local_subpartidx, npacks);
3137  }
3138  else {
3139  Kokkos::parallel_for
3140  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3141  [&](const local_ordinal_type &v) {
3142  const local_ordinal_type vbeg = v*internal_vector_length;
3143 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3144  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3145  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3146 #endif
3147  if (vbeg < npacks)
3148  extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3149  });
3150  }
3151 
3152  member.team_barrier();
3153 
3154  const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3155  const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1;
3156 
3157  const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3158  const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3159 
3160 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3161  printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3162 #endif
3163 
3164  // Need to copy D to e_internal_vector_values.
3165  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3166  Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3167 
3168  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3169  Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3170 
3171  }
3172 
3173  KOKKOS_INLINE_FUNCTION
3174  void
3175  operator() (const ComputeETag &, const member_type &member) const {
3176  // btdm is packed and sorted from largest one
3177  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3178 
3179  const local_ordinal_type subpartidx = packptr_sub(packidx);
3180  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3181  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3182  const local_ordinal_type partidx = subpartidx%n_parts;
3183 
3184  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3185  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3186  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3187  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3188  const local_ordinal_type num_vectors = blocksize;
3189 
3190  (void) npacks;
3191 
3192  internal_vector_scratch_type_3d_view
3193  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
3194  if (local_subpartidx == 0) {
3195  Kokkos::parallel_for
3196  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3197  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3198  });
3199  }
3200  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
3201  Kokkos::parallel_for
3202  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3203  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3204  });
3205  }
3206  else {
3207  Kokkos::parallel_for
3208  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3209  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3210  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3211  });
3212  }
3213  }
3214 
3215  KOKKOS_INLINE_FUNCTION
3216  void
3217  operator() (const ComputeSchurTag &, const member_type &member) const {
3218  // btdm is packed and sorted from largest one
3219  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3220  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3221  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3222 
3223  const local_ordinal_type subpartidx = packptr_sub(packidx);
3224  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3225  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3226  const local_ordinal_type partidx = subpartidx%n_parts;
3227 
3228  //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3229  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3230  //const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3231  //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3232 
3233  internal_vector_scratch_type_3d_view
3234  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
3235 
3236  // Compute S = D - C E
3237 
3238  const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
3239  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
3240  const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
3241 
3242  for (local_ordinal_type i = 0; i < 4; ++i) { //pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
3243  copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3244  Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3245  }
3246 
3247  member.team_barrier();
3248 
3249  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3250 
3251  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1;
3252  const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2;
3253 
3254  const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3255  const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3256 
3257  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3258  <typename execution_space::memory_space> default_mode_and_algo_type;
3259 
3260  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3261  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3262 
3263  Kokkos::parallel_for
3264  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3265  for (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) {
3266  local_ordinal_type e_r, e_c, c_kps;
3267 
3268  if ( local_subpartidx_schur == 0 ) {
3269  if ( i == 0 ) {
3270  e_r = e_r1;
3271  e_c = 0;
3272  c_kps = c_kps1;
3273  }
3274  else if ( i == 3 ) {
3275  e_r = e_r2;
3276  e_c = 1;
3277  c_kps = c_kps2;
3278  }
3279  else if ( i == 4 ) {
3280  e_r = e_r2;
3281  e_c = 0;
3282  c_kps = c_kps2;
3283  }
3284  else {
3285  continue;
3286  }
3287  }
3288  else {
3289  if ( i == 0 ) {
3290  e_r = e_r1;
3291  e_c = 1;
3292  c_kps = c_kps1;
3293  }
3294  else if ( i == 1 ) {
3295  e_r = e_r1;
3296  e_c = 0;
3297  c_kps = c_kps1;
3298  }
3299  else if ( i == 4 ) {
3300  e_r = e_r2;
3301  e_c = 1;
3302  c_kps = c_kps2;
3303  }
3304  else if ( i == 5 ) {
3305  e_r = e_r2;
3306  e_c = 0;
3307  c_kps = c_kps2;
3308  }
3309  else {
3310  continue;
3311  }
3312  }
3313 
3314  auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v);
3315  auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3316  auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3317  KB::Gemm<member_type,
3318  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3319  default_mode_type,default_algo_type>
3320  ::invoke(member, -one, C, E, one, S);
3321  }
3322  });
3323  }
3324 
3325  KOKKOS_INLINE_FUNCTION
3326  void
3327  operator() (const FactorizeSchurTag &, const member_type &member) const {
3328  const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3329 
3330  const local_ordinal_type subpartidx = packptr_sub(packidx);
3331 
3332  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3333  const local_ordinal_type partidx = subpartidx%n_parts;
3334 
3335  const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0);
3336  const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);
3337 
3338  internal_vector_scratch_type_3d_view
3339  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
3340 
3341 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3342  printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3343 #endif
3344 
3345  if (vector_loop_size == 1) {
3346  factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3347  } else {
3348  Kokkos::parallel_for
3349  (Kokkos::ThreadVectorRange(member, vector_loop_size),
3350  [&](const local_ordinal_type &v) {
3351  factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3352  });
3353  }
3354  }
3355 
3356  void run() {
3357  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3358  const local_ordinal_type team_size =
3359  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3360  recommended_team_size(blocksize, vector_length, internal_vector_length);
3361  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3362  shmem_size(blocksize, blocksize, vector_loop_size);
3363 
3364  {
3365 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3366  printf("Start ExtractAndFactorizeSubLineTag\n");
3367 #endif
3368  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag");
3369  Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeSubLineTag>
3370  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3371 
3372 
3373  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3374  writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3375 
3376  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3377  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3378  policy, *this);
3379  execution_space().fence();
3380 
3381  writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3382 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3383  printf("End ExtractAndFactorizeSubLineTag\n");
3384 #endif
3385  }
3386 
3387  if (packindices_schur.extent(1) > 0)
3388  {
3389  {
3390 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3391  printf("Start ExtractBCDTag\n");
3392 #endif
3393  Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3394  Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3395 
3396  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3397 
3398  {
3399  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag");
3400  Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
3401  policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3402 
3403  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3404  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3405  policy, *this);
3406  execution_space().fence();
3407  }
3408 
3409 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3410  printf("End ExtractBCDTag\n");
3411 #endif
3412  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3413 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3414  printf("Start ComputeETag\n");
3415 #endif
3416  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3417  {
3418  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag");
3419  Kokkos::TeamPolicy<execution_space,ComputeETag>
3420  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3421 
3422  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3423  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3424  policy, *this);
3425  execution_space().fence();
3426  }
3427  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3428 
3429 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3430  printf("End ComputeETag\n");
3431 #endif
3432  }
3433 
3434  {
3435 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3436  printf("Start ComputeSchurTag\n");
3437 #endif
3438  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag");
3439  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3440  Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
3441  policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3442 
3443  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3444  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3445  policy, *this);
3446  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3447  execution_space().fence();
3448 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3449  printf("End ComputeSchurTag\n");
3450 #endif
3451  }
3452 
3453  {
3454 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3455  printf("Start FactorizeSchurTag\n");
3456 #endif
3457  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag");
3458  Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
3459  policy(packindices_schur.extent(0), team_size, vector_loop_size);
3460  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3461  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3462  policy, *this);
3463  execution_space().fence();
3464  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3465 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3466  printf("End FactorizeSchurTag\n");
3467 #endif
3468  }
3469  }
3470 
3471  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3472  }
3473 
3474  };
3475 
3479  template<typename MatrixType>
3480  void
3481  performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
3482  const BlockHelperDetails::PartInterface<MatrixType> &interf,
3484  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny) {
3485  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase");
3486  ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
3487  function.run();
3488  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3489  }
3490 
3494  template<typename MatrixType>
3496  public:
3498  using execution_space = typename impl_type::execution_space;
3499  using memory_space = typename impl_type::memory_space;
3500 
3501  using local_ordinal_type = typename impl_type::local_ordinal_type;
3502  using impl_scalar_type = typename impl_type::impl_scalar_type;
3503  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3504  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3505  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3506  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3507  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3508  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3509  static constexpr int vector_length = impl_type::vector_length;
3510 
3511  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3512 
3513  private:
3514  // part interface
3515  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3516  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3517  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3518  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3519  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3520  const local_ordinal_type blocksize;
3521  const local_ordinal_type num_vectors;
3522 
3523  // packed multivector output (or input)
3524  vector_type_3d_view packed_multivector;
3525  const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3526 
3527  template<typename TagType>
3528  KOKKOS_INLINE_FUNCTION
3529  void copy_multivectors(const local_ordinal_type &j,
3530  const local_ordinal_type &vi,
3531  const local_ordinal_type &pri,
3532  const local_ordinal_type &ri0) const {
3533  for (local_ordinal_type col=0;col<num_vectors;++col)
3534  for (local_ordinal_type i=0;i<blocksize;++i)
3535  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3536  }
3537 
3538  public:
3539 
3540  MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3541  const vector_type_3d_view &pmv)
3542  : partptr(interf.partptr),
3543  packptr(interf.packptr),
3544  part2packrowidx0(interf.part2packrowidx0),
3545  part2rowidx0(interf.part2rowidx0),
3546  lclrow(interf.lclrow),
3547  blocksize(pmv.extent(1)),
3548  num_vectors(pmv.extent(2)),
3549  packed_multivector(pmv) {}
3550 
3551  // TODO:: modify this routine similar to the team level functions
3552  // inline ---> FIXME HIP: should not need the KOKKOS_INLINE_FUNCTION below...
3553  KOKKOS_INLINE_FUNCTION
3554  void
3555  operator() (const local_ordinal_type &packidx) const {
3556  local_ordinal_type partidx = packptr(packidx);
3557  local_ordinal_type npacks = packptr(packidx+1) - partidx;
3558  const local_ordinal_type pri0 = part2packrowidx0(partidx);
3559 
3560  local_ordinal_type ri0[vector_length] = {};
3561  local_ordinal_type nrows[vector_length] = {};
3562  for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
3563  ri0[v] = part2rowidx0(partidx);
3564  nrows[v] = part2rowidx0(partidx+1) - ri0[v];
3565  }
3566  for (local_ordinal_type j=0;j<nrows[0];++j) {
3567  local_ordinal_type cnt = 1;
3568  for (;cnt<npacks && j!= nrows[cnt];++cnt);
3569  npacks = cnt;
3570  const local_ordinal_type pri = pri0 + j;
3571  for (local_ordinal_type col=0;col<num_vectors;++col)
3572  for (local_ordinal_type i=0;i<blocksize;++i)
3573  for (local_ordinal_type v=0;v<npacks;++v)
3574  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
3575  }
3576  }
3577 
3578  KOKKOS_INLINE_FUNCTION
3579  void
3580  operator() (const member_type &member) const {
3581  const local_ordinal_type packidx = member.league_rank();
3582  const local_ordinal_type partidx_begin = packptr(packidx);
3583  const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
3584  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3585  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3586  const local_ordinal_type partidx = partidx_begin + v;
3587  const local_ordinal_type ri0 = part2rowidx0(partidx);
3588  const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
3589 
3590  if (nrows == 1) {
3591  const local_ordinal_type pri = pri0;
3592  for (local_ordinal_type col=0;col<num_vectors;++col) {
3593  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3594  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
3595  });
3596  }
3597  } else {
3598  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3599  const local_ordinal_type pri = pri0 + j;
3600  for (local_ordinal_type col=0;col<num_vectors;++col)
3601  for (local_ordinal_type i=0;i<blocksize;++i)
3602  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3603  });
3604  }
3605  });
3606  }
3607 
3608  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3609  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3610  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter");
3611 
3612  scalar_multivector = scalar_multivector_;
3613  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3614  const local_ordinal_type vl = vector_length;
3615  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3616  Kokkos::parallel_for
3617  ("MultiVectorConverter::TeamPolicy", policy, *this);
3618  } else {
3619  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3620  Kokkos::parallel_for
3621  ("MultiVectorConverter::RangePolicy", policy, *this);
3622  }
3623  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3624  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3625  }
3626  };
3627 
3631 
3632  template<>
3633  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3634  typedef KB::Mode::Serial mode_type;
3635  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3636 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3637  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3638 #else
3639  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3640 #endif
3641  static int recommended_team_size(const int /* blksize */,
3642  const int /* vector_length */,
3643  const int /* internal_vector_length */) {
3644  return 1;
3645  }
3646  };
3647 
3648 #if defined(KOKKOS_ENABLE_CUDA)
3649  static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
3650  const int vector_length,
3651  const int internal_vector_length) {
3652  const int vector_size = vector_length/internal_vector_length;
3653  int total_team_size(0);
3654  if (blksize <= 5) total_team_size = 32;
3655  else if (blksize <= 9) total_team_size = 32; // 64
3656  else if (blksize <= 12) total_team_size = 96;
3657  else if (blksize <= 16) total_team_size = 128;
3658  else if (blksize <= 20) total_team_size = 160;
3659  else total_team_size = 160;
3660  return total_team_size/vector_size;
3661  }
3662 
3663  template<>
3664  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3665  typedef KB::Mode::Team mode_type;
3666  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3667  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3668  static int recommended_team_size(const int blksize,
3669  const int vector_length,
3670  const int internal_vector_length) {
3671  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3672  }
3673  };
3674  template<>
3675  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3676  typedef KB::Mode::Team mode_type;
3677  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3678  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3679  static int recommended_team_size(const int blksize,
3680  const int vector_length,
3681  const int internal_vector_length) {
3682  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3683  }
3684  };
3685 #endif
3686 
3687 #if defined(KOKKOS_ENABLE_HIP)
3688  static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
3689  const int vector_length,
3690  const int internal_vector_length) {
3691  const int vector_size = vector_length/internal_vector_length;
3692  int total_team_size(0);
3693  if (blksize <= 5) total_team_size = 32;
3694  else if (blksize <= 9) total_team_size = 32; // 64
3695  else if (blksize <= 12) total_team_size = 96;
3696  else if (blksize <= 16) total_team_size = 128;
3697  else if (blksize <= 20) total_team_size = 160;
3698  else total_team_size = 160;
3699  return total_team_size/vector_size;
3700  }
3701 
3702  template<>
3703  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3704  typedef KB::Mode::Team mode_type;
3705  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3706  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3707  static int recommended_team_size(const int blksize,
3708  const int vector_length,
3709  const int internal_vector_length) {
3710  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3711  }
3712  };
3713  template<>
3714  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3715  typedef KB::Mode::Team mode_type;
3716  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3717  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3718  static int recommended_team_size(const int blksize,
3719  const int vector_length,
3720  const int internal_vector_length) {
3721  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3722  }
3723  };
3724 #endif
3725 
3726 #if defined(KOKKOS_ENABLE_SYCL)
3727  static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
3728  const int vector_length,
3729  const int internal_vector_length) {
3730  const int vector_size = vector_length/internal_vector_length;
3731  int total_team_size(0);
3732  if (blksize <= 5) total_team_size = 32;
3733  else if (blksize <= 9) total_team_size = 32; // 64
3734  else if (blksize <= 12) total_team_size = 96;
3735  else if (blksize <= 16) total_team_size = 128;
3736  else if (blksize <= 20) total_team_size = 160;
3737  else total_team_size = 160;
3738  return total_team_size/vector_size;
3739  }
3740 
3741  template<>
3742  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
3743  typedef KB::Mode::Team mode_type;
3744  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3745  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3746  static int recommended_team_size(const int blksize,
3747  const int vector_length,
3748  const int internal_vector_length) {
3749  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3750  }
3751  };
3752  template<>
3753  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
3754  typedef KB::Mode::Team mode_type;
3755  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3756  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3757  static int recommended_team_size(const int blksize,
3758  const int vector_length,
3759  const int internal_vector_length) {
3760  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3761  }
3762  };
3763 #endif
3764 
3765 
3766 
3767 
3768  template<typename MatrixType>
3769  struct SolveTridiags {
3770  public:
3771  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3772  using execution_space = typename impl_type::execution_space;
3773 
3774  using local_ordinal_type = typename impl_type::local_ordinal_type;
3775  using size_type = typename impl_type::size_type;
3776  using impl_scalar_type = typename impl_type::impl_scalar_type;
3777  using magnitude_type = typename impl_type::magnitude_type;
3778  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3779  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
3781  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3782  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
3783  using size_type_2d_view = typename impl_type::size_type_2d_view;
3785  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3786  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
3787  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
3788  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
3789 
3790  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3791 
3792  using internal_vector_type =typename impl_type::internal_vector_type;
3793  static constexpr int vector_length = impl_type::vector_length;
3794  static constexpr int internal_vector_length = impl_type::internal_vector_length;
3795 
3797  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3798  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3799 
3801  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3802  using member_type = typename team_policy_type::member_type;
3803 
3804  private:
3805  // part interface
3806  local_ordinal_type n_subparts_per_part;
3807  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3808  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3809  const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
3810  const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
3811  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3812  const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
3813  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3814  const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
3815 
3816  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
3817  const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
3818 
3819  // block tridiags
3820  const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
3821 
3822  // block tridiags values
3823  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
3824  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
3825  const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
3826 
3827  internal_vector_type_4d_view X_internal_vector_values_schur;
3828 
3829  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
3830  const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
3831 
3832 
3833  const local_ordinal_type vector_loop_size;
3834 
3835  // copy to multivectors : damping factor and Y_scalar_multivector
3836  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
3837 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
3838  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3839 #else
3840  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3841 #endif
3842  const impl_scalar_type df;
3843  const bool compute_diff;
3844 
3845  public:
3846  SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3847  const BlockTridiags<MatrixType> &btdm,
3848  const vector_type_3d_view &pmv,
3849  const impl_scalar_type damping_factor,
3850  const bool is_norm_manager_active)
3851  :
3852  // interface
3853  n_subparts_per_part(interf.n_subparts_per_part),
3854  partptr(interf.partptr),
3855  packptr(interf.packptr),
3856  packindices_sub(interf.packindices_sub),
3857  packindices_schur(interf.packindices_schur),
3858  part2packrowidx0(interf.part2packrowidx0),
3859  part2packrowidx0_sub(interf.part2packrowidx0_sub),
3860  lclrow(interf.lclrow),
3861  packptr_sub(interf.packptr_sub),
3862  partptr_sub(interf.partptr_sub),
3863  pack_td_ptr_schur(btdm.pack_td_ptr_schur),
3864  // block tridiags and multivector
3865  pack_td_ptr(btdm.pack_td_ptr),
3866  D_internal_vector_values((internal_vector_type*)btdm.values.data(),
3867  btdm.values.extent(0),
3868  btdm.values.extent(1),
3869  btdm.values.extent(2),
3870  vector_length/internal_vector_length),
3871  X_internal_vector_values((internal_vector_type*)pmv.data(),
3872  pmv.extent(0),
3873  pmv.extent(1),
3874  pmv.extent(2),
3875  vector_length/internal_vector_length),
3876  X_internal_scalar_values((btdm_scalar_type*)pmv.data(),
3877  pmv.extent(0),
3878  pmv.extent(1),
3879  pmv.extent(2),
3880  vector_length),
3881  X_internal_vector_values_schur(do_not_initialize_tag("X_internal_vector_values_schur"),
3882  2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0),
3883  pmv.extent(1),
3884  pmv.extent(2),
3885  vector_length/internal_vector_length),
3886  D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(),
3887  btdm.values_schur.extent(0),
3888  btdm.values_schur.extent(1),
3889  btdm.values_schur.extent(2),
3890  vector_length/internal_vector_length),
3891  e_internal_vector_values((internal_vector_type*)btdm.e_values.data(),
3892  btdm.e_values.extent(0),
3893  btdm.e_values.extent(1),
3894  btdm.e_values.extent(2),
3895  btdm.e_values.extent(3),
3896  vector_length/internal_vector_length),
3897  vector_loop_size(vector_length/internal_vector_length),
3898  Y_scalar_multivector(),
3899  Z_scalar_vector(),
3900  df(damping_factor),
3901  compute_diff(is_norm_manager_active)
3902  {}
3903 
3904  public:
3905 
3907  KOKKOS_INLINE_FUNCTION
3908  void
3909  copyToFlatMultiVector(const member_type &member,
3910  const local_ordinal_type partidxbeg, // partidx for v = 0
3911  const local_ordinal_type npacks,
3912  const local_ordinal_type pri0,
3913  const local_ordinal_type v, // index with a loop of vector_loop_size
3914  const local_ordinal_type blocksize,
3915  const local_ordinal_type num_vectors) const {
3916  const local_ordinal_type vbeg = v*internal_vector_length;
3917  if (vbeg < npacks) {
3918  local_ordinal_type ri0_vals[internal_vector_length] = {};
3919  local_ordinal_type nrows_vals[internal_vector_length] = {};
3920  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
3921  const local_ordinal_type partidx = partidxbeg+vv;
3922  ri0_vals[vi] = partptr(partidx);
3923  nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
3924  }
3925 
3926  impl_scalar_type z_partial_sum(0);
3927  if (nrows_vals[0] == 1) {
3928  const local_ordinal_type j=0, pri=pri0;
3929  {
3930  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
3931  const local_ordinal_type ri0 = ri0_vals[vi];
3932  const local_ordinal_type nrows = nrows_vals[vi];
3933  if (j < nrows) {
3934  Kokkos::parallel_for
3935  (Kokkos::TeamThreadRange(member, blocksize),
3936  [&](const local_ordinal_type &i) {
3937  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
3938  for (local_ordinal_type col=0;col<num_vectors;++col) {
3939  impl_scalar_type &y = Y_scalar_multivector(row,col);
3940  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
3941  y += df*yd;
3942 
3943  {//if (compute_diff) {
3944  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
3945  z_partial_sum += yd_abs*yd_abs;
3946  }
3947  }
3948  });
3949  }
3950  }
3951  }
3952  } else {
3953  Kokkos::parallel_for
3954  (Kokkos::TeamThreadRange(member, nrows_vals[0]),
3955  [&](const local_ordinal_type &j) {
3956  const local_ordinal_type pri = pri0 + j;
3957  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
3958  const local_ordinal_type ri0 = ri0_vals[vi];
3959  const local_ordinal_type nrows = nrows_vals[vi];
3960  if (j < nrows) {
3961  for (local_ordinal_type col=0;col<num_vectors;++col) {
3962  for (local_ordinal_type i=0;i<blocksize;++i) {
3963  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
3964  impl_scalar_type &y = Y_scalar_multivector(row,col);
3965  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
3966  y += df*yd;
3967 
3968  {//if (compute_diff) {
3969  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
3970  z_partial_sum += yd_abs*yd_abs;
3971  }
3972  }
3973  }
3974  }
3975  }
3976  });
3977  }
3978  //if (compute_diff)
3979  Z_scalar_vector(member.league_rank()) += z_partial_sum;
3980  }
3981  }
3982 
3986  template<typename WWViewType>
3987  KOKKOS_INLINE_FUNCTION
3988  void
3989  solveSingleVector(const member_type &member,
3990  const local_ordinal_type &blocksize,
3991  const local_ordinal_type &i0,
3992  const local_ordinal_type &r0,
3993  const local_ordinal_type &nrows,
3994  const local_ordinal_type &v,
3995  const WWViewType &WW) const {
3996 
3997  typedef SolveTridiagsDefaultModeAndAlgo
3998  <typename execution_space::memory_space> default_mode_and_algo_type;
3999 
4000  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4001  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4002 
4003  // base pointers
4004  auto A = D_internal_vector_values.data();
4005  auto X = X_internal_vector_values.data();
4006 
4007  // constant
4008  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4009  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4010  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4011 
4012  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4013  const local_ordinal_type astep = D_internal_vector_values.stride_0();
4014  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
4015  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
4016  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
4017  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
4018 
4019  // move to starting point
4020  A += i0*astep + v;
4021  X += r0*xstep + v;
4022 
4023  //for (local_ordinal_type col=0;col<num_vectors;++col)
4024  if (nrows > 1) {
4025  // solve Lx = x
4026  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4027  (default_mode_type,default_algo_type,
4028  member,
4029  KB::Diag::Unit,
4030  blocksize,blocksize,
4031  one,
4032  A, as0, as1,
4033  X, xs0);
4034 
4035  for (local_ordinal_type tr=1;tr<nrows;++tr) {
4036  member.team_barrier();
4037  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4038  (default_mode_type,default_algo_type,
4039  member,
4040  blocksize, blocksize,
4041  -one,
4042  A+2*astep, as0, as1,
4043  X, xs0,
4044  one,
4045  X+1*xstep, xs0);
4046  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4047  (default_mode_type,default_algo_type,
4048  member,
4049  KB::Diag::Unit,
4050  blocksize,blocksize,
4051  one,
4052  A+3*astep, as0, as1,
4053  X+1*xstep, xs0);
4054 
4055  A += 3*astep;
4056  X += 1*xstep;
4057  }
4058 
4059  // solve Ux = x
4060  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4061  (default_mode_type,default_algo_type,
4062  member,
4063  KB::Diag::NonUnit,
4064  blocksize, blocksize,
4065  one,
4066  A, as0, as1,
4067  X, xs0);
4068 
4069  for (local_ordinal_type tr=nrows;tr>1;--tr) {
4070  A -= 3*astep;
4071  member.team_barrier();
4072  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4073  (default_mode_type,default_algo_type,
4074  member,
4075  blocksize, blocksize,
4076  -one,
4077  A+1*astep, as0, as1,
4078  X, xs0,
4079  one,
4080  X-1*xstep, xs0);
4081  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4082  (default_mode_type,default_algo_type,
4083  member,
4084  KB::Diag::NonUnit,
4085  blocksize, blocksize,
4086  one,
4087  A, as0, as1,
4088  X-1*xstep,xs0);
4089  X -= 1*xstep;
4090  }
4091  // for multiple rhs
4092  //X += xs1;
4093  } else {
4094  const local_ordinal_type ws0 = WW.stride_0();
4095  auto W = WW.data() + v;
4096  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
4097  (default_mode_type,
4098  member, blocksize, X, xs0, W, ws0);
4099  member.team_barrier();
4100  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4101  (default_mode_type,default_algo_type,
4102  member,
4103  blocksize, blocksize,
4104  one,
4105  A, as0, as1,
4106  W, xs0,
4107  zero,
4108  X, xs0);
4109  }
4110  }
4111 
4112  template<typename WWViewType>
4113  KOKKOS_INLINE_FUNCTION
4114  void
4115  solveMultiVector(const member_type &member,
4116  const local_ordinal_type &/* blocksize */,
4117  const local_ordinal_type &i0,
4118  const local_ordinal_type &r0,
4119  const local_ordinal_type &nrows,
4120  const local_ordinal_type &v,
4121  const WWViewType &WW) const {
4122 
4123  typedef SolveTridiagsDefaultModeAndAlgo
4124  <typename execution_space::memory_space> default_mode_and_algo_type;
4125 
4126  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4127  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4128 
4129  // constant
4130  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4131  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4132 
4133  // subview pattern
4134  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4135  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4136  auto X2 = X1;
4137 
4138  local_ordinal_type i = i0, r = r0;
4139 
4140 
4141  if (nrows > 1) {
4142  // solve Lx = x
4143  KB::Trsm<member_type,
4144  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4145  default_mode_type,default_algo_type>
4146  ::invoke(member, one, A, X1);
4147  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
4148  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
4149  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
4150  member.team_barrier();
4151  KB::Gemm<member_type,
4152  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4153  default_mode_type,default_algo_type>
4154  ::invoke(member, -one, A, X1, one, X2);
4155  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
4156  KB::Trsm<member_type,
4157  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4158  default_mode_type,default_algo_type>
4159  ::invoke(member, one, A, X2);
4160  X1.assign_data( X2.data() );
4161  }
4162 
4163  // solve Ux = x
4164  KB::Trsm<member_type,
4165  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4166  default_mode_type,default_algo_type>
4167  ::invoke(member, one, A, X1);
4168  for (local_ordinal_type tr=nrows;tr>1;--tr) {
4169  i -= 3;
4170  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
4171  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
4172  member.team_barrier();
4173  KB::Gemm<member_type,
4174  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4175  default_mode_type,default_algo_type>
4176  ::invoke(member, -one, A, X1, one, X2);
4177 
4178  A.assign_data( &D_internal_vector_values(i,0,0,v) );
4179  KB::Trsm<member_type,
4180  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4181  default_mode_type,default_algo_type>
4182  ::invoke(member, one, A, X2);
4183  X1.assign_data( X2.data() );
4184  }
4185  } else {
4186  // matrix is already inverted
4187  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4188  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
4189  ::invoke(member, X1, W);
4190  member.team_barrier();
4191  KB::Gemm<member_type,
4192  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4193  default_mode_type,default_algo_type>
4194  ::invoke(member, one, A, W, zero, X1);
4195  }
4196  }
4197 
4198  template<int B> struct SingleVectorTag {};
4199  template<int B> struct MultiVectorTag {};
4200 
4201  template<int B> struct SingleVectorSubLineTag {};
4202  template<int B> struct MultiVectorSubLineTag {};
4203  template<int B> struct SingleVectorApplyCTag {};
4204  template<int B> struct MultiVectorApplyCTag {};
4205  template<int B> struct SingleVectorSchurTag {};
4206  template<int B> struct MultiVectorSchurTag {};
4207  template<int B> struct SingleVectorApplyETag {};
4208  template<int B> struct MultiVectorApplyETag {};
4209  template<int B> struct SingleVectorCopyToFlatTag {};
4210  template<int B> struct SingleZeroingTag {};
4211 
4212  template<int B>
4213  KOKKOS_INLINE_FUNCTION
4214  void
4215  operator() (const SingleVectorTag<B> &, const member_type &member) const {
4216  const local_ordinal_type packidx = member.league_rank();
4217  const local_ordinal_type partidx = packptr(packidx);
4218  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4219  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4220  const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4221  const local_ordinal_type r0 = part2packrowidx0(partidx);
4222  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4223  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4224  const local_ordinal_type num_vectors = 1;
4225  internal_vector_scratch_type_3d_view
4226  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4227  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4228  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4229  });
4230  Kokkos::parallel_for
4231  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4232  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4233  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4234  });
4235  }
4236 
4237  template<int B>
4238  KOKKOS_INLINE_FUNCTION
4239  void
4240  operator() (const MultiVectorTag<B> &, const member_type &member) const {
4241  const local_ordinal_type packidx = member.league_rank();
4242  const local_ordinal_type partidx = packptr(packidx);
4243  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4244  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4245  const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4246  const local_ordinal_type r0 = part2packrowidx0(partidx);
4247  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4248  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4249  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4250 
4251  internal_vector_scratch_type_3d_view
4252  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4253  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4254  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4255  });
4256  Kokkos::parallel_for
4257  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4258  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4259  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4260  });
4261  }
4262 
4263  template<int B>
4264  KOKKOS_INLINE_FUNCTION
4265  void
4266  operator() (const SingleVectorSubLineTag<B> &, const member_type &member) const {
4267  // btdm is packed and sorted from largest one
4268  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4269 
4270  const local_ordinal_type subpartidx = packptr_sub(packidx);
4271  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4272  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4273  const local_ordinal_type partidx = subpartidx%n_parts;
4274 
4275  const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4276  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4277  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4278  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4279  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4280  const local_ordinal_type num_vectors = blocksize;
4281 
4282  //(void) i0;
4283  //(void) nrows;
4284  (void) npacks;
4285 
4286  internal_vector_scratch_type_3d_view
4287  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4288 
4289  Kokkos::parallel_for
4290  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4291  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4292  });
4293  }
4294 
4295  template<int B>
4296  KOKKOS_INLINE_FUNCTION
4297  void
4298  operator() (const SingleVectorApplyCTag<B> &, const member_type &member) const {
4299  // btdm is packed and sorted from largest one
4300  //const local_ordinal_type packidx = packindices_schur(member.league_rank());
4301  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4302 
4303  const local_ordinal_type subpartidx = packptr_sub(packidx);
4304  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4305  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4306  const local_ordinal_type partidx = subpartidx%n_parts;
4307  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4308 
4309  //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4310  const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4311  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4312  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4313 
4314  internal_vector_scratch_type_3d_view
4315  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4316 
4317  // Compute v_2 = v_2 - C v_1
4318 
4319  const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
4320  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
4321  const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
4322 
4323  (void) i0_schur;
4324  (void) i0_offset;
4325 
4326  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4327 
4328  const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0;
4329  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1;
4330 
4331  typedef SolveTridiagsDefaultModeAndAlgo
4332  <typename execution_space::memory_space> default_mode_and_algo_type;
4333 
4334  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4335  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4336 
4337  if (local_subpartidx == 0) {
4338  Kokkos::parallel_for
4339  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4340  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4341  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4342  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4343 
4344  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4345  (default_mode_type,default_algo_type,
4346  member,
4347  blocksize, blocksize,
4348  -one,
4349  C.data(), C.stride_0(), C.stride_1(),
4350  v_1.data(), v_1.stride_0(),
4351  one,
4352  v_2.data(), v_2.stride_0());
4353  });
4354  }
4355  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4356  Kokkos::parallel_for
4357  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4358  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4359  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4360  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4361 
4362  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4363  (default_mode_type,default_algo_type,
4364  member,
4365  blocksize, blocksize,
4366  -one,
4367  C.data(), C.stride_0(), C.stride_1(),
4368  v_1.data(), v_1.stride_0(),
4369  one,
4370  v_2.data(), v_2.stride_0());
4371  });
4372  }
4373  else {
4374  Kokkos::parallel_for
4375  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4376  {
4377  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4378  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4379  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4380 
4381  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4382  (default_mode_type,default_algo_type,
4383  member,
4384  blocksize, blocksize,
4385  -one,
4386  C.data(), C.stride_0(), C.stride_1(),
4387  v_1.data(), v_1.stride_0(),
4388  one,
4389  v_2.data(), v_2.stride_0());
4390  }
4391  {
4392  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4393  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4394  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4395 
4396  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4397  (default_mode_type,default_algo_type,
4398  member,
4399  blocksize, blocksize,
4400  -one,
4401  C.data(), C.stride_0(), C.stride_1(),
4402  v_1.data(), v_1.stride_0(),
4403  one,
4404  v_2.data(), v_2.stride_0());
4405  }
4406  });
4407  }
4408  }
4409 
4410  template<int B>
4411  KOKKOS_INLINE_FUNCTION
4412  void
4413  operator() (const SingleVectorSchurTag<B> &, const member_type &member) const {
4414  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4415 
4416  const local_ordinal_type partidx = packptr_sub(packidx);
4417 
4418  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4419 
4420  const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0);
4421  const local_ordinal_type nrows = 2*(n_subparts_per_part-1);
4422 
4423  const local_ordinal_type r0_schur = nrows * member.league_rank();
4424 
4425  internal_vector_scratch_type_3d_view
4426  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4427 
4428  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4429  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4430  for (local_ordinal_type i = 0; i < 2; ++i) {
4431  copy3DView<local_ordinal_type>(member,
4432  Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4433  Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4434  }
4435  }
4436 
4437  Kokkos::parallel_for
4438  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4439  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4440  });
4441 
4442  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4443  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4444  for (local_ordinal_type i = 0; i < 2; ++i) {
4445  copy3DView<local_ordinal_type>(member,
4446  Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4447  Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4448  }
4449  }
4450  }
4451 
4452  template<int B>
4453  KOKKOS_INLINE_FUNCTION
4454  void
4455  operator() (const SingleVectorApplyETag<B> &, const member_type &member) const {
4456  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4457 
4458  const local_ordinal_type subpartidx = packptr_sub(packidx);
4459  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4460  const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4461  const local_ordinal_type partidx = subpartidx%n_parts;
4462  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4463 
4464  const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4465  const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4466 
4467  internal_vector_scratch_type_3d_view
4468  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4469 
4470  // Compute v_2 = v_2 - C v_1
4471 
4472  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4473 
4474  typedef SolveTridiagsDefaultModeAndAlgo
4475  <typename execution_space::memory_space> default_mode_and_algo_type;
4476 
4477  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4478  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4479 
4480  if (local_subpartidx == 0) {
4481  Kokkos::parallel_for
4482  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4483 
4484  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4485 
4486  for (local_ordinal_type row = 0; row < nrows; ++row) {
4487  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4488  auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4489 
4490  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4491  (default_mode_type,default_algo_type,
4492  member,
4493  blocksize, blocksize,
4494  -one,
4495  E.data(), E.stride_0(), E.stride_1(),
4496  v_2.data(), v_2.stride_0(),
4497  one,
4498  v_1.data(), v_1.stride_0());
4499  }
4500  });
4501  }
4502  else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4503  Kokkos::parallel_for
4504  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4505  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4506 
4507  for (local_ordinal_type row = 0; row < nrows; ++row) {
4508  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4509  auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4510 
4511  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4512  (default_mode_type,default_algo_type,
4513  member,
4514  blocksize, blocksize,
4515  -one,
4516  E.data(), E.stride_0(), E.stride_1(),
4517  v_2.data(), v_2.stride_0(),
4518  one,
4519  v_1.data(), v_1.stride_0());
4520  }
4521  });
4522  }
4523  else {
4524  Kokkos::parallel_for
4525  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4526  {
4527  auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4528 
4529  for (local_ordinal_type row = 0; row < nrows; ++row) {
4530  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4531  auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4532 
4533  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4534  (default_mode_type,default_algo_type,
4535  member,
4536  blocksize, blocksize,
4537  -one,
4538  E.data(), E.stride_0(), E.stride_1(),
4539  v_2.data(), v_2.stride_0(),
4540  one,
4541  v_1.data(), v_1.stride_0());
4542  }
4543  }
4544  {
4545  auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4546 
4547  for (local_ordinal_type row = 0; row < nrows; ++row) {
4548  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4549  auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4550 
4551  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4552  (default_mode_type,default_algo_type,
4553  member,
4554  blocksize, blocksize,
4555  -one,
4556  E.data(), E.stride_0(), E.stride_1(),
4557  v_2.data(), v_2.stride_0(),
4558  one,
4559  v_1.data(), v_1.stride_0());
4560  }
4561  }
4562  });
4563  }
4564  }
4565 
4566  template<int B>
4567  KOKKOS_INLINE_FUNCTION
4568  void
4569  operator() (const SingleVectorCopyToFlatTag<B> &, const member_type &member) const {
4570  const local_ordinal_type packidx = member.league_rank();
4571  const local_ordinal_type partidx = packptr(packidx);
4572  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4573  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4574  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4575  const local_ordinal_type num_vectors = 1;
4576 
4577  Kokkos::parallel_for
4578  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4579  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4580  });
4581  }
4582 
4583  template<int B>
4584  KOKKOS_INLINE_FUNCTION
4585  void
4586  operator() (const SingleZeroingTag<B> &, const member_type &member) const {
4587  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4588  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4589  });
4590  }
4591 
4592  void run(const impl_scalar_type_2d_view_tpetra &Y,
4593  const impl_scalar_type_1d_view &Z) {
4594  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4595  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags");
4596 
4598  this->Y_scalar_multivector = Y;
4599  this->Z_scalar_vector = Z;
4600 
4601  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4602  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4603 
4604  const local_ordinal_type team_size =
4605  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4606  recommended_team_size(blocksize, vector_length, internal_vector_length);
4607  const int per_team_scratch = internal_vector_scratch_type_3d_view
4608  ::shmem_size(blocksize, num_vectors, vector_loop_size);
4609 
4610 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4611 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4612  if (num_vectors == 1) { \
4613  const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4614  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4615  Kokkos::parallel_for \
4616  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4617  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4618  } else { \
4619  const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4620  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4621  Kokkos::parallel_for \
4622  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4623  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4624  } break
4625 #else
4626 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4627  if (num_vectors == 1) { \
4628  if (packindices_schur.extent(1) <= 0) { \
4629  Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4630  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4631  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4632  Kokkos::parallel_for \
4633  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4634  policy, *this); \
4635  } \
4636  else { \
4637  { \
4638  \
4639  Kokkos::TeamPolicy<execution_space,SingleZeroingTag<B> > \
4640  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4641  Kokkos::parallel_for \
4642  ("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4643  policy, *this); \
4644  } \
4645  { \
4646  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag"); \
4647  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4648  Kokkos::TeamPolicy<execution_space,SingleVectorSubLineTag<B> > \
4649  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4650  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4651  Kokkos::parallel_for \
4652  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4653  policy, *this); \
4654  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4655  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4656  } \
4657  { \
4658  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag"); \
4659  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4660  Kokkos::TeamPolicy<execution_space,SingleVectorApplyCTag<B> > \
4661  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4662  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4663  Kokkos::parallel_for \
4664  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4665  policy, *this); \
4666  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4667  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4668  } \
4669  { \
4670  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag"); \
4671  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4672  Kokkos::TeamPolicy<execution_space,SingleVectorSchurTag<B> > \
4673  policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4674  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4675  Kokkos::parallel_for \
4676  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4677  policy, *this); \
4678  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4679  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4680  } \
4681  { \
4682  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag"); \
4683  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4684  Kokkos::TeamPolicy<execution_space,SingleVectorApplyETag<B> > \
4685  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4686  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4687  Kokkos::parallel_for \
4688  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4689  policy, *this); \
4690  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4691  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4692  } \
4693  { \
4694  \
4695  Kokkos::TeamPolicy<execution_space,SingleVectorCopyToFlatTag<B> > \
4696  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4697  Kokkos::parallel_for \
4698  ("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4699  policy, *this); \
4700  } \
4701  } \
4702  } else { \
4703  Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4704  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4705  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4706  Kokkos::parallel_for \
4707  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4708  policy, *this); \
4709  } break
4710 #endif
4711  switch (blocksize) {
4712  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
4713  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
4714  case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 6);
4715  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
4716  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4717  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4718  case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4719  case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4720  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4721  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4722  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4723  case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4724  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
4725  }
4726 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
4727 
4728  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4729  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4730  }
4731  };
4732 
4736  template<typename MatrixType>
4737  int
4739  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
4740  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
4741  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
4742  const bool overlap_communication_and_computation,
4743  // tpetra interface
4744  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
4745  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
4746  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
4747  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
4748  // local object interface
4749  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
4750  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
4751  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
4752  /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
4753  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
4754  // preconditioner parameters
4755  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
4756  /* */ bool is_y_zero,
4757  const int max_num_sweeps,
4758  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
4759  const int check_tol_every) {
4760  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi");
4761 
4762  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4763  using node_memory_space = typename impl_type::node_memory_space;
4764  using local_ordinal_type = typename impl_type::local_ordinal_type;
4765  using size_type = typename impl_type::size_type;
4766  using impl_scalar_type = typename impl_type::impl_scalar_type;
4767  using magnitude_type = typename impl_type::magnitude_type;
4768  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4769  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
4770  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4771  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
4772 
4773  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4774 
4775  // either tpetra importer or async importer must be active
4776  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
4777  "Neither Tpetra importer nor Async importer is null.");
4778  // max number of sweeps should be positive number
4779  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
4780  "Maximum number of sweeps must be >= 1.");
4781 
4782  // const parameters
4783  const bool is_seq_method_requested = !tpetra_importer.is_null();
4784  const bool is_async_importer_active = !async_importer.is_null();
4785  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4786  const magnitude_type tolerance = tol*tol;
4787  const local_ordinal_type blocksize = btdm.values.extent(1);
4788  const local_ordinal_type num_vectors = Y.getNumVectors();
4789  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4790 
4791  const impl_scalar_type zero(0.0);
4792 
4793  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4794  "The seq method for applyInverseJacobi, " <<
4795  "which in any case is for developer use only, " <<
4796  "does not support norm-based termination.");
4797  const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4798  Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4799  TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4800  std::invalid_argument,
4801  "The seq method for applyInverseJacobi, " <<
4802  "which in any case is for developer use only, " <<
4803  "only supports memory spaces accessible from host.");
4804 
4805  // if workspace is needed more, resize it
4806  const size_type work_span_required = num_blockrows*num_vectors*blocksize;
4807  if (work.span() < work_span_required)
4808  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
4809 
4810  // construct W
4811  const local_ordinal_type W_size = interf.packptr.extent(0)-1;
4812  if (local_ordinal_type(W.extent(0)) < W_size)
4813  W = impl_scalar_type_1d_view("W", W_size);
4814 
4815  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4816  {
4817  if (is_seq_method_requested) {
4818  if (Z.getNumVectors() != Y.getNumVectors())
4819  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
4820  } else {
4821  if (is_async_importer_active) {
4822  // create comm data buffer and keep it here
4823  async_importer->createDataBuffer(num_vectors);
4824  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4825  }
4826  }
4827  }
4828 
4829  // wrap the workspace with 3d view
4830  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4831  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4832  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4833  const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4834  if (is_y_zero) Kokkos::deep_copy(YY, zero);
4835 
4836  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
4837  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4838  damping_factor, is_norm_manager_active);
4839 
4840  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4841  BlockHelperDetails::ComputeResidualVector<MatrixType>
4842  compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf,
4843  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
4844 
4845  // norm manager workspace resize
4846  if (is_norm_manager_active)
4847  norm_manager.setCheckFrequency(check_tol_every);
4848 
4849  // iterate
4850  int sweep = 0;
4851  for (;sweep<max_num_sweeps;++sweep) {
4852  {
4853  if (is_y_zero) {
4854  // pmv := x(lclrow)
4855  multivector_converter.run(XX);
4856  } else {
4857  if (is_seq_method_requested) {
4858  // SEQ METHOD IS TESTING ONLY
4859 
4860  // y := x - R y
4861  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
4862  compute_residual_vector.run(YY, XX, ZZ);
4863 
4864  // pmv := y(lclrow).
4865  multivector_converter.run(YY);
4866  } else {
4867  // fused y := x - R y and pmv := y(lclrow);
4868  // real use case does not use overlap comp and comm
4869  if (overlap_communication_and_computation || !is_async_importer_active) {
4870  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
4871  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
4872  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
4873  if (is_async_importer_active) async_importer->cancel();
4874  break;
4875  }
4876  if (is_async_importer_active) {
4877  async_importer->syncRecv();
4878  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
4879  }
4880  } else {
4881  if (is_async_importer_active)
4882  async_importer->syncExchange(YY);
4883  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
4884  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
4885  }
4886  }
4887  }
4888  }
4889 
4890  // pmv := inv(D) pmv.
4891  {
4892  solve_tridiags.run(YY, W);
4893  }
4894  {
4895  if (is_norm_manager_active) {
4896  // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
4897  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
4898  if (sweep + 1 == max_num_sweeps) {
4899  norm_manager.ireduce(sweep, true);
4900  norm_manager.checkDone(sweep + 1, tolerance, true);
4901  } else {
4902  norm_manager.ireduce(sweep);
4903  }
4904  }
4905  }
4906  is_y_zero = false;
4907  }
4908 
4909  //sqrt the norms for the caller's use.
4910  if (is_norm_manager_active) norm_manager.finalize();
4911 
4912  return sweep;
4913  }
4914 
4915 
4916  template<typename MatrixType>
4917  struct ImplObject {
4918  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4919  using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
4920  using block_tridiags_type = BlockTridiags<MatrixType>;
4921  using amd_type = BlockHelperDetails::AmD<MatrixType>;
4922  using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
4923  using async_import_type = AsyncableImport<MatrixType>;
4924 
4925  // distructed objects
4928  Teuchos::RCP<async_import_type> async_importer;
4929  bool overlap_communication_and_computation;
4930 
4931  // copy of Y (mutable to penentrate const)
4932  mutable typename impl_type::tpetra_multivector_type Z;
4933  mutable typename impl_type::impl_scalar_type_1d_view W;
4934 
4935  // local objects
4936  part_interface_type part_interface;
4937  block_tridiags_type block_tridiags; // D
4938  amd_type a_minus_d; // R = A - D
4939  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
4940  mutable norm_manager_type norm_manager;
4941  };
4942 
4943  } // namespace BlockTriDiContainerDetails
4944 
4945 } // namespace Ifpack2
4946 
4947 #endif
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:173
BlockHelperDetails::PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const Teuchos::Array< Teuchos::Array< typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type > > &partitions, const typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type n_subparts_per_part_in)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1007
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
int applyInverseJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Z, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::vector_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:4738
size_type size() const
size_t size_type
Definition: Ifpack2_BlockHelper.hpp:294
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
BlockTridiags< MatrixType > createBlockTridiags(const BlockHelperDetails::PartInterface< MatrixType > &interf)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1578
Definition: Ifpack2_BlockHelper.hpp:388
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:129
void send(const Packet sendBuffer[], const Ordinal count, const int destRank, const int tag, const Comm< Ordinal > &comm)
void performNumericPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tiny)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3481
T * getRawPtr() const
void performSymbolicPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, BlockHelperDetails::AmD< MatrixType > &amd, const bool overlap_communication_and_computation)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1820
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockHelper.hpp:303
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:886
Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:196
Definition: Ifpack2_BlockHelper.hpp:229
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockHelper.hpp:359
RCP< CommRequest< Ordinal > > isend(const ArrayRCP< const Packet > &sendBuffer, const int destRank, const int tag, const Comm< Ordinal > &comm)
#define TEUCHOS_ASSERT(assertion_test)
Definition: Ifpack2_BlockHelper.hpp:256
Definition: Ifpack2_BlockHelper.hpp:290
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1523
Definition: Ifpack2_BlockComputeResidualVector.hpp:56
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3495