10 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
11 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
18 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
19 #include <Tpetra_Distributor.hpp>
20 #include <Tpetra_BlockMultiVector.hpp>
22 #if KOKKOS_VERSION >= 40799
23 #include <KokkosKernels_ArithTraits.hpp>
25 #include <Kokkos_ArithTraits.hpp>
27 #include <KokkosBatched_Util.hpp>
28 #include <KokkosBatched_Vector.hpp>
29 #include <KokkosBatched_Copy_Decl.hpp>
30 #include <KokkosBatched_Copy_Impl.hpp>
31 #include <KokkosBatched_AddRadial_Decl.hpp>
32 #include <KokkosBatched_AddRadial_Impl.hpp>
33 #include <KokkosBatched_SetIdentity_Decl.hpp>
34 #include <KokkosBatched_SetIdentity_Impl.hpp>
35 #include <KokkosBatched_Gemm_Decl.hpp>
36 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
37 #include <KokkosBatched_Gemm_Team_Impl.hpp>
38 #include <KokkosBatched_Gemv_Decl.hpp>
39 #include <KokkosBatched_Gemv_Team_Impl.hpp>
40 #include <KokkosBatched_Trsm_Decl.hpp>
41 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
42 #include <KokkosBatched_Trsm_Team_Impl.hpp>
43 #include <KokkosBatched_Trsv_Decl.hpp>
44 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
45 #include <KokkosBatched_Trsv_Team_Impl.hpp>
46 #include <KokkosBatched_LU_Decl.hpp>
47 #include <KokkosBatched_LU_Serial_Impl.hpp>
48 #include <KokkosBatched_LU_Team_Impl.hpp>
50 #include <KokkosBlas1_nrm1.hpp>
51 #include <KokkosBlas1_nrm2.hpp>
55 #include "Ifpack2_BlockHelper.hpp"
56 #include "Ifpack2_BlockComputeResidualVector.hpp"
57 #include "Ifpack2_BlockComputeResidualAndSolve.hpp"
62 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
63 #include "cuda_profiler_api.h"
68 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
76 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
80 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
83 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
84 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
88 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
92 namespace BlockTriDiContainerDetails {
94 namespace KB = KokkosBatched;
101 template <
typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
102 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
103 MemoryTraitsType::is_random_access |
106 template <
typename ViewType>
107 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
108 typename ViewType::array_layout,
109 typename ViewType::device_type,
110 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
111 template <
typename ViewType>
112 using Atomic = Kokkos::View<
typename ViewType::data_type,
113 typename ViewType::array_layout,
114 typename ViewType::device_type,
115 MemoryTraits<typename ViewType::memory_traits, Kokkos::Atomic>>;
116 template <
typename ViewType>
117 using Const = Kokkos::View<
typename ViewType::const_data_type,
118 typename ViewType::array_layout,
119 typename ViewType::device_type,
120 typename ViewType::memory_traits>;
121 template <
typename ViewType>
122 using ConstUnmanaged = Const<Unmanaged<ViewType>>;
124 template <
typename ViewType>
125 using AtomicUnmanaged = Atomic<Unmanaged<ViewType>>;
127 template <
typename ViewType>
128 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
129 typename ViewType::array_layout,
130 typename ViewType::device_type,
131 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
133 template <
typename ViewType>
134 using Scratch = Kokkos::View<
typename ViewType::data_type,
135 typename ViewType::array_layout,
136 typename ViewType::execution_space::scratch_memory_space,
137 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
142 template <
typename T>
146 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
154 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
155 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
156 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
158 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
159 { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStop()); }
161 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
163 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
169 template <
typename MatrixType>
172 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
174 using tpetra_map_type =
typename impl_type::tpetra_map_type;
175 using tpetra_mv_type =
typename impl_type::tpetra_block_multivector_type;
176 using tpetra_import_type =
typename impl_type::tpetra_import_type;
177 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
178 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
180 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A);
181 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A);
183 bool hasBlockCrsMatrix = !A_bcrs.is_null();
186 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
188 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
189 const auto src =
Teuchos::rcp(
new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
190 const auto tgt =
Teuchos::rcp(
new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap(), blocksize)));
191 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
200 template <
typename MatrixType>
201 struct AsyncableImport {
209 #if !defined(HAVE_IFPACK2_MPI)
210 typedef int MPI_Request;
211 typedef int MPI_Comm;
213 using scalar_type =
typename impl_type::scalar_type;
217 static int isend(
const MPI_Comm comm,
const char *buf,
int count,
int dest,
int tag, MPI_Request *ireq) {
218 #ifdef HAVE_IFPACK2_MPI
220 int ret = MPI_Isend(const_cast<char *>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
221 if (ireq == NULL) MPI_Request_free(&ureq);
228 static int irecv(
const MPI_Comm comm,
char *buf,
int count,
int src,
int tag, MPI_Request *ireq) {
229 #ifdef HAVE_IFPACK2_MPI
231 int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
232 if (ireq == NULL) MPI_Request_free(&ureq);
239 static int waitany(
int count, MPI_Request *reqs,
int *index) {
240 #ifdef HAVE_IFPACK2_MPI
241 return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
247 static int waitall(
int count, MPI_Request *reqs) {
248 #ifdef HAVE_IFPACK2_MPI
249 return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
256 using tpetra_map_type =
typename impl_type::tpetra_map_type;
257 using tpetra_import_type =
typename impl_type::tpetra_import_type;
259 using local_ordinal_type =
typename impl_type::local_ordinal_type;
260 using global_ordinal_type =
typename impl_type::global_ordinal_type;
264 using int_1d_view_host = Kokkos::View<int *, Kokkos::HostSpace>;
265 using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type *, Kokkos::HostSpace>;
267 using execution_space =
typename impl_type::execution_space;
268 using memory_space =
typename impl_type::memory_space;
269 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
271 using size_type_1d_view_host = Kokkos::View<size_type *, Kokkos::HostSpace>;
273 #if defined(KOKKOS_ENABLE_CUDA)
274 using impl_scalar_type_1d_view =
275 typename std::conditional<std::is_same<execution_space, Kokkos::Cuda>::value,
276 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
277 Kokkos::View<impl_scalar_type *, Kokkos::CudaHostPinnedSpace>,
278 #elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
279 Kokkos::View<impl_scalar_type *, Kokkos::CudaSpace>,
280 #else // no experimental macros are defined
281 typename impl_type::impl_scalar_type_1d_view,
283 typename impl_type::impl_scalar_type_1d_view>::type;
285 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
287 using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type *, Kokkos::HostSpace>;
288 using impl_scalar_type_2d_view =
typename impl_type::impl_scalar_type_2d_view;
289 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
291 #ifdef HAVE_IFPACK2_MPI
295 impl_scalar_type_2d_view_tpetra remote_multivector;
296 local_ordinal_type blocksize;
298 template <
typename T>
299 struct SendRecvPair {
304 SendRecvPair<int_1d_view_host> pids;
305 SendRecvPair<std::vector<MPI_Request>> reqs;
306 SendRecvPair<size_type_1d_view> offset;
307 SendRecvPair<size_type_1d_view_host> offset_host;
308 SendRecvPair<local_ordinal_type_1d_view> lids;
309 SendRecvPair<impl_scalar_type_1d_view> buffer;
310 SendRecvPair<impl_scalar_type_1d_view_host> buffer_host;
312 local_ordinal_type_1d_view dm2cm;
314 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
315 using exec_instance_1d_std_vector = std::vector<execution_space>;
316 exec_instance_1d_std_vector exec_instances;
322 const size_type_1d_view &offs) {
324 Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(const_cast<size_t *>(lens.
getRawPtr()), lens.
size());
325 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
328 const Kokkos::RangePolicy<execution_space> policy(0, offs.extent(0));
329 const local_ordinal_type lens_size = lens_device.extent(0);
330 Kokkos::parallel_scan(
331 "AsyncableImport::RangePolicy::setOffsetValues",
332 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
335 update += (i < lens_size ? lens_device[i] : 0);
340 const size_type_1d_view_host &offs) {
342 Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(const_cast<size_t *>(lens.
getRawPtr()), lens.
size());
343 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
347 for (local_ordinal_type i = 1, iend = offs.extent(0); i < iend; ++i) {
348 offs(i) = offs(i - 1) + lens[i - 1];
353 void createMpiRequests(
const tpetra_import_type &
import) {
354 Tpetra::Distributor &distributor =
import.getDistributor();
357 const auto pids_from = distributor.getProcsFrom();
359 memcpy(pids.recv.data(), pids_from.getRawPtr(),
sizeof(int) * pids.recv.extent(0));
361 const auto pids_to = distributor.getProcsTo();
363 memcpy(pids.send.data(), pids_to.getRawPtr(),
sizeof(int) * pids.send.extent(0));
366 reqs.recv.resize(pids.recv.extent(0));
367 memset(reqs.recv.data(), 0, reqs.recv.size() *
sizeof(MPI_Request));
368 reqs.send.resize(pids.send.extent(0));
369 memset(reqs.send.data(), 0, reqs.send.size() *
sizeof(MPI_Request));
373 const auto lengths_to = distributor.getLengthsTo();
376 const auto lengths_from = distributor.getLengthsFrom();
379 setOffsetValues(lengths_to, offset.send);
380 offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
382 setOffsetValues(lengths_from, offset.recv);
383 offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
385 const auto lengths_to = distributor.getLengthsTo();
386 offset_host.send = size_type_1d_view_host(
do_not_initialize_tag(
"offset send"), lengths_to.size() + 1);
388 const auto lengths_from = distributor.getLengthsFrom();
389 offset_host.recv = size_type_1d_view_host(
do_not_initialize_tag(
"offset recv"), lengths_from.size() + 1);
391 setOffsetValuesHost(lengths_to, offset_host.send);
394 setOffsetValuesHost(lengths_from, offset_host.recv);
399 void createSendRecvIDs(
const tpetra_import_type &
import) {
401 const auto remote_lids =
import.getRemoteLIDs();
402 const local_ordinal_type_1d_view_host
403 remote_lids_view_host(const_cast<local_ordinal_type *>(remote_lids.getRawPtr()), remote_lids.size());
405 Kokkos::deep_copy(lids.recv, remote_lids_view_host);
408 auto epids =
import.getExportPIDs();
409 auto elids =
import.getExportLIDs();
412 auto lids_send_host = Kokkos::create_mirror_view(lids.send);
415 for (local_ordinal_type cnt = 0, i = 0, iend = pids.send.extent(0); i < iend; ++i) {
416 const auto pid_send_value = pids.send[i];
417 for (local_ordinal_type j = 0, jend = epids.size(); j < jend; ++j)
418 if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
419 TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i + 1]);
421 Kokkos::deep_copy(lids.send, lids_send_host);
424 void createExecutionSpaceInstances() {
425 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
427 #if KOKKOS_VERSION >= 40699
429 Kokkos::Experimental::partition_space(execution_space(), std::vector<int>(8, 1));
432 Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
440 struct ToMultiVector {};
444 const local_ordinal_type blocksize_,
445 const local_ordinal_type_1d_view dm2cm_) {
446 blocksize = blocksize_;
449 #ifdef HAVE_IFPACK2_MPI
450 comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
452 const tpetra_import_type
import(src_map, tgt_map);
454 createMpiRequests(
import);
455 createSendRecvIDs(
import);
456 createExecutionSpaceInstances();
459 void createDataBuffer(
const local_ordinal_type &num_vectors) {
460 const size_type extent_0 = lids.recv.extent(0) * blocksize;
461 const size_type extent_1 = num_vectors;
462 if (remote_multivector.extent(0) == extent_0 &&
463 remote_multivector.extent(1) == extent_1) {
469 const auto send_buffer_size = offset_host.send[offset_host.send.extent(0) - 1] * blocksize * num_vectors;
470 const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0) - 1] * blocksize * num_vectors;
475 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
476 buffer_host.send = impl_scalar_type_1d_view_host(
do_not_initialize_tag(
"buffer send"), send_buffer_size);
477 buffer_host.recv = impl_scalar_type_1d_view_host(
do_not_initialize_tag(
"buffer recv"), recv_buffer_size);
483 #ifdef HAVE_IFPACK2_MPI
484 waitall(reqs.recv.size(), reqs.recv.data());
485 waitall(reqs.send.size(), reqs.send.data());
493 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
494 template <
typename PackTag>
495 static void copy(
const local_ordinal_type_1d_view &lids_,
496 const impl_scalar_type_1d_view &buffer_,
497 const local_ordinal_type ibeg_,
498 const local_ordinal_type iend_,
499 const impl_scalar_type_2d_view_tpetra &multivector_,
500 const local_ordinal_type blocksize_,
501 const execution_space &exec_instance_) {
502 const local_ordinal_type num_vectors = multivector_.extent(1);
503 const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
504 const local_ordinal_type idiff = iend_ - ibeg_;
505 const auto abase = buffer_.data() + mv_blocksize * ibeg_;
507 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
508 local_ordinal_type vector_size(0);
511 else if (blocksize_ <= 8)
513 else if (blocksize_ <= 16)
518 const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
519 const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
520 Kokkos::parallel_for(
521 Kokkos::Experimental::require(policy, work_item_property),
522 KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
523 const local_ordinal_type i = member.league_rank();
524 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](
const local_ordinal_type &j) {
525 auto aptr = abase + blocksize_ * (i + idiff * j);
526 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
527 if (std::is_same<PackTag, ToBuffer>::value)
528 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
532 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
539 void asyncSendRecvVar1(
const impl_scalar_type_2d_view_tpetra &mv) {
540 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
542 #ifdef HAVE_IFPACK2_MPI
544 const local_ordinal_type num_vectors = mv.extent(1);
545 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
548 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
549 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
551 reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
552 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
558 reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
559 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
567 execution_space().fence();
570 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
572 if (i < 8) exec_instances[i % 8].fence();
573 copy<ToBuffer>(lids.send, buffer.send,
574 offset_host.send(i), offset_host.send(i + 1),
577 exec_instances[i % 8]);
578 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
580 const local_ordinal_type num_vectors = mv.extent(1);
581 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
583 Kokkos::deep_copy(exec_instances[i % 8],
584 Kokkos::subview(buffer_host.send,
585 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
586 offset_host.send(i) * mv_blocksize,
587 offset_host.send(i + 1) * mv_blocksize)),
588 Kokkos::subview(buffer.send,
589 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
590 offset_host.send(i) * mv_blocksize,
591 offset_host.send(i + 1) * mv_blocksize)));
596 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
598 if (i < 8) exec_instances[i % 8].fence();
599 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
601 reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
602 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
608 reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
609 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
617 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
620 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
622 #endif // HAVE_IFPACK2_MPI
623 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
626 void syncRecvVar1() {
627 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
628 #ifdef HAVE_IFPACK2_MPI
630 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.recv.extent(0)); ++i) {
631 local_ordinal_type idx = i;
634 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
636 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
637 const local_ordinal_type num_vectors = remote_multivector.extent(1);
638 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
641 Kokkos::subview(buffer.recv,
642 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
643 offset_host.recv(idx) * mv_blocksize,
644 offset_host.recv(idx + 1) * mv_blocksize)),
645 Kokkos::subview(buffer_host.recv,
646 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
647 offset_host.recv(idx) * mv_blocksize,
648 offset_host.recv(idx + 1) * mv_blocksize)));
652 copy<ToMultiVector>(lids.recv, buffer.recv,
653 offset_host.recv(idx), offset_host.recv(idx + 1),
654 remote_multivector, blocksize,
655 exec_instances[idx % 8]);
662 waitall(reqs.send.size(), reqs.send.data());
663 #endif // HAVE_IFPACK2_MPI
664 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
666 #endif // defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
673 template <
typename PackTag>
674 static void copy(
const local_ordinal_type_1d_view &lids_,
675 const impl_scalar_type_1d_view &buffer_,
676 const local_ordinal_type &ibeg_,
677 const local_ordinal_type &iend_,
678 const impl_scalar_type_2d_view_tpetra &multivector_,
679 const local_ordinal_type blocksize_) {
680 const local_ordinal_type num_vectors = multivector_.extent(1);
681 const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
682 const local_ordinal_type idiff = iend_ - ibeg_;
683 const auto abase = buffer_.data() + mv_blocksize * ibeg_;
684 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
685 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
686 local_ordinal_type vector_size(0);
689 else if (blocksize_ <= 8)
691 else if (blocksize_ <= 16)
695 const team_policy_type policy(idiff, 1, vector_size);
696 Kokkos::parallel_for(
697 "AsyncableImport::TeamPolicy::copy",
698 policy, KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
699 const local_ordinal_type i = member.league_rank();
700 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](
const local_ordinal_type &j) {
701 auto aptr = abase + blocksize_ * (i + idiff * j);
702 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
703 if (std::is_same<PackTag, ToBuffer>::value)
704 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
708 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
714 const Kokkos::RangePolicy<execution_space> policy(0, idiff * num_vectors);
715 Kokkos::parallel_for(
716 "AsyncableImport::RangePolicy::copy",
717 policy, KOKKOS_LAMBDA(
const local_ordinal_type &ij) {
718 const local_ordinal_type i = ij % idiff;
719 const local_ordinal_type j = ij / idiff;
720 auto aptr = abase + blocksize_ * (i + idiff * j);
721 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
722 auto from = std::is_same<PackTag, ToBuffer>::value ? bptr : aptr;
723 auto to = std::is_same<PackTag, ToBuffer>::value ? aptr : bptr;
724 memcpy(to, from,
sizeof(impl_scalar_type) * blocksize_);
732 void asyncSendRecvVar0(
const impl_scalar_type_2d_view_tpetra &mv) {
733 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
735 #ifdef HAVE_IFPACK2_MPI
737 const local_ordinal_type num_vectors = mv.extent(1);
738 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
741 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
742 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
744 reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
745 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
751 reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
752 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
760 for (local_ordinal_type i = 0, iend = pids.send.extent(0); i < iend; ++i) {
761 copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i + 1),
764 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
766 reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
767 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
773 Kokkos::subview(buffer_host.send,
774 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
775 offset_host.send(i) * mv_blocksize,
776 offset_host.send(i + 1) * mv_blocksize)),
777 Kokkos::subview(buffer.send,
778 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
779 offset_host.send(i) * mv_blocksize,
780 offset_host.send(i + 1) * mv_blocksize)));
782 reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
783 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
792 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
795 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
798 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
801 void syncRecvVar0() {
802 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
803 #ifdef HAVE_IFPACK2_MPI
805 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
806 local_ordinal_type idx = i;
807 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
808 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
809 const local_ordinal_type num_vectors = remote_multivector.extent(1);
810 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
812 Kokkos::subview(buffer.recv,
813 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
814 offset_host.recv(idx) * mv_blocksize,
815 offset_host.recv(idx + 1) * mv_blocksize)),
816 Kokkos::subview(buffer_host.recv,
817 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
818 offset_host.recv(idx) * mv_blocksize,
819 offset_host.recv(idx + 1) * mv_blocksize)));
821 copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx + 1),
822 remote_multivector, blocksize);
825 waitall(reqs.send.size(), reqs.send.data());
827 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
833 void asyncSendRecv(
const impl_scalar_type_2d_view_tpetra &mv) {
834 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
835 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
836 asyncSendRecvVar1(mv);
838 asyncSendRecvVar0(mv);
841 asyncSendRecvVar0(mv);
845 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
846 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
856 void syncExchange(
const impl_scalar_type_2d_view_tpetra &mv) {
857 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
860 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
863 impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView()
const {
return remote_multivector; }
866 template <
typename ViewType1,
typename ViewType2>
867 struct are_same_struct {
871 are_same_struct(ViewType1 keys1_, ViewType2 keys2_)
874 KOKKOS_INLINE_FUNCTION
875 void operator()(
int i,
unsigned int &count)
const {
876 if (keys1(i) != keys2(i)) count++;
880 template <
typename ViewType1,
typename ViewType2>
881 bool are_same(ViewType1 keys1, ViewType2 keys2) {
882 unsigned int are_same_ = 0;
884 Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
885 are_same_struct(keys1, keys2),
887 return are_same_ == 0;
893 template <
typename MatrixType>
898 using tpetra_map_type =
typename impl_type::tpetra_map_type;
899 using local_ordinal_type =
typename impl_type::local_ordinal_type;
900 using global_ordinal_type =
typename impl_type::global_ordinal_type;
901 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
902 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
903 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
904 using global_indices_array_device_type = Kokkos::View<const global_ordinal_type *, typename tpetra_map_type::device_type>;
906 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A);
907 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A);
909 bool hasBlockCrsMatrix = !A_bcrs.is_null();
912 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
914 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
915 const auto domain_map = g.getDomainMap();
916 const auto column_map = g.getColMap();
918 std::vector<global_ordinal_type> gids;
920 Kokkos::Subview<global_indices_array_device_type, std::pair<int, int>> column_map_global_iD_last;
922 bool separate_remotes =
true, found_first =
false, need_owned_permutation =
false;
924 IFPACK2_BLOCKHELPER_TIMER(
"createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);
926 global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
927 global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();
929 if (are_same(domain_map_global_iD, column_map_global_iD)) {
931 separate_remotes =
true;
932 need_owned_permutation =
false;
934 column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
935 std::pair<int, int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
938 for (
size_t i = 0; i < column_map->getLocalNumElements(); ++i) {
939 const global_ordinal_type gid = column_map->getGlobalElement(i);
940 if (!domain_map->isNodeGlobalElement(gid)) {
943 }
else if (found_first) {
944 separate_remotes =
false;
947 if (!found_first && !need_owned_permutation &&
948 domain_map->getLocalElement(gid) !=
static_cast<local_ordinal_type
>(i)) {
957 need_owned_permutation =
true;
961 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
964 if (separate_remotes) {
965 IFPACK2_BLOCKHELPER_TIMER(
"createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
967 const auto parsimonious_col_map = need_owned_permutation ?
Teuchos::rcp(
new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())) :
Teuchos::rcp(
new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
968 if (parsimonious_col_map->getGlobalNumElements() > 0) {
970 local_ordinal_type_1d_view dm2cm;
971 if (need_owned_permutation) {
972 dm2cm = local_ordinal_type_1d_view(
do_not_initialize_tag(
"dm2cm"), domain_map->getLocalNumElements());
973 const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
974 for (
size_t i = 0; i < domain_map->getLocalNumElements(); ++i)
975 dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
976 Kokkos::deep_copy(dm2cm, dm2cm_host);
978 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
979 return Teuchos::rcp(
new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
982 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
983 return Teuchos::null;
986 template <
typename local_ordinal_type>
987 local_ordinal_type costTRSM(
const local_ordinal_type block_size) {
988 return block_size * block_size;
991 template <
typename local_ordinal_type>
992 local_ordinal_type costGEMV(
const local_ordinal_type block_size) {
993 return 2 * block_size * block_size;
996 template <
typename local_ordinal_type>
997 local_ordinal_type costTriDiagSolve(
const local_ordinal_type subline_length,
const local_ordinal_type block_size) {
998 return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length - 1) * costGEMV(block_size);
1001 template <
typename local_ordinal_type>
1002 local_ordinal_type costSolveSchur(
const local_ordinal_type num_parts,
1003 const local_ordinal_type num_teams,
1004 const local_ordinal_type line_length,
1005 const local_ordinal_type block_size,
1006 const local_ordinal_type n_subparts_per_part) {
1007 const local_ordinal_type subline_length = ceil(
double(line_length - (n_subparts_per_part - 1) * 2) / n_subparts_per_part);
1008 if (subline_length < 1) {
1012 const local_ordinal_type p_n_lines = ceil(
double(num_parts) / num_teams);
1013 const local_ordinal_type p_n_sublines = ceil(
double(n_subparts_per_part) * num_parts / num_teams);
1014 const local_ordinal_type p_n_sublines_2 = ceil(
double(n_subparts_per_part - 1) * num_parts / num_teams);
1016 const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
1017 const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part - 1) * 2, block_size);
1018 const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length, block_size);
1019 const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
1021 if (n_subparts_per_part == 1) {
1022 return p_costApplyAinv;
1024 return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
1027 template <
typename local_ordinal_type>
1028 local_ordinal_type getAutomaticNSubparts(
const local_ordinal_type num_parts,
1029 const local_ordinal_type num_teams,
1030 const local_ordinal_type line_length,
1031 const local_ordinal_type block_size) {
1038 double parallelismSurplus = Kokkos::sqrt((
double)num_teams / num_parts);
1039 double logLineLength = Kokkos::log2((
double)line_length);
1040 (void)logLineLength;
1042 #if defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU)
1044 double modeled = -9.2312 + 4.6946 * parallelismSurplus + 0.4095 * block_size + 0.966 * logLineLength;
1046 if (parallelismSurplus < 0.3)
1048 #elif defined(KOKKOS_ARCH_HOPPER) || defined(KOKKOS_ARCH_BLACKWELL)
1050 double modeled = -9.6053 + 4.7477 * parallelismSurplus + 0.2338 * block_size + 1.0794 * logLineLength;
1052 double maxSplit = (double)line_length / 8;
1053 if (modeled > maxSplit)
1055 #elif defined(KOKKOS_ENABLE_CUDA)
1059 if (parallelismSurplus > 1 && line_length > 64)
1061 #elif defined(KOKKOS_ENABLE_HIP)
1063 double modeled = -8.6214 + 7.3468 * parallelismSurplus + 0.3596 * block_size + 0.6673 * logLineLength;
1067 if (parallelismSurplus > 1 && line_length > 64)
1072 local_ordinal_type n_subparts_per_part = 0.5 + modeled;
1074 if (parallelismSurplus < 0.3)
1075 n_subparts_per_part = 1;
1082 local_ordinal_type min_subparts_per_part = 1;
1083 local_ordinal_type max_subparts_per_part = (line_length + 2) / 3;
1085 if (max_subparts_per_part > 16)
1086 max_subparts_per_part = 16;
1087 if (n_subparts_per_part < min_subparts_per_part)
1088 n_subparts_per_part = min_subparts_per_part;
1089 if (n_subparts_per_part > max_subparts_per_part)
1090 n_subparts_per_part = max_subparts_per_part;
1091 return n_subparts_per_part;
1094 template <
typename ArgActiveExecutionMemorySpace>
1095 struct SolveTridiagsDefaultModeAndAlgo;
1100 template <
typename MatrixType>
1101 BlockHelperDetails::PartInterface<MatrixType>
1103 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
1105 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1108 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1109 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1110 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
1111 using size_type =
typename impl_type::size_type;
1113 auto bA = Teuchos::rcp_dynamic_cast<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
1116 const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
1117 constexpr
int vector_length = impl_type::vector_length;
1118 constexpr
int internal_vector_length = impl_type::internal_vector_length;
1120 const auto comm = A->getRowMap()->getComm();
1122 BlockHelperDetails::PartInterface<MatrixType> interf;
1124 const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
1125 const bool jacobi = partitions.size() == 0 || partitions.size() == A_n_lclrows;
1126 const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1128 typedef std::pair<local_ordinal_type, local_ordinal_type> size_idx_pair_type;
1129 std::vector<size_idx_pair_type> partsz(nparts);
1132 for (local_ordinal_type i = 0; i < nparts; ++i)
1133 partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1134 std::sort(partsz.begin(), partsz.end(),
1135 [](
const size_idx_pair_type &x,
const size_idx_pair_type &y) {
1136 return x.first > y.first;
1140 local_ordinal_type n_subparts_per_part;
1142 n_subparts_per_part = 1;
1144 if (n_subparts_per_part_in == -1) {
1147 using execution_space =
typename impl_type::execution_space;
1150 if constexpr (impl_type::node_type::is_gpu) {
1151 const int line_length = partsz[0].first;
1153 const local_ordinal_type team_size =
1154 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1155 recommended_team_size(blocksize, vector_length, internal_vector_length);
1157 const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
1158 n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1159 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1160 printf(
"Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1163 n_subparts_per_part = 1;
1164 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1165 printf(
"Automatically chosen n_subparts_per_part = 1 for CPU backend\n");
1169 n_subparts_per_part = n_subparts_per_part_in;
1174 const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1177 const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part - 1);
1179 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1180 local_ordinal_type nrows = 0;
1184 for (local_ordinal_type i = 0; i < nparts; ++i) nrows += partitions[i].size();
1186 TEUCHOS_TEST_FOR_EXCEPT_MSG(nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) <<
"The #rows implied by the local partition is not "
1187 <<
"the same as getLocalNumRows: " << nrows <<
" vs " << A_n_lclrows);
1191 std::vector<local_ordinal_type> p;
1193 interf.max_partsz = 1;
1194 interf.max_subpartsz = 0;
1195 interf.n_subparts_per_part = 1;
1196 interf.nparts = nparts;
1201 for (local_ordinal_type i = 0; i < nparts; ++i)
1202 p[i] = partsz[i].second;
1204 interf.max_partsz = partsz[0].first;
1206 constexpr local_ordinal_type connection_length = 2;
1207 const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1208 const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1210 interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1211 interf.n_subparts_per_part = n_subparts_per_part;
1212 interf.nparts = nparts;
1218 interf.part2rowidx0 = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2rowidx0"), nparts + 1);
1219 interf.part2packrowidx0 = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2packrowidx0"), nparts + 1);
1222 interf.part2rowidx0_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1223 interf.part2packrowidx0_sub = local_ordinal_type_2d_view(
do_not_initialize_tag(
"part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1224 interf.rowidx2part_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"rowidx2part"), A_n_lclrows);
1226 interf.partptr_sub = local_ordinal_type_2d_view(
do_not_initialize_tag(
"partptr_sub"), n_sub_parts_and_schur, 2);
1229 const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1230 const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1232 const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1233 const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1234 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1235 const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1237 const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1238 const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1239 const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1242 interf.row_contiguous =
true;
1244 part2rowidx0(0) = 0;
1245 part2packrowidx0(0) = 0;
1246 local_ordinal_type pack_nrows = 0;
1247 local_ordinal_type pack_nrows_sub = 0;
1249 IFPACK2_BLOCKHELPER_TIMER(
"compute part indices (Jacobi)", Jacobi);
1253 for (local_ordinal_type i = 0; i <= nparts; ++i) {
1254 part2rowidx0(i) = i;
1257 for (local_ordinal_type i = 0; i < nparts; ++i) {
1261 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1263 if (ip % vector_length == 0) pack_nrows = 1;
1264 part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1266 part2rowidx0_sub(0) = 0;
1267 partptr_sub(0, 0) = 0;
1269 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1270 constexpr local_ordinal_type ipnrows = 1;
1271 const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1274 "In the part " << ip);
1276 constexpr local_ordinal_type connection_length = 2;
1278 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1280 "The part " << ip <<
" is too short to use " << n_subparts_per_part <<
" sub parts.");
1282 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1283 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1285 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1287 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1288 const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1289 const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1290 if (local_sub_ip != n_subparts_per_part - 1) {
1291 if (local_sub_ip != 0) {
1292 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1293 }
else if (ip != 0) {
1294 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1296 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1297 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1298 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1300 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1301 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1303 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1304 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1305 printf(
"Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1308 if (local_sub_ip != 0) {
1309 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1310 }
else if (ip != 0) {
1311 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1313 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1315 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1317 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1318 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1324 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1325 std::cout <<
"partptr_sub = " << std::endl;
1326 for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1327 for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1328 std::cout << partptr_sub(i, j) <<
" ";
1330 std::cout << std::endl;
1332 std::cout <<
"partptr_sub end" << std::endl;
1336 local_ordinal_type npacks = ceil(
float(nparts) / vector_length);
1338 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1339 for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1340 part2packrowidx0_sub(ip, 0) = 0;
1342 for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1344 local_ordinal_type ip_min = ipack * vector_length;
1345 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1346 for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1347 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1351 for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1352 local_ordinal_type ip_min = ipack * vector_length;
1353 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1355 const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1357 constexpr local_ordinal_type connection_length = 2;
1359 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1360 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1362 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1363 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1364 if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1366 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1368 for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1369 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1374 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1376 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1378 IFPACK2_BLOCKHELPER_TIMER(
"compute part indices", indices);
1379 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1380 const auto *part = &partitions[p[ip]];
1381 const local_ordinal_type ipnrows = part->size();
1382 TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip - 1]].size())));
1384 BlockHelperDetails::get_msg_prefix(comm)
1385 <<
"partition " << p[ip]
1386 <<
" is empty, which is not allowed.");
1388 part2rowidx0(ip + 1) = part2rowidx0(ip) + ipnrows;
1391 if (ip % vector_length == 0) pack_nrows = ipnrows;
1392 part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1393 const local_ordinal_type offset = partptr(ip);
1394 for (local_ordinal_type i = 0; i < ipnrows; ++i) {
1395 const auto lcl_row = (*part)[i];
1397 BlockHelperDetails::get_msg_prefix(comm)
1398 <<
"partitions[" << p[ip] <<
"]["
1399 << i <<
"] = " << lcl_row
1400 <<
" but input matrix implies limits of [0, " << A_n_lclrows - 1
1402 lclrow(offset + i) = lcl_row;
1403 rowidx2part(offset + i) = ip;
1404 if (interf.row_contiguous && offset + i > 0 && lclrow((offset + i) - 1) + 1 != lcl_row)
1405 interf.row_contiguous =
false;
1407 partptr(ip + 1) = offset + ipnrows;
1409 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1410 printf(
"Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1411 printf(
"partptr(%d+1) = %d\n", ip, partptr(ip + 1));
1415 part2rowidx0_sub(0) = 0;
1416 partptr_sub(0, 0) = 0;
1419 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1420 const auto *part = &partitions[p[ip]];
1421 const local_ordinal_type ipnrows = part->size();
1422 const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1425 "In the part " << ip);
1427 constexpr local_ordinal_type connection_length = 2;
1429 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1431 "The part " << ip <<
" is too short to use " << n_subparts_per_part <<
" sub parts.");
1433 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1434 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1436 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1438 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1439 const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1440 const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1441 if (local_sub_ip != n_subparts_per_part - 1) {
1442 if (local_sub_ip != 0) {
1443 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1444 }
else if (ip != 0) {
1445 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1447 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1448 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1449 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1451 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1452 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1454 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1455 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1456 printf(
"Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1459 if (local_sub_ip != 0) {
1460 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1461 }
else if (ip != 0) {
1462 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1464 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1466 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1468 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1469 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1476 local_ordinal_type npacks = ceil(
float(nparts) / vector_length);
1478 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1479 for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1480 part2packrowidx0_sub(ip, 0) = 0;
1482 for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1484 local_ordinal_type ip_min = ipack * vector_length;
1485 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1486 for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1487 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1491 for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1492 local_ordinal_type ip_min = ipack * vector_length;
1493 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1495 const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1497 constexpr local_ordinal_type connection_length = 2;
1499 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1500 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1502 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1503 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1504 if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1506 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1508 for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1509 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1514 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1516 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1518 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1521 if (lclrow(0) != 0) interf.row_contiguous =
false;
1523 Kokkos::deep_copy(interf.partptr, partptr);
1524 Kokkos::deep_copy(interf.lclrow, lclrow);
1526 Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1529 interf.part2rowidx0 = interf.partptr;
1530 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1532 interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1533 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1536 IFPACK2_BLOCKHELPER_TIMER(
"Fill packptr", packptr0);
1537 local_ordinal_type npacks = ceil(
float(nparts) / vector_length) * (part2packrowidx0_sub.extent(1) - 1);
1539 for (local_ordinal_type ip = 1; ip <= nparts; ++ip)
1540 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1544 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1546 for (local_ordinal_type ip = 1, k = 1; ip <= nparts; ++ip)
1547 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1550 Kokkos::deep_copy(interf.packptr, packptr);
1552 local_ordinal_type npacks_per_subpart = ceil(
float(nparts) / vector_length);
1553 npacks = ceil(
float(nparts) / vector_length) * (part2packrowidx0_sub.extent(1) - 1);
1555 interf.packindices_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"packindices_sub"), npacks_per_subpart * n_subparts_per_part);
1556 interf.packindices_schur = local_ordinal_type_2d_view(
do_not_initialize_tag(
"packindices_schur"), npacks_per_subpart, n_subparts_per_part - 1);
1558 const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1559 const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1562 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part - 1; ++local_sub_ip) {
1563 for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1564 packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1565 packindices_schur(local_pack_ip, local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1569 for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1570 packindices_sub((n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip;
1573 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1574 std::cout <<
"packindices_sub = " << std::endl;
1575 for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1576 std::cout << packindices_sub(i) <<
" ";
1578 std::cout << std::endl;
1579 std::cout <<
"packindices_sub end" << std::endl;
1581 std::cout <<
"packindices_schur = " << std::endl;
1582 for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1583 for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1584 std::cout << packindices_schur(i, j) <<
" ";
1586 std::cout << std::endl;
1589 std::cout <<
"packindices_schur end" << std::endl;
1592 Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1593 Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1596 const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1598 for (local_ordinal_type k = 0; k < npacks + 1; ++k)
1599 packptr_sub(k) = packptr(k % npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1601 Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1602 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1604 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1612 template <
typename MatrixType>
1615 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1617 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1618 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
1619 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
1620 using btdm_scalar_type_3d_view =
typename impl_type::btdm_scalar_type_3d_view;
1626 size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1629 local_ordinal_type_1d_view A_colindsub;
1632 vector_type_3d_view values;
1635 vector_type_3d_view values_schur;
1637 vector_type_4d_view e_values;
1642 size_type_1d_view diag_offsets;
1646 btdm_scalar_type_3d_view d_inv;
1648 bool is_diagonal_only;
1654 template <
typename idx_type>
1655 static KOKKOS_FORCEINLINE_FUNCTION
1657 IndexToRow(
const idx_type &ind) {
return (ind + 1) / 3; }
1660 template <
typename idx_type>
1661 static KOKKOS_FORCEINLINE_FUNCTION
1663 RowToIndex(
const idx_type &row) {
return row > 0 ? 3 * row - 1 : 0; }
1665 template <
typename idx_type>
1666 static KOKKOS_FORCEINLINE_FUNCTION
1668 NumBlocks(
const idx_type &nrows) {
return nrows > 0 ? 3 * nrows - 2 : 0; }
1670 template <
typename idx_type>
1671 static KOKKOS_FORCEINLINE_FUNCTION
1673 NumBlocksSchur(
const idx_type &nrows) {
return nrows > 0 ? 3 * nrows + 2 : 0; }
1679 template <
typename MatrixType>
1682 IFPACK2_BLOCKHELPER_TIMER(
"createBlockTridiags", createBlockTridiags0);
1684 using execution_space =
typename impl_type::execution_space;
1685 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1686 using size_type =
typename impl_type::size_type;
1687 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1689 constexpr
int vector_length = impl_type::vector_length;
1693 const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1696 btdm.flat_td_ptr = size_type_2d_view(
do_not_initialize_tag(
"btdm.flat_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1697 const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part);
1698 Kokkos::parallel_scan(
1699 "createBlockTridiags::RangePolicy::flat_td_ptr",
1700 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
1701 const local_ordinal_type partidx = i / (2 * interf.n_subparts_per_part);
1702 const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1705 btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1707 if (local_subpartidx != (2 * interf.n_subparts_per_part - 1)) {
1708 const local_ordinal_type nrows = interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 1) - interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 0);
1709 if (local_subpartidx % 2 == 0)
1710 update += btdm.NumBlocks(nrows);
1712 update += btdm.NumBlocksSchur(nrows);
1716 const auto nblocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts - 1, 2 * interf.n_subparts_per_part - 1));
1717 btdm.is_diagonal_only = (
static_cast<local_ordinal_type
>(nblocks()) == ntridiags);
1721 if (vector_length == 1) {
1722 btdm.pack_td_ptr = btdm.flat_td_ptr;
1726 local_ordinal_type npacks_per_subpart = 0;
1727 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1728 Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1729 for (local_ordinal_type ip = 1; ip <= interf.nparts; ++ip)
1730 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1731 ++npacks_per_subpart;
1733 btdm.pack_td_ptr = size_type_2d_view(
do_not_initialize_tag(
"btdm.pack_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1734 const Kokkos::RangePolicy<execution_space> policy(0, npacks_per_subpart);
1736 Kokkos::parallel_for(
1737 "createBlockTridiags::RangePolicy::pack_td_ptr",
1738 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i) {
1739 for (local_ordinal_type j = 0; j < 2 * interf.n_subparts_per_part; ++j) {
1740 const local_ordinal_type pack_id = (j == 2 * interf.n_subparts_per_part - 1) ? i + (j - 1) * npacks_per_subpart : i + j * npacks_per_subpart;
1741 const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id + 1) - interf.packptr_sub(pack_id);
1743 const local_ordinal_type parti = interf.packptr_sub(pack_id);
1744 const local_ordinal_type partidx = parti % interf.nparts;
1746 for (local_ordinal_type pti = 0; pti < nparts_in_pack; ++pti) {
1747 btdm.pack_td_ptr(partidx + pti, j) = btdm.flat_td_ptr(i, j);
1753 btdm.pack_td_ptr_schur = size_type_2d_view(
do_not_initialize_tag(
"btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1755 const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1756 constexpr local_ordinal_type connection_length = 2;
1758 host_pack_td_ptr_schur(0, 0) = 0;
1759 for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1760 if (i % vector_length == 0) {
1762 host_pack_td_ptr_schur(i, 0) = host_pack_td_ptr_schur(i - 1, host_pack_td_ptr_schur.extent(1) - 1);
1763 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part - 1; ++j) {
1764 host_pack_td_ptr_schur(i, j + 1) = host_pack_td_ptr_schur(i, j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part - 2 ? 1 : 0);
1767 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1768 host_pack_td_ptr_schur(i, j) = host_pack_td_ptr_schur(i - 1, j);
1773 Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1775 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1776 const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1777 std::cout <<
"flat_td_ptr = " << std::endl;
1778 for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1779 for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1780 std::cout << host_flat_td_ptr(i, j) <<
" ";
1782 std::cout << std::endl;
1784 std::cout <<
"flat_td_ptr end" << std::endl;
1786 const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1788 std::cout <<
"pack_td_ptr = " << std::endl;
1789 for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1790 for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1791 std::cout << host_pack_td_ptr(i, j) <<
" ";
1793 std::cout << std::endl;
1795 std::cout <<
"pack_td_ptr end" << std::endl;
1797 std::cout <<
"pack_td_ptr_schur = " << std::endl;
1798 for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1799 for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1800 std::cout << host_pack_td_ptr_schur(i, j) <<
" ";
1802 std::cout << std::endl;
1804 std::cout <<
"pack_td_ptr_schur end" << std::endl;
1808 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1823 template <
typename MatrixType>
1824 void setTridiagsToIdentity(
const BlockTridiags<MatrixType> &btdm,
1825 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
1827 using execution_space =
typename impl_type::execution_space;
1828 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1829 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1831 const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1832 const local_ordinal_type blocksize = btdm.values.extent(1);
1835 const int vector_length = impl_type::vector_length;
1836 const int internal_vector_length = impl_type::internal_vector_length;
1838 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
1839 using internal_vector_type =
typename impl_type::internal_vector_type;
1840 using internal_vector_type_4d_view =
1841 typename impl_type::internal_vector_type_4d_view;
1843 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1844 const internal_vector_type_4d_view values(reinterpret_cast<internal_vector_type *>(btdm.values.data()),
1845 btdm.values.extent(0),
1846 btdm.values.extent(1),
1847 btdm.values.extent(2),
1848 vector_length / internal_vector_length);
1849 const local_ordinal_type vector_loop_size = values.extent(3);
1850 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1851 local_ordinal_type total_team_size(0);
1853 total_team_size = 32;
1854 else if (blocksize <= 9)
1855 total_team_size = 64;
1856 else if (blocksize <= 12)
1857 total_team_size = 96;
1858 else if (blocksize <= 16)
1859 total_team_size = 128;
1860 else if (blocksize <= 20)
1861 total_team_size = 160;
1863 total_team_size = 160;
1864 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1865 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1866 #elif defined(KOKKOS_ENABLE_HIP)
1871 local_ordinal_type total_team_size(0);
1873 total_team_size = 32;
1874 else if (blocksize <= 9)
1875 total_team_size = 64;
1876 else if (blocksize <= 12)
1877 total_team_size = 96;
1878 else if (blocksize <= 16)
1879 total_team_size = 128;
1880 else if (blocksize <= 20)
1881 total_team_size = 160;
1883 total_team_size = 160;
1884 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1885 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1886 #elif defined(KOKKOS_ENABLE_SYCL)
1888 local_ordinal_type total_team_size(0);
1890 total_team_size = 32;
1891 else if (blocksize <= 9)
1892 total_team_size = 64;
1893 else if (blocksize <= 12)
1894 total_team_size = 96;
1895 else if (blocksize <= 16)
1896 total_team_size = 128;
1897 else if (blocksize <= 20)
1898 total_team_size = 160;
1900 total_team_size = 160;
1901 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1902 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1905 const team_policy_type policy(packptr.extent(0) - 1, 1, 1);
1907 Kokkos::parallel_for(
1908 "setTridiagsToIdentity::TeamPolicy",
1909 policy, KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
1910 const local_ordinal_type k = member.league_rank();
1911 const local_ordinal_type ibeg = pack_td_ptr(packptr(k), 0);
1912 const local_ordinal_type iend = pack_td_ptr(packptr(k), pack_td_ptr.extent(1) - 1);
1914 const local_ordinal_type diff = iend - ibeg;
1915 const local_ordinal_type icount = diff / 3 + (diff % 3 > 0);
1916 const btdm_scalar_type one(1);
1917 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
1918 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, icount), [&](
const local_ordinal_type &ii) {
1919 const local_ordinal_type i = ibeg + ii * 3;
1920 for (local_ordinal_type j = 0; j < blocksize; ++j) {
1921 values(i, j, j, v) = one;
1932 template <
typename MatrixType>
1934 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1935 const BlockHelperDetails::PartInterface<MatrixType> &interf,
1938 const bool overlap_communication_and_computation,
1939 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
1941 bool use_fused_jacobi) {
1942 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::SymbolicPhase", SymbolicPhase);
1946 using execution_space =
typename impl_type::execution_space;
1947 using host_execution_space =
typename impl_type::host_execution_space;
1949 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1950 using global_ordinal_type =
typename impl_type::global_ordinal_type;
1951 using size_type =
typename impl_type::size_type;
1952 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1953 using size_type_1d_view =
typename impl_type::size_type_1d_view;
1954 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
1955 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
1956 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
1957 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
1958 using btdm_scalar_type_3d_view =
typename impl_type::btdm_scalar_type_3d_view;
1960 constexpr
int vector_length = impl_type::vector_length;
1962 const auto comm = A->getRowMap()->getComm();
1964 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A);
1965 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A);
1967 bool hasBlockCrsMatrix = !A_bcrs.is_null();
1969 const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows();
1972 const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr);
1973 const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow);
1974 const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part);
1975 const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1976 const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr);
1978 const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1980 Kokkos::View<local_ordinal_type *, host_execution_space> col2row(
"col2row", A->getLocalNumCols());
1986 const auto rowmap = g->getRowMap();
1987 const auto colmap = g->getColMap();
1988 const auto dommap = g->getDomainMap();
1989 TEUCHOS_ASSERT(!(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1990 rowmap->lazyPushToHost();
1991 colmap->lazyPushToHost();
1992 dommap->lazyPushToHost();
1994 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1995 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
1996 Kokkos::parallel_for(
1997 "performSymbolicPhase::RangePolicy::col2row",
1998 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
1999 const global_ordinal_type gid = rowmap->getGlobalElement(lr);
2001 if (dommap->isNodeGlobalElement(gid)) {
2002 const local_ordinal_type lc = colmap->getLocalElement(gid);
2003 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2005 BlockHelperDetails::get_msg_prefix(comm) <<
"GID " << gid
2006 <<
" gives an invalid local column.");
2016 const auto local_graph = g->getLocalGraphHost();
2017 const auto local_graph_rowptr = local_graph.row_map;
2018 TEUCHOS_ASSERT(local_graph_rowptr.size() ==
static_cast<size_t>(nrows + 1));
2019 const auto local_graph_colidx = local_graph.entries;
2023 Kokkos::View<local_ordinal_type *, host_execution_space> lclrow2idx(
"lclrow2idx", nrows);
2025 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2026 Kokkos::parallel_for(
2027 "performSymbolicPhase::RangePolicy::lclrow2idx",
2028 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i) {
2029 lclrow2idx[lclrow(i)] = i;
2035 typename sum_reducer_type::value_type sum_reducer_value;
2037 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2038 Kokkos::parallel_reduce
2041 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr,
typename sum_reducer_type::value_type &update) {
2043 const local_ordinal_type ri0 = lclrow2idx[lr];
2044 const local_ordinal_type pi0 = rowidx2part(ri0);
2045 for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) {
2046 const local_ordinal_type lc = local_graph_colidx(j);
2047 const local_ordinal_type lc2r = col2row[lc];
2048 bool incr_R =
false;
2050 if (lc2r == (local_ordinal_type)-1) {
2054 const local_ordinal_type ri = lclrow2idx[lc2r];
2055 const local_ordinal_type pi = rowidx2part(ri);
2063 if (ri0 + 1 >= ri && ri0 <= ri + 1)
2076 sum_reducer_type(sum_reducer_value));
2078 size_type D_nnz = sum_reducer_value.v[0];
2079 size_type R_nnz_owned = sum_reducer_value.v[1];
2080 size_type R_nnz_remote = sum_reducer_value.v[2];
2082 if (!overlap_communication_and_computation) {
2083 R_nnz_owned += R_nnz_remote;
2089 const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2091 btdm.A_colindsub = local_ordinal_type_1d_view(
"btdm.A_colindsub", D_nnz);
2092 const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2094 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2098 const local_ordinal_type nparts = partptr.extent(0) - 1;
2101 const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2102 Kokkos::parallel_for(
2103 "performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2104 policy, KOKKOS_LAMBDA(
const local_ordinal_type &pi0) {
2105 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2106 local_ordinal_type offset = 0;
2107 for (local_ordinal_type ri0 = partptr(pi0); ri0 < partptr(pi0 + 1); ++ri0) {
2108 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2110 const local_ordinal_type lr0 = lclrow(ri0);
2111 const size_type j0 = local_graph_rowptr(lr0);
2112 for (size_type j = j0; j < local_graph_rowptr(lr0 + 1); ++j) {
2113 const local_ordinal_type lc = local_graph_colidx(j);
2114 const local_ordinal_type lc2r = col2row[lc];
2115 if (lc2r == (local_ordinal_type)-1)
continue;
2116 const local_ordinal_type ri = lclrow2idx[lc2r];
2117 const local_ordinal_type pi = rowidx2part(ri);
2118 if (pi != pi0)
continue;
2119 if (ri + 1 < ri0 || ri > ri0 + 1)
continue;
2120 const local_ordinal_type row_entry = j - j0;
2121 D_A_colindsub(flat_td_ptr(pi0, 0) + ((td_row_os + ri) - ri0)) = row_entry;
2126 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2127 for (
size_t i = 0; i < D_A_colindsub.extent(0); ++i)
2130 Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2134 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0) - 1, btdm.pack_td_ptr.extent(1) - 1);
2135 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2136 btdm.values = vector_type_3d_view(
"btdm.values", num_packed_blocks(), blocksize, blocksize);
2138 if (interf.n_subparts_per_part > 1) {
2139 const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0) - 1, btdm.pack_td_ptr_schur.extent(1) - 1);
2140 const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2141 btdm.values_schur = vector_type_3d_view(
"btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2144 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2150 amd.rowptr = size_type_1d_view(
"amd.rowptr", nrows + 1);
2151 amd.A_colindsub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"amd.A_colindsub"), R_nnz_owned);
2153 const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2154 const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2156 amd.rowptr_remote = size_type_1d_view(
"amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2157 amd.A_colindsub_remote = local_ordinal_type_1d_view(
do_not_initialize_tag(
"amd.A_colindsub_remote"), R_nnz_remote);
2159 const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2160 const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2163 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2164 Kokkos::parallel_for(
2165 "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2166 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
2167 const local_ordinal_type ri0 = lclrow2idx[lr];
2168 const local_ordinal_type pi0 = rowidx2part(ri0);
2169 const size_type j0 = local_graph_rowptr(lr);
2170 for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2171 const local_ordinal_type lc = local_graph_colidx(j);
2172 const local_ordinal_type lc2r = col2row[lc];
2173 if (lc2r != (local_ordinal_type)-1) {
2174 const local_ordinal_type ri = lclrow2idx[lc2r];
2175 const local_ordinal_type pi = rowidx2part(ri);
2176 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2181 if (!overlap_communication_and_computation || lc < nrows) {
2184 ++R_rowptr_remote(lr);
2193 Kokkos::RangePolicy<host_execution_space> policy(0, nrows + 1);
2194 Kokkos::parallel_scan(
2195 "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2196 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr, update_type &update,
const bool &
final) {
2198 val.v[0] = R_rowptr(lr);
2199 if (overlap_communication_and_computation)
2200 val.v[1] = R_rowptr_remote(lr);
2203 R_rowptr(lr) = update.v[0];
2204 if (overlap_communication_and_computation)
2205 R_rowptr_remote(lr) = update.v[1];
2208 const local_ordinal_type ri0 = lclrow2idx[lr];
2209 const local_ordinal_type pi0 = rowidx2part(ri0);
2211 size_type cnt_rowptr = R_rowptr(lr);
2212 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0;
2214 const size_type j0 = local_graph_rowptr(lr);
2215 for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2216 const local_ordinal_type lc = local_graph_colidx(j);
2217 const local_ordinal_type lc2r = col2row[lc];
2218 if (lc2r != (local_ordinal_type)-1) {
2219 const local_ordinal_type ri = lclrow2idx[lc2r];
2220 const local_ordinal_type pi = rowidx2part(ri);
2221 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2224 const local_ordinal_type row_entry = j - j0;
2225 if (!overlap_communication_and_computation || lc < nrows)
2226 R_A_colindsub(cnt_rowptr++) = row_entry;
2228 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2236 Kokkos::deep_copy(amd.rowptr, R_rowptr);
2237 Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2238 if (overlap_communication_and_computation) {
2240 Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2241 Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2245 if (hasBlockCrsMatrix)
2246 amd.tpetra_values = (
const_cast<block_crs_matrix_type *
>(A_bcrs.get())->getValuesDeviceNonConst());
2248 amd.tpetra_values = (
const_cast<crs_matrix_type *
>(A_crs.get()))->getLocalValuesDevice(Tpetra::Access::ReadWrite);
2254 if (interf.n_subparts_per_part > 1)
2255 btdm.e_values = vector_type_4d_view(
"btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2265 if (BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix) {
2266 bool is_async_importer_active = !async_importer.is_null();
2267 local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2268 bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2269 BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2273 if (use_fused_jacobi) {
2274 btdm.d_inv = btdm_scalar_type_3d_view(
do_not_initialize_tag(
"btdm.d_inv"), interf.nparts, blocksize, blocksize);
2275 auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2276 auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2277 btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2279 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2285 template <
typename ArgActiveExecutionMemorySpace>
2290 typedef KB::Mode::Serial mode_type;
2291 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2292 typedef KB::Algo::Level3::CompactMKL algo_type;
2294 typedef KB::Algo::Level3::Blocked algo_type;
2296 static int recommended_team_size(
const int ,
2303 #if defined(KOKKOS_ENABLE_CUDA)
2304 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(
const int blksize,
2305 const int vector_length,
2306 const int internal_vector_length) {
2307 const int vector_size = vector_length / internal_vector_length;
2308 int total_team_size(0);
2310 total_team_size = 32;
2311 else if (blksize <= 9)
2312 total_team_size = 32;
2313 else if (blksize <= 12)
2314 total_team_size = 96;
2315 else if (blksize <= 16)
2316 total_team_size = 128;
2317 else if (blksize <= 20)
2318 total_team_size = 160;
2320 total_team_size = 160;
2321 return 2 * total_team_size / vector_size;
2324 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2325 typedef KB::Mode::Team mode_type;
2326 typedef KB::Algo::Level3::Unblocked algo_type;
2327 static int recommended_team_size(
const int blksize,
2328 const int vector_length,
2329 const int internal_vector_length) {
2330 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2334 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2335 typedef KB::Mode::Team mode_type;
2336 typedef KB::Algo::Level3::Unblocked algo_type;
2337 static int recommended_team_size(
const int blksize,
2338 const int vector_length,
2339 const int internal_vector_length) {
2340 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2345 #if defined(KOKKOS_ENABLE_HIP)
2346 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(
const int blksize,
2347 const int vector_length,
2348 const int internal_vector_length) {
2349 const int vector_size = vector_length / internal_vector_length;
2350 int total_team_size(0);
2352 total_team_size = 32;
2353 else if (blksize <= 9)
2354 total_team_size = 32;
2355 else if (blksize <= 12)
2356 total_team_size = 96;
2357 else if (blksize <= 16)
2358 total_team_size = 128;
2359 else if (blksize <= 20)
2360 total_team_size = 160;
2362 total_team_size = 160;
2363 return 2 * total_team_size / vector_size;
2366 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2367 typedef KB::Mode::Team mode_type;
2368 typedef KB::Algo::Level3::Unblocked algo_type;
2369 static int recommended_team_size(
const int blksize,
2370 const int vector_length,
2371 const int internal_vector_length) {
2372 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2376 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2377 typedef KB::Mode::Team mode_type;
2378 typedef KB::Algo::Level3::Unblocked algo_type;
2379 static int recommended_team_size(
const int blksize,
2380 const int vector_length,
2381 const int internal_vector_length) {
2382 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2387 #if defined(KOKKOS_ENABLE_SYCL)
2388 static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(
const int blksize,
2389 const int vector_length,
2390 const int internal_vector_length) {
2391 const int vector_size = vector_length / internal_vector_length;
2392 int total_team_size(0);
2394 total_team_size = 32;
2395 else if (blksize <= 9)
2396 total_team_size = 32;
2397 else if (blksize <= 12)
2398 total_team_size = 96;
2399 else if (blksize <= 16)
2400 total_team_size = 128;
2401 else if (blksize <= 20)
2402 total_team_size = 160;
2404 total_team_size = 160;
2405 return 2 * total_team_size / vector_size;
2408 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2409 typedef KB::Mode::Team mode_type;
2410 typedef KB::Algo::Level3::Unblocked algo_type;
2411 static int recommended_team_size(
const int blksize,
2412 const int vector_length,
2413 const int internal_vector_length) {
2414 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2418 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2419 typedef KB::Mode::Team mode_type;
2420 typedef KB::Algo::Level3::Unblocked algo_type;
2421 static int recommended_team_size(
const int blksize,
2422 const int vector_length,
2423 const int internal_vector_length) {
2424 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2429 template <
typename impl_type,
typename WWViewType>
2430 KOKKOS_INLINE_FUNCTION
void
2431 solveMultiVector(
const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2432 const typename impl_type::local_ordinal_type & ,
2433 const typename impl_type::local_ordinal_type &i0,
2434 const typename impl_type::local_ordinal_type &r0,
2435 const typename impl_type::local_ordinal_type &nrows,
2436 const typename impl_type::local_ordinal_type &v,
2437 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2438 const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2439 const WWViewType &WW,
2440 const bool skip_first_pass =
false) {
2441 using execution_space =
typename impl_type::execution_space;
2442 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2443 using member_type =
typename team_policy_type::member_type;
2444 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2446 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2448 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2449 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2451 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2454 #if KOKKOS_VERSION >= 40799
2455 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2457 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2459 #if KOKKOS_VERSION >= 40799
2460 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2462 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2466 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2467 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2470 local_ordinal_type i = i0, r = r0;
2474 if (skip_first_pass) {
2475 i += (nrows - 2) * 3;
2477 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2478 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2479 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2480 KB::Trsm<member_type,
2481 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2482 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2483 X1.assign_data(X2.data());
2486 KB::Trsm<member_type,
2487 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2488 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2489 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
2490 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2491 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2492 member.team_barrier();
2493 KB::Gemm<member_type,
2494 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2495 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2496 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2497 KB::Trsm<member_type,
2498 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2499 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2500 X1.assign_data(X2.data());
2505 KB::Trsm<member_type,
2506 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2507 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2508 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2510 A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
2511 X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
2512 member.team_barrier();
2513 KB::Gemm<member_type,
2514 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2515 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2517 A.assign_data(&D_internal_vector_values(i, 0, 0, v));
2518 KB::Trsm<member_type,
2519 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2520 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2521 X1.assign_data(X2.data());
2525 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2526 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
2527 member.team_barrier();
2528 KB::Gemm<member_type,
2529 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2530 default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
2534 template <
typename impl_type,
typename WWViewType,
typename XViewType>
2535 KOKKOS_INLINE_FUNCTION
void
2536 solveSingleVectorNew(
const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2537 const typename impl_type::local_ordinal_type &blocksize,
2538 const typename impl_type::local_ordinal_type &i0,
2539 const typename impl_type::local_ordinal_type &r0,
2540 const typename impl_type::local_ordinal_type &nrows,
2541 const typename impl_type::local_ordinal_type &v,
2542 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2543 const XViewType &X_internal_vector_values,
2544 const WWViewType &WW) {
2545 using execution_space =
typename impl_type::execution_space;
2548 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2550 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2552 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2553 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2555 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2558 auto A = D_internal_vector_values.data();
2559 auto X = X_internal_vector_values.data();
2562 #if KOKKOS_VERSION >= 40799
2563 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2565 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2567 #if KOKKOS_VERSION >= 40799
2568 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2570 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2575 const local_ordinal_type astep = D_internal_vector_values.stride(0);
2576 const local_ordinal_type as0 = D_internal_vector_values.stride(1);
2577 const local_ordinal_type as1 = D_internal_vector_values.stride(2);
2578 const local_ordinal_type xstep = X_internal_vector_values.stride(0);
2579 const local_ordinal_type xs0 = X_internal_vector_values.stride(1);
2582 A += i0 * astep + v;
2583 X += r0 * xstep + v;
2588 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2591 blocksize, blocksize,
2596 for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
2597 member.team_barrier();
2598 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2600 blocksize, blocksize,
2602 A + 2 * astep, as0, as1,
2605 X + 1 * xstep, xs0);
2606 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2609 blocksize, blocksize,
2611 A + 3 * astep, as0, as1,
2612 X + 1 * xstep, xs0);
2619 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2622 blocksize, blocksize,
2627 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2629 member.team_barrier();
2630 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2632 blocksize, blocksize,
2634 A + 1 * astep, as0, as1,
2637 X - 1 * xstep, xs0);
2638 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2641 blocksize, blocksize,
2644 X - 1 * xstep, xs0);
2650 const local_ordinal_type ws0 = WW.stride(0);
2651 auto W = WW.data() + v;
2652 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
2653 member, blocksize, X, xs0, W, ws0);
2654 member.team_barrier();
2655 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2657 blocksize, blocksize,
2666 template <
typename local_ordinal_type,
typename ViewType>
2667 void writeBTDValuesToFile(
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2668 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2669 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2670 std::ofstream myfile;
2671 myfile.open(fileName);
2673 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2674 local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2675 const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2676 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2678 const local_ordinal_type block_size = scalar_values.extent(1);
2680 const local_ordinal_type n_rows_per_part = (n_blocks_per_part + 2) / 3 * block_size;
2681 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2683 const local_ordinal_type n_packs = ceil(
float(n_parts) / n_parts_per_pack);
2685 myfile <<
"%%MatrixMarket matrix coordinate real general" << std::endl;
2686 myfile <<
"%%nnz = " << nnz;
2687 myfile <<
" block size = " << block_size;
2688 myfile <<
" number of blocks = " << n_blocks;
2689 myfile <<
" number of parts = " << n_parts;
2690 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2691 myfile <<
" number of rows = " << n_rows;
2692 myfile <<
" number of cols = " << n_rows;
2693 myfile <<
" number of packs = " << n_packs << std::endl;
2695 myfile << n_rows <<
" " << n_rows <<
" " << nnz << std::setprecision(9) << std::endl;
2697 local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2698 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2699 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2700 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2701 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2702 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2703 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2705 if (i_block_in_part % 3 == 0) {
2706 current_row_offset = i_block_in_part / 3 * block_size;
2707 current_col_offset = i_block_in_part / 3 * block_size;
2708 }
else if (i_block_in_part % 3 == 1) {
2709 current_row_offset = (i_block_in_part - 1) / 3 * block_size;
2710 current_col_offset = ((i_block_in_part - 1) / 3 + 1) * block_size;
2711 }
else if (i_block_in_part % 3 == 2) {
2712 current_row_offset = ((i_block_in_part - 2) / 3 + 1) * block_size;
2713 current_col_offset = (i_block_in_part - 2) / 3 * block_size;
2715 current_row_offset += current_part_idx * n_rows_per_part;
2716 current_col_offset += current_part_idx * n_rows_per_part;
2717 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2718 for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2719 current_row = current_row_offset + i_in_block + 1;
2720 current_col = current_col_offset + j_in_block + 1;
2721 myfile << current_row <<
" " << current_col <<
" " << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2732 template <
typename local_ordinal_type,
typename ViewType>
2733 void write4DMultiVectorValuesToFile(
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2734 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2735 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2736 std::ofstream myfile;
2737 myfile.open(fileName);
2739 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2740 const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2741 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2743 const local_ordinal_type block_size = scalar_values.extent(1);
2744 const local_ordinal_type n_cols = scalar_values.extent(2);
2746 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2747 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2749 const local_ordinal_type n_packs = ceil(
float(n_parts) / n_parts_per_pack);
2751 myfile <<
"%%MatrixMarket matrix array real general" << std::endl;
2752 myfile <<
"%%block size = " << block_size;
2753 myfile <<
" number of blocks = " << n_blocks;
2754 myfile <<
" number of parts = " << n_parts;
2755 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2756 myfile <<
" number of rows = " << n_rows;
2757 myfile <<
" number of cols = " << n_cols;
2758 myfile <<
" number of packs = " << n_packs << std::endl;
2760 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2762 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2763 (void)current_row_offset;
2764 (void)current_part_idx;
2765 for (local_ordinal_type j_in_block = 0; j_in_block < n_cols; ++j_in_block) {
2766 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2767 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2768 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2769 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2770 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2772 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2774 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2775 myfile << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2785 template <
typename local_ordinal_type,
typename ViewType>
2786 void write5DMultiVectorValuesToFile(
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2787 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2788 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2789 std::ofstream myfile;
2790 myfile.open(fileName);
2792 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2793 const local_ordinal_type n_blocks = scalar_values.extent(1) * n_parts_per_pack;
2794 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2796 const local_ordinal_type block_size = scalar_values.extent(2);
2797 const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2798 const local_ordinal_type n_cols = n_blocks_cols * block_size;
2800 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2801 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2803 const local_ordinal_type n_packs = ceil(
float(n_parts) / n_parts_per_pack);
2805 myfile <<
"%%MatrixMarket matrix array real general" << std::endl;
2806 myfile <<
"%%block size = " << block_size;
2807 myfile <<
" number of blocks = " << n_blocks;
2808 myfile <<
" number of parts = " << n_parts;
2809 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2810 myfile <<
" number of rows = " << n_rows;
2811 myfile <<
" number of cols = " << n_cols;
2812 myfile <<
" number of packs = " << n_packs << std::endl;
2814 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2816 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2817 (void)current_row_offset;
2818 (void)current_part_idx;
2819 for (local_ordinal_type i_block_col = 0; i_block_col < n_blocks_cols; ++i_block_col) {
2820 for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2821 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2822 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2823 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2824 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2825 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2827 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(1))
2829 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2830 myfile << scalar_values(i_block_col, current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2841 template <
typename local_ordinal_type,
typename member_type,
typename ViewType1,
typename ViewType2>
2842 KOKKOS_INLINE_FUNCTION
void
2843 copy3DView(
const member_type &member,
const ViewType1 &view1,
const ViewType2 &view2) {
2856 Kokkos::Experimental::local_deep_copy(member, view1, view2);
2858 template <
typename MatrixType,
int ScratchLevel>
2859 struct ExtractAndFactorizeTridiags {
2861 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2863 using execution_space =
typename impl_type::execution_space;
2864 using memory_space =
typename impl_type::memory_space;
2866 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2869 using magnitude_type =
typename impl_type::magnitude_type;
2871 using row_matrix_type =
typename impl_type::tpetra_row_matrix_type;
2872 using crs_graph_type =
typename impl_type::tpetra_crs_graph_type;
2874 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
2875 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
2877 using size_type_2d_view =
typename impl_type::size_type_2d_view;
2878 using impl_scalar_type_1d_view_tpetra =
typename impl_type::impl_scalar_type_1d_view_tpetra;
2880 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
2881 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2882 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
2883 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
2884 using internal_vector_type_4d_view =
typename impl_type::internal_vector_type_4d_view;
2885 using internal_vector_type_5d_view =
typename impl_type::internal_vector_type_5d_view;
2886 using btdm_scalar_type_2d_view =
typename impl_type::btdm_scalar_type_2d_view;
2887 using btdm_scalar_type_3d_view =
typename impl_type::btdm_scalar_type_3d_view;
2888 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
2889 using btdm_scalar_type_5d_view =
typename impl_type::btdm_scalar_type_5d_view;
2890 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2891 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2892 using tpetra_block_access_view_type =
typename impl_type::tpetra_block_access_view_type;
2893 using local_crs_graph_type =
typename impl_type::local_crs_graph_type;
2894 using colinds_view =
typename local_crs_graph_type::entries_type;
2896 using internal_vector_type =
typename impl_type::internal_vector_type;
2897 static constexpr
int vector_length = impl_type::vector_length;
2898 static constexpr
int internal_vector_length = impl_type::internal_vector_length;
2899 static_assert(vector_length >= internal_vector_length,
"Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2900 static_assert(vector_length % internal_vector_length == 0,
"Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2905 static constexpr
int half_vector_length = impl_type::half_vector_length;
2908 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2909 using member_type =
typename team_policy_type::member_type;
2913 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2914 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2915 const local_ordinal_type max_partsz;
2917 using size_type_1d_view_tpetra = Kokkos::View<size_t *, typename impl_type::node_device_type>;
2918 ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2919 ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2920 ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2922 const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2923 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2924 const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2925 const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2926 const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2927 const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2928 const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2929 const Unmanaged<size_type_1d_view> diag_offsets;
2931 const local_ordinal_type blocksize, blocksize_square;
2933 const magnitude_type tiny;
2934 const local_ordinal_type vector_loop_size;
2936 bool hasBlockCrsMatrix;
2939 ExtractAndFactorizeTridiags(
const BlockTridiags<MatrixType> &btdm_,
2940 const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2943 const magnitude_type &tiny_)
2945 partptr(interf_.partptr)
2946 , lclrow(interf_.lclrow)
2947 , packptr(interf_.packptr)
2948 , packindices_sub(interf_.packindices_sub)
2949 , packptr_sub(interf_.packptr_sub)
2950 , partptr_sub(interf_.partptr_sub)
2951 , part2packrowidx0_sub(interf_.part2packrowidx0_sub)
2952 , packindices_schur(interf_.packindices_schur)
2953 , max_partsz(interf_.max_partsz)
2956 pack_td_ptr(btdm_.pack_td_ptr)
2957 , flat_td_ptr(btdm_.flat_td_ptr)
2958 , pack_td_ptr_schur(btdm_.pack_td_ptr_schur)
2959 , A_colindsub(btdm_.A_colindsub)
2960 , internal_vector_values((internal_vector_type *)btdm_.values.data(),
2961 btdm_.values.extent(0),
2962 btdm_.values.extent(1),
2963 btdm_.values.extent(2),
2964 vector_length / internal_vector_length)
2965 , internal_vector_values_schur((internal_vector_type *)btdm_.values_schur.data(),
2966 btdm_.values_schur.extent(0),
2967 btdm_.values_schur.extent(1),
2968 btdm_.values_schur.extent(2),
2969 vector_length / internal_vector_length)
2970 , e_internal_vector_values((internal_vector_type *)btdm_.e_values.data(),
2971 btdm_.e_values.extent(0),
2972 btdm_.e_values.extent(1),
2973 btdm_.e_values.extent(2),
2974 btdm_.e_values.extent(3),
2975 vector_length / internal_vector_length)
2976 , scalar_values((btdm_scalar_type *)btdm_.values.data(),
2977 btdm_.values.extent(0),
2978 btdm_.values.extent(1),
2979 btdm_.values.extent(2),
2981 , scalar_values_schur((btdm_scalar_type *)btdm_.values_schur.data(),
2982 btdm_.values_schur.extent(0),
2983 btdm_.values_schur.extent(1),
2984 btdm_.values_schur.extent(2),
2986 , e_scalar_values((btdm_scalar_type *)btdm_.e_values.data(),
2987 btdm_.e_values.extent(0),
2988 btdm_.e_values.extent(1),
2989 btdm_.e_values.extent(2),
2990 btdm_.e_values.extent(3),
2992 , d_inv(btdm_.d_inv)
2993 , diag_offsets(btdm_.diag_offsets)
2994 , blocksize(btdm_.values.extent(1))
2995 , blocksize_square(blocksize * blocksize)
2999 , vector_loop_size(vector_length / internal_vector_length) {
3000 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
3001 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
3003 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A_);
3004 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A_);
3006 hasBlockCrsMatrix = !A_bcrs.is_null();
3008 A_block_rowptr = G_->getLocalGraphDevice().row_map;
3009 if (hasBlockCrsMatrix) {
3010 A_values =
const_cast<block_crs_matrix_type *
>(A_bcrs.get())->getValuesDeviceNonConst();
3012 A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
3013 A_values = A_crs->getLocalValuesDevice(Tpetra::Access::ReadOnly);
3018 KOKKOS_INLINE_FUNCTION
3020 extract(local_ordinal_type partidx,
3021 local_ordinal_type local_subpartidx,
3022 local_ordinal_type npacks)
const {
3023 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3024 printf(
"extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
3026 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3027 const size_type kps = pack_td_ptr(partidx, local_subpartidx);
3028 local_ordinal_type kfs[vector_length] = {};
3029 local_ordinal_type ri0[vector_length] = {};
3030 local_ordinal_type nrows[vector_length] = {};
3032 for (local_ordinal_type vi = 0; vi < npacks; ++vi, ++partidx) {
3033 kfs[vi] = flat_td_ptr(partidx, local_subpartidx);
3034 ri0[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 0);
3035 nrows[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 1) - ri0[vi];
3036 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3037 printf(
"kfs[%d] = %d;\n", vi, kfs[vi]);
3038 printf(
"ri0[%d] = %d;\n", vi, ri0[vi]);
3039 printf(
"nrows[%d] = %d;\n", vi, nrows[vi]);
3042 local_ordinal_type tr_min = 0;
3043 local_ordinal_type tr_max = nrows[0];
3044 if (local_subpartidx % 2 == 1) {
3048 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3049 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3051 for (local_ordinal_type tr = tr_min, j = 0; tr < tr_max; ++tr) {
3052 for (local_ordinal_type e = 0; e < 3; ++e) {
3053 if (hasBlockCrsMatrix) {
3054 const impl_scalar_type *block[vector_length] = {};
3055 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3056 const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
3058 block[vi] = &A_values(Aj * blocksize_square);
3060 const size_type pi = kps + j;
3061 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3062 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
3065 for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3066 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3067 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
3068 auto &v = internal_vector_values(pi, ii, jj, 0);
3069 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3070 v[vi] =
static_cast<btdm_scalar_type
>(block[vi][idx]);
3075 const size_type pi = kps + j;
3077 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3078 const size_type Aj_c = A_colindsub(kfs[vi] + j);
3080 for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3081 auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr) * blocksize + ii);
3083 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3084 scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c * blocksize + jj);
3090 if (nrows[0] == 1)
break;
3091 if (local_subpartidx % 2 == 0) {
3092 if (e == 1 && (tr == 0 || tr + 1 == nrows[0]))
break;
3093 for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3094 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr + 1 == nrows[vi])) {
3100 if (e == 0 && (tr == -1 || tr == nrows[0]))
break;
3101 for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3102 if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3112 KOKKOS_INLINE_FUNCTION
3114 extract(
const member_type &member,
3115 const local_ordinal_type &partidxbeg,
3116 local_ordinal_type local_subpartidx,
3117 const local_ordinal_type &npacks,
3118 const local_ordinal_type &vbeg)
const {
3119 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3120 printf(
"extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3122 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3123 local_ordinal_type kfs_vals[internal_vector_length] = {};
3124 local_ordinal_type ri0_vals[internal_vector_length] = {};
3125 local_ordinal_type nrows_vals[internal_vector_length] = {};
3127 const size_type kps = pack_td_ptr(partidxbeg, local_subpartidx);
3128 for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3129 kfs_vals[vi] = flat_td_ptr(partidxbeg + vi, local_subpartidx);
3130 ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 0);
3131 nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 1) - ri0_vals[vi];
3132 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3133 printf(
"kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3134 printf(
"ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3135 printf(
"nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3139 local_ordinal_type j_vals[internal_vector_length] = {};
3141 local_ordinal_type tr_min = 0;
3142 local_ordinal_type tr_max = nrows_vals[0];
3143 if (local_subpartidx % 2 == 1) {
3147 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3148 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3150 for (local_ordinal_type tr = tr_min; tr < tr_max; ++tr) {
3151 for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3152 const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
3153 if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows + 1)) {
3154 auto &j = j_vals[vi];
3155 const local_ordinal_type kfs = kfs_vals[vi];
3156 const local_ordinal_type ri0 = ri0_vals[vi];
3157 local_ordinal_type lbeg, lend;
3158 if (local_subpartidx % 2 == 0) {
3159 lbeg = (tr == tr_min ? 1 : 0);
3160 lend = (tr == nrows - 1 ? 2 : 3);
3167 }
else if (tr == nrows) {
3172 if (hasBlockCrsMatrix) {
3173 for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3174 const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3175 const impl_scalar_type *block = &A_values(Aj * blocksize_square);
3176 const size_type pi = kps + j;
3177 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3178 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3180 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3181 [&](
const local_ordinal_type &ii) {
3182 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3183 scalar_values(pi, ii, jj, v) =
static_cast<btdm_scalar_type
>(block[tlb::getFlatIndex(ii, jj, blocksize)]);
3188 for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3189 const size_type Aj_c = A_colindsub(kfs + j);
3190 const size_type pi = kps + j;
3191 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3192 [&](
const local_ordinal_type &ii) {
3193 auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr) * blocksize + ii);
3194 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3195 scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c * blocksize + jj);
3205 template <
typename AAViewType,
3206 typename WWViewType>
3207 KOKKOS_INLINE_FUNCTION
void
3208 factorize_subline(
const member_type &member,
3209 const local_ordinal_type &i0,
3210 const local_ordinal_type &nrows,
3211 const local_ordinal_type &v,
3212 const AAViewType &AA,
3213 const WWViewType &WW)
const {
3214 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3216 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3217 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3220 #if KOKKOS_VERSION >= 40799
3221 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3223 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3226 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3227 printf(
"i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3231 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3233 default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3238 local_ordinal_type i = i0;
3239 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
3240 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3241 printf(
"tr = %d, i = %d;\n", tr, i);
3243 B.assign_data(&AA(i + 1, 0, 0, v));
3244 KB::Trsm<member_type,
3245 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3246 default_mode_type, default_algo_type>::invoke(member, one, A, B);
3247 C.assign_data(&AA(i + 2, 0, 0, v));
3248 KB::Trsm<member_type,
3249 KB::Side::Right, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3250 default_mode_type, default_algo_type>::invoke(member, one, A, C);
3251 A.assign_data(&AA(i + 3, 0, 0, v));
3253 member.team_barrier();
3254 KB::Gemm<member_type,
3255 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3256 default_mode_type, default_algo_type>::invoke(member, -one, C, B, one, A);
3258 default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3262 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3263 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, A, W);
3264 KB::SetIdentity<member_type, default_mode_type>::invoke(member, A);
3265 member.team_barrier();
3266 KB::Trsm<member_type,
3267 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3268 default_mode_type, default_algo_type>::invoke(member, one, W, A);
3269 KB::Trsm<member_type,
3270 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3271 default_mode_type, default_algo_type>::invoke(member, one, W, A);
3276 struct ExtractAndFactorizeSubLineTag {};
3277 struct ExtractAndFactorizeFusedJacobiTag {};
3278 struct ExtractBCDTag {};
3279 struct ComputeETag {};
3280 struct ComputeSchurTag {};
3281 struct FactorizeSchurTag {};
3283 KOKKOS_INLINE_FUNCTION
3285 operator()(
const ExtractAndFactorizeSubLineTag &,
const member_type &member)
const {
3287 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3289 const local_ordinal_type subpartidx = packptr_sub(packidx);
3290 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3291 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3292 const local_ordinal_type partidx = subpartidx % n_parts;
3294 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3295 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3296 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3298 internal_vector_scratch_type_3d_view
3299 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3301 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3302 printf(
"rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3303 printf(
"vector_loop_size = %d\n", vector_loop_size);
3306 if (vector_loop_size == 1) {
3307 extract(partidx, local_subpartidx, npacks);
3308 factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3310 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3311 [&](
const local_ordinal_type &v) {
3312 const local_ordinal_type vbeg = v * internal_vector_length;
3313 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3314 printf(
"i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3317 extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3320 member.team_barrier();
3321 factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3326 KOKKOS_INLINE_FUNCTION
3328 operator()(
const ExtractAndFactorizeFusedJacobiTag &,
const member_type &member)
const {
3329 using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3330 using default_mode_type =
typename default_mode_and_algo_type::mode_type;
3331 using default_algo_type =
typename default_mode_and_algo_type::algo_type;
3334 btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3335 btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3336 #if KOKKOS_VERSION >= 40799
3337 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3339 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3341 const local_ordinal_type nrows = lclrow.extent(0);
3342 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, half_vector_length),
3343 [&](
const local_ordinal_type &v) {
3344 local_ordinal_type row = member.league_rank() * half_vector_length + v;
3346 auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3347 auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3350 const impl_scalar_type *A_diag = A_values.data() + diag_offsets(row);
3353 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3355 W1.data()[i] = A_diag[i];
3358 KB::SetIdentity<member_type, default_mode_type>::invoke(member, W2);
3363 KB::SetIdentity<member_type, default_mode_type>::invoke(member, W1);
3365 member.team_barrier();
3367 KB::LU<member_type, default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, W1, tiny);
3368 member.team_barrier();
3369 KB::Trsm<member_type,
3370 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3371 default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3372 KB::Trsm<member_type,
3373 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3374 default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3375 member.team_barrier();
3377 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3379 auto d_inv_block = &d_inv(row, 0, 0);
3380 d_inv_block[i] = W2.data()[i];
3386 KOKKOS_INLINE_FUNCTION
3388 operator()(
const ExtractBCDTag &,
const member_type &member)
const {
3390 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3391 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3392 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3394 const local_ordinal_type subpartidx = packptr_sub(packidx);
3395 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3396 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3397 const local_ordinal_type partidx = subpartidx % n_parts;
3399 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3403 if (vector_loop_size == 1) {
3404 extract(partidx, local_subpartidx, npacks);
3406 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3407 [&](
const local_ordinal_type &v) {
3408 const local_ordinal_type vbeg = v * internal_vector_length;
3409 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3410 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3411 printf(
"i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3414 extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3418 member.team_barrier();
3420 const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3421 const size_type kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 1;
3423 const local_ordinal_type r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3424 const local_ordinal_type r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3426 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3427 printf(
"Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3431 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3432 Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3434 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3435 Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3438 KOKKOS_INLINE_FUNCTION
3440 operator()(
const ComputeETag &,
const member_type &member)
const {
3442 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3444 const local_ordinal_type subpartidx = packptr_sub(packidx);
3445 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3446 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3447 const local_ordinal_type partidx = subpartidx % n_parts;
3449 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3450 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3451 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
3452 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3453 const local_ordinal_type num_vectors = blocksize;
3457 internal_vector_scratch_type_3d_view
3458 WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3459 if (local_subpartidx == 0) {
3460 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3461 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW,
true);
3463 }
else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
3464 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3465 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3468 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3469 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW,
true);
3470 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3475 KOKKOS_INLINE_FUNCTION
3477 operator()(
const ComputeSchurTag &,
const member_type &member)
const {
3479 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3480 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3481 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3483 const local_ordinal_type subpartidx = packptr_sub(packidx);
3484 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3485 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3486 const local_ordinal_type partidx = subpartidx % n_parts;
3489 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3495 const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
3496 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
3497 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0 + 2 : i0 + 2;
3499 for (local_ordinal_type i = 0; i < 4; ++i) {
3500 copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3501 Kokkos::subview(internal_vector_values, i0_offset + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3504 member.team_barrier();
3506 #if KOKKOS_VERSION >= 40799
3507 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3509 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3512 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx) + 1;
3513 const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 2;
3515 const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3516 const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3518 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3520 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3521 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3523 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3524 for (size_type i = 0; i < pack_td_ptr_schur(partidx, local_subpartidx_schur + 1) - pack_td_ptr_schur(partidx, local_subpartidx_schur); ++i) {
3525 local_ordinal_type e_r, e_c, c_kps;
3527 if (local_subpartidx_schur == 0) {
3532 }
else if (i == 3) {
3536 }
else if (i == 4) {
3548 }
else if (i == 1) {
3552 }
else if (i == 4) {
3556 }
else if (i == 5) {
3565 auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx, local_subpartidx_schur) + i, Kokkos::ALL(), Kokkos::ALL(), v);
3566 auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3567 auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3568 KB::Gemm<member_type,
3569 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3570 default_mode_type, default_algo_type>::invoke(member, -one, C, E, one, S);
3575 KOKKOS_INLINE_FUNCTION
3577 operator()(
const FactorizeSchurTag &,
const member_type &member)
const {
3578 const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3580 const local_ordinal_type subpartidx = packptr_sub(packidx);
3582 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3583 const local_ordinal_type partidx = subpartidx % n_parts;
3585 const local_ordinal_type i0 = pack_td_ptr_schur(partidx, 0);
3586 const local_ordinal_type nrows = 2 * (pack_td_ptr_schur.extent(1) - 1);
3588 internal_vector_scratch_type_3d_view
3589 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3591 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3592 printf(
"FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3595 if (vector_loop_size == 1) {
3596 factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3598 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3599 [&](
const local_ordinal_type &v) {
3600 factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3606 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3607 const local_ordinal_type team_size =
3608 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3609 recommended_team_size(blocksize, vector_length, internal_vector_length);
3610 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3611 shmem_size(blocksize, blocksize, vector_loop_size);
3614 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3615 printf(
"Start ExtractAndFactorizeSubLineTag\n");
3617 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3618 Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeSubLineTag>
3619 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3621 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3622 writeBTDValuesToFile(n_parts, scalar_values,
"before.mm");
3624 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3625 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3627 execution_space().fence();
3629 writeBTDValuesToFile(n_parts, scalar_values,
"after.mm");
3630 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3631 printf(
"End ExtractAndFactorizeSubLineTag\n");
3635 if (packindices_schur.extent(1) > 0) {
3637 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3638 printf(
"Start ExtractBCDTag\n");
3640 #if KOKKOS_VERSION >= 40799
3641 Kokkos::deep_copy(e_scalar_values, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3643 Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3645 #if KOKKOS_VERSION >= 40799
3646 Kokkos::deep_copy(scalar_values_schur, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3648 Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3651 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_before_extract.mm");
3654 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3655 Kokkos::TeamPolicy<execution_space, ExtractBCDTag>
3656 policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3658 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3659 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3661 execution_space().fence();
3664 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3665 printf(
"End ExtractBCDTag\n");
3667 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values,
"after_extraction_of_BCD.mm");
3668 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3669 printf(
"Start ComputeETag\n");
3671 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_after_extract.mm");
3673 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3674 Kokkos::TeamPolicy<execution_space, ComputeETag>
3675 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3677 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3678 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3680 execution_space().fence();
3682 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_after_compute.mm");
3684 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3685 printf(
"End ComputeETag\n");
3690 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3691 printf(
"Start ComputeSchurTag\n");
3693 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3694 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"before_schur.mm");
3695 Kokkos::TeamPolicy<execution_space, ComputeSchurTag>
3696 policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3698 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3700 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"after_schur.mm");
3701 execution_space().fence();
3702 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3703 printf(
"End ComputeSchurTag\n");
3708 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3709 printf(
"Start FactorizeSchurTag\n");
3711 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3712 Kokkos::TeamPolicy<execution_space, FactorizeSchurTag>
3713 policy(packindices_schur.extent(0), team_size, vector_loop_size);
3714 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3715 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3717 execution_space().fence();
3718 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"after_factor_schur.mm");
3719 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3720 printf(
"End FactorizeSchurTag\n");
3725 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3728 void run_fused_jacobi() {
3729 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3730 const local_ordinal_type team_size =
3731 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3732 recommended_team_size(blocksize, half_vector_length, 1);
3733 const local_ordinal_type per_team_scratch =
3734 btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3736 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3737 Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3738 policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3740 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3741 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3744 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3751 template <
typename MatrixType>
3753 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3754 const BlockHelperDetails::PartInterface<MatrixType> &interf,
3756 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3757 bool use_fused_jacobi) {
3759 using execution_space =
typename impl_type::execution_space;
3760 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3761 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3762 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3764 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase", NumericPhase);
3766 int blocksize = btdm.values.extent(1);
3769 int scratch_required;
3770 if (!use_fused_jacobi) {
3772 scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3775 scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3778 int max_scratch = team_policy_type::scratch_size_max(0);
3780 if (scratch_required < max_scratch) {
3782 ExtractAndFactorizeTridiags<MatrixType, 0>
function(btdm, interf, A, G, tiny);
3783 if (!use_fused_jacobi)
3786 function.run_fused_jacobi();
3789 ExtractAndFactorizeTridiags<MatrixType, 1>
function(btdm, interf, A, G, tiny);
3790 if (!use_fused_jacobi)
3793 function.run_fused_jacobi();
3795 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3801 template <
typename MatrixType>
3805 using execution_space =
typename impl_type::execution_space;
3806 using memory_space =
typename impl_type::memory_space;
3808 using local_ordinal_type =
typename impl_type::local_ordinal_type;
3810 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
3811 using tpetra_multivector_type =
typename impl_type::tpetra_multivector_type;
3812 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
3813 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
3814 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
3815 using const_impl_scalar_type_2d_view_tpetra =
typename impl_scalar_type_2d_view_tpetra::const_type;
3816 static constexpr
int vector_length = impl_type::vector_length;
3818 using member_type =
typename Kokkos::TeamPolicy<execution_space>::member_type;
3822 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3823 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3824 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3825 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3826 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3827 const local_ordinal_type blocksize;
3828 const local_ordinal_type num_vectors;
3831 vector_type_3d_view packed_multivector;
3832 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3834 template <
typename TagType>
3835 KOKKOS_INLINE_FUNCTION
void copy_multivectors(
const local_ordinal_type &j,
3836 const local_ordinal_type &vi,
3837 const local_ordinal_type &pri,
3838 const local_ordinal_type &ri0)
const {
3839 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3840 for (local_ordinal_type i = 0; i < blocksize; ++i)
3841 packed_multivector(pri, i, col)[vi] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3846 const vector_type_3d_view &pmv)
3847 : partptr(interf.partptr)
3848 , packptr(interf.packptr)
3849 , part2packrowidx0(interf.part2packrowidx0)
3850 , part2rowidx0(interf.part2rowidx0)
3851 , lclrow(interf.lclrow)
3852 , blocksize(pmv.extent(1))
3853 , num_vectors(pmv.extent(2))
3854 , packed_multivector(pmv) {}
3857 KOKKOS_INLINE_FUNCTION
3859 operator()(
const local_ordinal_type &packidx)
const {
3860 local_ordinal_type partidx = packptr(packidx);
3861 local_ordinal_type npacks = packptr(packidx + 1) - partidx;
3862 const local_ordinal_type pri0 = part2packrowidx0(partidx);
3864 local_ordinal_type ri0[vector_length] = {};
3865 local_ordinal_type nrows[vector_length] = {};
3866 for (local_ordinal_type v = 0; v < npacks; ++v, ++partidx) {
3867 ri0[v] = part2rowidx0(partidx);
3868 nrows[v] = part2rowidx0(partidx + 1) - ri0[v];
3870 for (local_ordinal_type j = 0; j < nrows[0]; ++j) {
3871 local_ordinal_type cnt = 1;
3872 for (; cnt < npacks && j != nrows[cnt]; ++cnt)
3875 const local_ordinal_type pri = pri0 + j;
3876 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3877 for (local_ordinal_type i = 0; i < blocksize; ++i)
3878 for (local_ordinal_type v = 0; v < npacks; ++v)
3879 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize * lclrow(ri0[v] + j) + i, col));
3883 KOKKOS_INLINE_FUNCTION
3885 operator()(
const member_type &member)
const {
3886 const local_ordinal_type packidx = member.league_rank();
3887 const local_ordinal_type partidx_begin = packptr(packidx);
3888 const local_ordinal_type npacks = packptr(packidx + 1) - partidx_begin;
3889 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3890 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](
const local_ordinal_type &v) {
3891 const local_ordinal_type partidx = partidx_begin + v;
3892 const local_ordinal_type ri0 = part2rowidx0(partidx);
3893 const local_ordinal_type nrows = part2rowidx0(partidx + 1) - ri0;
3896 const local_ordinal_type pri = pri0;
3897 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
3898 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](
const local_ordinal_type &i) {
3899 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize * lclrow(ri0) + i, col));
3903 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](
const local_ordinal_type &j) {
3904 const local_ordinal_type pri = pri0 + j;
3905 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3906 for (local_ordinal_type i = 0; i < blocksize; ++i)
3907 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3913 void run(
const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3914 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3915 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3917 scalar_multivector = scalar_multivector_;
3918 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3919 const local_ordinal_type vl = vector_length;
3920 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3921 Kokkos::parallel_for(
"MultiVectorConverter::TeamPolicy", policy, *
this);
3923 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3924 Kokkos::parallel_for(
"MultiVectorConverter::RangePolicy", policy, *
this);
3926 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3927 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3936 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3937 typedef KB::Mode::Serial mode_type;
3938 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3939 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3940 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3942 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3944 static int recommended_team_size(
const int ,
3951 #if defined(KOKKOS_ENABLE_CUDA)
3952 static inline int SolveTridiagsRecommendedCudaTeamSize(
const int blksize,
3953 const int vector_length,
3954 const int internal_vector_length) {
3955 const int vector_size = vector_length / internal_vector_length;
3956 int total_team_size(0);
3958 total_team_size = 32;
3959 else if (blksize <= 9)
3960 total_team_size = 32;
3961 else if (blksize <= 12)
3962 total_team_size = 96;
3963 else if (blksize <= 16)
3964 total_team_size = 128;
3965 else if (blksize <= 20)
3966 total_team_size = 160;
3968 total_team_size = 160;
3969 return total_team_size / vector_size;
3973 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3974 typedef KB::Mode::Team mode_type;
3975 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3976 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3977 static int recommended_team_size(
const int blksize,
3978 const int vector_length,
3979 const int internal_vector_length) {
3980 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3984 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3985 typedef KB::Mode::Team mode_type;
3986 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3987 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3988 static int recommended_team_size(
const int blksize,
3989 const int vector_length,
3990 const int internal_vector_length) {
3991 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3996 #if defined(KOKKOS_ENABLE_HIP)
3997 static inline int SolveTridiagsRecommendedHIPTeamSize(
const int blksize,
3998 const int vector_length,
3999 const int internal_vector_length) {
4000 const int vector_size = vector_length / internal_vector_length;
4001 int total_team_size(0);
4003 total_team_size = 32;
4004 else if (blksize <= 9)
4005 total_team_size = 32;
4006 else if (blksize <= 12)
4007 total_team_size = 96;
4008 else if (blksize <= 16)
4009 total_team_size = 128;
4010 else if (blksize <= 20)
4011 total_team_size = 160;
4013 total_team_size = 160;
4014 return total_team_size / vector_size;
4018 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
4019 typedef KB::Mode::Team mode_type;
4020 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4021 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4022 static int recommended_team_size(
const int blksize,
4023 const int vector_length,
4024 const int internal_vector_length) {
4025 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
4029 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
4030 typedef KB::Mode::Team mode_type;
4031 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4032 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4033 static int recommended_team_size(
const int blksize,
4034 const int vector_length,
4035 const int internal_vector_length) {
4036 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
4041 #if defined(KOKKOS_ENABLE_SYCL)
4042 static inline int SolveTridiagsRecommendedSYCLTeamSize(
const int blksize,
4043 const int vector_length,
4044 const int internal_vector_length) {
4045 const int vector_size = vector_length / internal_vector_length;
4046 int total_team_size(0);
4048 total_team_size = 32;
4049 else if (blksize <= 9)
4050 total_team_size = 32;
4051 else if (blksize <= 12)
4052 total_team_size = 96;
4053 else if (blksize <= 16)
4054 total_team_size = 128;
4055 else if (blksize <= 20)
4056 total_team_size = 160;
4058 total_team_size = 160;
4059 return total_team_size / vector_size;
4063 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
4064 typedef KB::Mode::Team mode_type;
4065 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4066 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4067 static int recommended_team_size(
const int blksize,
4068 const int vector_length,
4069 const int internal_vector_length) {
4070 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4074 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
4075 typedef KB::Mode::Team mode_type;
4076 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4077 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4078 static int recommended_team_size(
const int blksize,
4079 const int vector_length,
4080 const int internal_vector_length) {
4081 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4086 template <
typename MatrixType>
4087 struct SolveTridiags {
4089 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4090 using execution_space =
typename impl_type::execution_space;
4092 using local_ordinal_type =
typename impl_type::local_ordinal_type;
4095 using magnitude_type =
typename impl_type::magnitude_type;
4096 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
4097 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
4099 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
4100 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
4101 using size_type_2d_view =
typename impl_type::size_type_2d_view;
4103 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
4104 using internal_vector_type_4d_view =
typename impl_type::internal_vector_type_4d_view;
4105 using internal_vector_type_5d_view =
typename impl_type::internal_vector_type_5d_view;
4106 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
4108 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4110 using internal_vector_type =
typename impl_type::internal_vector_type;
4111 static constexpr
int vector_length = impl_type::vector_length;
4112 static constexpr
int internal_vector_length = impl_type::internal_vector_length;
4115 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
4116 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
4119 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4120 using member_type =
typename team_policy_type::member_type;
4124 local_ordinal_type n_subparts_per_part;
4125 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4126 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4127 const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4128 const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4129 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4130 const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4131 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4132 const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4134 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4135 const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4138 const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4141 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4142 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4143 const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4145 internal_vector_type_4d_view X_internal_vector_values_schur;
4147 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4148 const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4150 const local_ordinal_type vector_loop_size;
4153 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
4154 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4155 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4157 Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4159 const impl_scalar_type df;
4160 const bool compute_diff;
4163 SolveTridiags(
const BlockHelperDetails::PartInterface<MatrixType> &interf,
4164 const BlockTridiags<MatrixType> &btdm,
4165 const vector_type_3d_view &pmv,
4166 const impl_scalar_type damping_factor,
4167 const bool is_norm_manager_active)
4169 n_subparts_per_part(interf.n_subparts_per_part)
4170 , partptr(interf.partptr)
4171 , packptr(interf.packptr)
4172 , packindices_sub(interf.packindices_sub)
4173 , packindices_schur(interf.packindices_schur)
4174 , part2packrowidx0(interf.part2packrowidx0)
4175 , part2packrowidx0_sub(interf.part2packrowidx0_sub)
4176 , lclrow(interf.lclrow)
4177 , packptr_sub(interf.packptr_sub)
4178 , partptr_sub(interf.partptr_sub)
4179 , pack_td_ptr_schur(btdm.pack_td_ptr_schur)
4182 pack_td_ptr(btdm.pack_td_ptr)
4183 , D_internal_vector_values((internal_vector_type *)btdm.values.data(),
4184 btdm.values.extent(0),
4185 btdm.values.extent(1),
4186 btdm.values.extent(2),
4187 vector_length / internal_vector_length)
4188 , X_internal_vector_values((internal_vector_type *)pmv.data(),
4192 vector_length / internal_vector_length)
4193 , X_internal_scalar_values((btdm_scalar_type *)pmv.data(),
4199 2 * (n_subparts_per_part - 1) * part2packrowidx0_sub.extent(0),
4202 vector_length / internal_vector_length)
4203 , D_internal_vector_values_schur((internal_vector_type *)btdm.values_schur.data(),
4204 btdm.values_schur.extent(0),
4205 btdm.values_schur.extent(1),
4206 btdm.values_schur.extent(2),
4207 vector_length / internal_vector_length)
4208 , e_internal_vector_values((internal_vector_type *)btdm.e_values.data(),
4209 btdm.e_values.extent(0),
4210 btdm.e_values.extent(1),
4211 btdm.e_values.extent(2),
4212 btdm.e_values.extent(3),
4213 vector_length / internal_vector_length)
4214 , vector_loop_size(vector_length / internal_vector_length)
4215 , Y_scalar_multivector()
4217 , df(damping_factor)
4218 , compute_diff(is_norm_manager_active) {}
4222 KOKKOS_INLINE_FUNCTION
4224 copyToFlatMultiVector(
const member_type &member,
4225 const local_ordinal_type partidxbeg,
4226 const local_ordinal_type npacks,
4227 const local_ordinal_type pri0,
4228 const local_ordinal_type v,
4229 const local_ordinal_type blocksize,
4230 const local_ordinal_type num_vectors)
const {
4231 const local_ordinal_type vbeg = v * internal_vector_length;
4232 if (vbeg < npacks) {
4233 local_ordinal_type ri0_vals[internal_vector_length] = {};
4234 local_ordinal_type nrows_vals[internal_vector_length] = {};
4235 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4236 const local_ordinal_type partidx = partidxbeg + vv;
4237 ri0_vals[vi] = partptr(partidx);
4238 nrows_vals[vi] = partptr(partidx + 1) - ri0_vals[vi];
4241 impl_scalar_type z_partial_sum(0);
4242 if (nrows_vals[0] == 1) {
4243 const local_ordinal_type j = 0, pri = pri0;
4245 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4246 const local_ordinal_type ri0 = ri0_vals[vi];
4247 const local_ordinal_type nrows = nrows_vals[vi];
4249 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
4250 [&](
const local_ordinal_type &i) {
4251 const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4252 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4253 impl_scalar_type &y = Y_scalar_multivector(row, col);
4254 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4258 #if KOKKOS_VERSION >= 40799
4259 const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4261 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4263 z_partial_sum += yd_abs * yd_abs;
4271 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows_vals[0]),
4272 [&](
const local_ordinal_type &j) {
4273 const local_ordinal_type pri = pri0 + j;
4274 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4275 const local_ordinal_type ri0 = ri0_vals[vi];
4276 const local_ordinal_type nrows = nrows_vals[vi];
4278 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4279 for (local_ordinal_type i = 0; i < blocksize; ++i) {
4280 const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4281 impl_scalar_type &y = Y_scalar_multivector(row, col);
4282 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4286 #if KOKKOS_VERSION >= 40799
4287 const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4289 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4291 z_partial_sum += yd_abs * yd_abs;
4300 Z_scalar_vector(member.league_rank()) += z_partial_sum;
4307 template <
typename WWViewType>
4308 KOKKOS_INLINE_FUNCTION
void
4309 solveSingleVector(
const member_type &member,
4310 const local_ordinal_type &blocksize,
4311 const local_ordinal_type &i0,
4312 const local_ordinal_type &r0,
4313 const local_ordinal_type &nrows,
4314 const local_ordinal_type &v,
4315 const WWViewType &WW)
const {
4316 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4318 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4319 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4322 auto A = D_internal_vector_values.data();
4323 auto X = X_internal_vector_values.data();
4326 #if KOKKOS_VERSION >= 40799
4327 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4329 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4331 #if KOKKOS_VERSION >= 40799
4332 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4334 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4339 const local_ordinal_type astep = D_internal_vector_values.stride(0);
4340 const local_ordinal_type as0 = D_internal_vector_values.stride(1);
4341 const local_ordinal_type as1 = D_internal_vector_values.stride(2);
4342 const local_ordinal_type xstep = X_internal_vector_values.stride(0);
4343 const local_ordinal_type xs0 = X_internal_vector_values.stride(1);
4346 A += i0 * astep + v;
4347 X += r0 * xstep + v;
4352 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4355 blocksize, blocksize,
4360 for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
4361 member.team_barrier();
4362 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4364 blocksize, blocksize,
4366 A + 2 * astep, as0, as1,
4369 X + 1 * xstep, xs0);
4370 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4373 blocksize, blocksize,
4375 A + 3 * astep, as0, as1,
4376 X + 1 * xstep, xs0);
4383 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4386 blocksize, blocksize,
4391 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4393 member.team_barrier();
4394 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4396 blocksize, blocksize,
4398 A + 1 * astep, as0, as1,
4401 X - 1 * xstep, xs0);
4402 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4405 blocksize, blocksize,
4408 X - 1 * xstep, xs0);
4414 const local_ordinal_type ws0 = WW.stride(0);
4415 auto W = WW.data() + v;
4416 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
4417 member, blocksize, X, xs0, W, ws0);
4418 member.team_barrier();
4419 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4421 blocksize, blocksize,
4430 template <
typename WWViewType>
4431 KOKKOS_INLINE_FUNCTION
void
4432 solveMultiVector(
const member_type &member,
4433 const local_ordinal_type & ,
4434 const local_ordinal_type &i0,
4435 const local_ordinal_type &r0,
4436 const local_ordinal_type &nrows,
4437 const local_ordinal_type &v,
4438 const WWViewType &WW)
const {
4439 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4441 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4442 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4445 #if KOKKOS_VERSION >= 40799
4446 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4448 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4450 #if KOKKOS_VERSION >= 40799
4451 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4453 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4457 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4458 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4461 local_ordinal_type i = i0, r = r0;
4465 KB::Trsm<member_type,
4466 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4467 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4468 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
4469 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
4470 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
4471 member.team_barrier();
4472 KB::Gemm<member_type,
4473 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4474 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4475 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
4476 KB::Trsm<member_type,
4477 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4478 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4479 X1.assign_data(X2.data());
4483 KB::Trsm<member_type,
4484 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4485 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4486 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4488 A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
4489 X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
4490 member.team_barrier();
4491 KB::Gemm<member_type,
4492 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4493 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4495 A.assign_data(&D_internal_vector_values(i, 0, 0, v));
4496 KB::Trsm<member_type,
4497 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4498 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4499 X1.assign_data(X2.data());
4503 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4504 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
4505 member.team_barrier();
4506 KB::Gemm<member_type,
4507 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4508 default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
4513 struct SingleVectorTag {};
4515 struct MultiVectorTag {};
4518 struct SingleVectorSubLineTag {};
4520 struct MultiVectorSubLineTag {};
4522 struct SingleVectorApplyCTag {};
4524 struct MultiVectorApplyCTag {};
4526 struct SingleVectorSchurTag {};
4528 struct MultiVectorSchurTag {};
4530 struct SingleVectorApplyETag {};
4532 struct MultiVectorApplyETag {};
4534 struct SingleVectorCopyToFlatTag {};
4536 struct SingleZeroingTag {};
4539 KOKKOS_INLINE_FUNCTION
void
4540 operator()(
const SingleVectorTag<B> &,
const member_type &member)
const {
4541 const local_ordinal_type packidx = member.league_rank();
4542 const local_ordinal_type partidx = packptr(packidx);
4543 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4544 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4545 const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4546 const local_ordinal_type r0 = part2packrowidx0(partidx);
4547 const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4548 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4549 const local_ordinal_type num_vectors = 1;
4550 internal_vector_scratch_type_3d_view
4551 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4552 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4553 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4555 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4556 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4557 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4562 KOKKOS_INLINE_FUNCTION
void
4563 operator()(
const MultiVectorTag<B> &,
const member_type &member)
const {
4564 const local_ordinal_type packidx = member.league_rank();
4565 const local_ordinal_type partidx = packptr(packidx);
4566 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4567 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4568 const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4569 const local_ordinal_type r0 = part2packrowidx0(partidx);
4570 const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4571 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4572 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4574 internal_vector_scratch_type_3d_view
4575 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4576 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4577 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4579 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4580 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4581 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4586 KOKKOS_INLINE_FUNCTION
void
4587 operator()(
const SingleVectorSubLineTag<B> &,
const member_type &member)
const {
4589 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4591 const local_ordinal_type subpartidx = packptr_sub(packidx);
4592 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4593 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4594 const local_ordinal_type partidx = subpartidx % n_parts;
4596 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
4597 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4598 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4599 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4600 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4606 internal_vector_scratch_type_3d_view
4607 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4609 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4610 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4615 KOKKOS_INLINE_FUNCTION
void
4616 operator()(
const SingleVectorApplyCTag<B> &,
const member_type &member)
const {
4619 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4621 const local_ordinal_type subpartidx = packptr_sub(packidx);
4622 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4623 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4624 const local_ordinal_type partidx = subpartidx % n_parts;
4625 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4628 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4629 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4630 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4632 internal_vector_scratch_type_3d_view
4633 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4637 const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
4638 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
4639 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0 + 2 : i0 + 2;
4644 #if KOKKOS_VERSION >= 40799
4645 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4647 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4650 const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx) - 2 : 0;
4651 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx + 1) + 1;
4653 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4655 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4656 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4658 if (local_subpartidx == 0) {
4659 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4660 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), 0, v);
4661 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4662 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4664 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4666 blocksize, blocksize,
4668 C.data(), C.stride(0), C.stride(1),
4669 v_1.data(), v_1.stride(0),
4671 v_2.data(), v_2.stride(0));
4673 }
else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4674 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4675 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4676 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4677 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4679 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4681 blocksize, blocksize,
4683 C.data(), C.stride(0), C.stride(1),
4684 v_1.data(), v_1.stride(0),
4686 v_2.data(), v_2.stride(0));
4689 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4691 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), 0, v);
4692 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4693 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4695 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4697 blocksize, blocksize,
4699 C.data(), C.stride(0), C.stride(1),
4700 v_1.data(), v_1.stride(0),
4702 v_2.data(), v_2.stride(0));
4705 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4706 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4707 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4709 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4711 blocksize, blocksize,
4713 C.data(), C.stride(0), C.stride(1),
4714 v_1.data(), v_1.stride(0),
4716 v_2.data(), v_2.stride(0));
4723 KOKKOS_INLINE_FUNCTION
void
4724 operator()(
const SingleVectorSchurTag<B> &,
const member_type &member)
const {
4725 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4727 const local_ordinal_type partidx = packptr_sub(packidx);
4729 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4731 const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx, 0);
4732 const local_ordinal_type nrows = 2 * (n_subparts_per_part - 1);
4734 const local_ordinal_type r0_schur = nrows * member.league_rank();
4736 internal_vector_scratch_type_3d_view
4737 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4739 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4740 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4741 for (local_ordinal_type i = 0; i < 2; ++i) {
4742 copy3DView<local_ordinal_type>(member,
4743 Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4744 Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4748 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4749 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4752 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4753 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4754 for (local_ordinal_type i = 0; i < 2; ++i) {
4755 copy3DView<local_ordinal_type>(member,
4756 Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4757 Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4763 KOKKOS_INLINE_FUNCTION
void
4764 operator()(
const SingleVectorApplyETag<B> &,
const member_type &member)
const {
4765 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4767 const local_ordinal_type subpartidx = packptr_sub(packidx);
4768 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4769 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4770 const local_ordinal_type partidx = subpartidx % n_parts;
4771 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4773 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4774 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4776 internal_vector_scratch_type_3d_view
4777 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4781 #if KOKKOS_VERSION >= 40799
4782 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4784 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4787 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4789 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4790 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4792 if (local_subpartidx == 0) {
4793 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4794 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4796 for (local_ordinal_type row = 0; row < nrows; ++row) {
4797 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4798 auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4800 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4802 blocksize, blocksize,
4804 E.data(), E.stride(0), E.stride(1),
4805 v_2.data(), v_2.stride(0),
4807 v_1.data(), v_1.stride(0));
4810 }
else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4811 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4812 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4814 for (local_ordinal_type row = 0; row < nrows; ++row) {
4815 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4816 auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4818 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4820 blocksize, blocksize,
4822 E.data(), E.stride(0), E.stride(1),
4823 v_2.data(), v_2.stride(0),
4825 v_1.data(), v_1.stride(0));
4829 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4831 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4833 for (local_ordinal_type row = 0; row < nrows; ++row) {
4834 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4835 auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4837 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4839 blocksize, blocksize,
4841 E.data(), E.stride(0), E.stride(1),
4842 v_2.data(), v_2.stride(0),
4844 v_1.data(), v_1.stride(0));
4848 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4850 for (local_ordinal_type row = 0; row < nrows; ++row) {
4851 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4852 auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4854 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4856 blocksize, blocksize,
4858 E.data(), E.stride(0), E.stride(1),
4859 v_2.data(), v_2.stride(0),
4861 v_1.data(), v_1.stride(0));
4869 KOKKOS_INLINE_FUNCTION
void
4870 operator()(
const SingleVectorCopyToFlatTag<B> &,
const member_type &member)
const {
4871 const local_ordinal_type packidx = member.league_rank();
4872 const local_ordinal_type partidx = packptr(packidx);
4873 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4874 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4875 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4876 const local_ordinal_type num_vectors = 1;
4878 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4879 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4884 KOKKOS_INLINE_FUNCTION
void
4885 operator()(
const SingleZeroingTag<B> &,
const member_type &member)
const {
4886 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4887 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4891 void run(
const impl_scalar_type_2d_view_tpetra &Y,
4892 const impl_scalar_type_1d_view &Z) {
4893 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4894 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::SolveTridiags", SolveTridiags);
4897 this->Y_scalar_multivector = Y;
4898 this->Z_scalar_vector = Z;
4900 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4901 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4903 const local_ordinal_type team_size =
4904 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4905 recommended_team_size(blocksize, vector_length, internal_vector_length);
4906 const int per_team_scratch = internal_vector_scratch_type_3d_view ::shmem_size(blocksize, num_vectors, vector_loop_size);
4908 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4909 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4910 if (num_vectors == 1) { \
4911 const Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4912 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4913 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4914 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)), *this); \
4916 const Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4917 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4918 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4919 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)), *this); \
4923 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4924 if (num_vectors == 1) { \
4925 if (packindices_schur.extent(1) <= 0) { \
4926 Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4927 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4928 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4929 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4933 Kokkos::TeamPolicy<execution_space, SingleZeroingTag<B>> \
4934 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4935 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4939 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4940 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4941 Kokkos::TeamPolicy<execution_space, SingleVectorSubLineTag<B>> \
4942 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4943 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4944 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4946 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4947 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4950 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4951 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4952 Kokkos::TeamPolicy<execution_space, SingleVectorApplyCTag<B>> \
4953 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4954 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4955 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4957 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4958 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4961 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4962 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4963 Kokkos::TeamPolicy<execution_space, SingleVectorSchurTag<B>> \
4964 policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4965 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4966 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4968 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4969 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4972 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4973 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4974 Kokkos::TeamPolicy<execution_space, SingleVectorApplyETag<B>> \
4975 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4976 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4977 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4979 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4980 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4983 Kokkos::TeamPolicy<execution_space, SingleVectorCopyToFlatTag<B>> \
4984 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4985 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4990 Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4991 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4992 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4993 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4998 switch (blocksize) {
4999 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(3);
5000 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(5);
5001 case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(6);
5002 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(7);
5003 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
5004 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
5005 case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
5006 case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
5007 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
5008 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
5009 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
5010 case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
5011 default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(0);
5013 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
5015 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
5016 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
5023 template <
typename MatrixType>
5025 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
5026 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
5027 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5028 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5029 const bool overlap_communication_and_computation,
5031 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
5032 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
5033 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z,
5034 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
5036 const BlockHelperDetails::PartInterface<MatrixType> &interf,
5039 typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work,
5044 const int max_num_sweeps,
5045 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5046 const int check_tol_every) {
5047 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
5050 using node_memory_space =
typename impl_type::node_memory_space;
5051 using local_ordinal_type =
typename impl_type::local_ordinal_type;
5052 using size_type =
typename impl_type::size_type;
5053 using impl_scalar_type =
typename impl_type::impl_scalar_type;
5054 using magnitude_type =
typename impl_type::magnitude_type;
5055 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
5056 using vector_type_1d_view =
typename impl_type::vector_type_1d_view;
5057 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
5058 using tpetra_multivector_type =
typename impl_type::tpetra_multivector_type;
5060 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
5064 "Neither Tpetra importer nor Async importer is null.");
5067 "Maximum number of sweeps must be >= 1.");
5070 const bool is_seq_method_requested = !tpetra_importer.is_null();
5071 const bool is_async_importer_active = !async_importer.is_null();
5072 #if KOKKOS_VERSION >= 40799
5073 const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
5075 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5077 const magnitude_type tolerance = tol * tol;
5078 const local_ordinal_type blocksize = btdm.values.extent(1);
5079 const local_ordinal_type num_vectors = Y.getNumVectors();
5080 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
5082 const impl_scalar_type zero(0.0);
5084 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
5085 "The seq method for applyInverseJacobi, "
5086 <<
"which in any case is for developer use only, "
5087 <<
"does not support norm-based termination.");
5088 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
5089 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
5091 std::invalid_argument,
5092 "The seq method for applyInverseJacobi, "
5093 <<
"which in any case is for developer use only, "
5094 <<
"only supports memory spaces accessible from host.");
5097 const size_type work_span_required = num_blockrows * num_vectors * blocksize;
5098 if (work.span() < work_span_required)
5099 work = vector_type_1d_view(
"vector workspace 1d view", work_span_required);
5102 const local_ordinal_type W_size = interf.packptr.extent(0) - 1;
5103 if (local_ordinal_type(W.extent(0)) < W_size)
5104 W = impl_scalar_type_1d_view(
"W", W_size);
5106 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5108 if (is_seq_method_requested) {
5109 if (Z.getNumVectors() != Y.getNumVectors())
5110 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors,
false);
5112 if (is_async_importer_active) {
5114 async_importer->createDataBuffer(num_vectors);
5115 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5121 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
5122 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5123 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5124 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
5125 if (is_y_zero) Kokkos::deep_copy(YY, zero);
5128 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
5129 damping_factor, is_norm_manager_active);
5131 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
5133 auto A_crs = Teuchos::rcp_dynamic_cast<
const typename impl_type::tpetra_crs_matrix_type>(A);
5134 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const typename impl_type::tpetra_block_crs_matrix_type>(A);
5136 bool hasBlockCrsMatrix = !A_bcrs.is_null();
5139 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
5141 BlockHelperDetails::ComputeResidualVector<MatrixType>
5142 compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5143 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5147 if (is_norm_manager_active)
5148 norm_manager.setCheckFrequency(check_tol_every);
5152 for (; sweep < max_num_sweeps; ++sweep) {
5156 multivector_converter.run(XX);
5158 if (is_seq_method_requested) {
5162 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5163 compute_residual_vector.run(YY, XX, ZZ);
5166 multivector_converter.run(YY);
5170 if (overlap_communication_and_computation || !is_async_importer_active) {
5171 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5173 compute_residual_vector.run(pmv, XX, YY, remote_multivector,
true);
5174 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5175 if (is_async_importer_active) async_importer->cancel();
5178 if (is_async_importer_active) {
5179 async_importer->syncRecv();
5181 compute_residual_vector.run(pmv, XX, YY, remote_multivector,
false);
5184 if (is_async_importer_active)
5185 async_importer->syncExchange(YY);
5186 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance))
break;
5188 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5196 solve_tridiags.run(YY, W);
5199 if (is_norm_manager_active) {
5201 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5202 if (sweep + 1 == max_num_sweeps) {
5203 norm_manager.ireduce(sweep,
true);
5204 norm_manager.checkDone(sweep + 1, tolerance,
true);
5206 norm_manager.ireduce(sweep);
5214 if (is_norm_manager_active) norm_manager.finalize();
5221 template <
typename MatrixType,
int B>
5222 int applyFusedBlockJacobi_Impl(
5223 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5224 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5225 const bool overlap_communication_and_computation,
5227 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
5228 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
5229 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
5231 const BlockHelperDetails::PartInterface<MatrixType> &interf,
5232 const BlockTridiags<MatrixType> &btdm,
5234 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work,
5239 const int max_num_sweeps,
5240 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5241 const int check_tol_every) {
5243 using local_ordinal_type =
typename impl_type::local_ordinal_type;
5244 using size_type =
typename impl_type::size_type;
5245 using magnitude_type =
typename impl_type::magnitude_type;
5246 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
5247 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
5251 "Neither Tpetra importer nor Async importer is null.");
5254 "Maximum number of sweeps must be >= 1.");
5257 const bool is_async_importer_active = !async_importer.is_null();
5258 #if KOKKOS_VERSION >= 40799
5259 const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
5261 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5263 const magnitude_type tolerance = tol * tol;
5264 const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5265 const local_ordinal_type num_vectors = Y.getNumVectors();
5266 const local_ordinal_type num_blockrows = interf.nparts;
5268 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5270 if (is_async_importer_active) {
5272 async_importer->createDataBuffer(num_vectors);
5273 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5277 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5278 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5280 const bool two_pass_residual =
5281 overlap_communication_and_computation && is_async_importer_active;
5286 size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5287 "Local LHS vector (YY) has total size " << YY.extent(0) <<
"x" << YY.extent(1) <<
" = " << YY.extent(0) * YY.extent(1) <<
",\n"
5288 <<
"but expected " << num_blockrows <<
"x" << blocksize <<
"x" << num_vectors <<
" = " << size_t(num_blockrows) * blocksize * num_vectors <<
'\n');
5289 size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5290 if (work.extent(0) < work_required) {
5294 Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5297 if (W.extent(0) != size_t(num_blockrows))
5301 BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
5302 functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
5303 BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
5304 functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5305 BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
5306 functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5309 if (is_norm_manager_active)
5310 norm_manager.setCheckFrequency(check_tol_every);
5315 Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5320 for (; sweep < max_num_sweeps; ++sweep) {
5323 functor_solve_only.run(XX, y_buffers[1 - current_y]);
5326 if (overlap_communication_and_computation || !is_async_importer_active) {
5327 if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5328 if (two_pass_residual) {
5331 functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1 - current_y]);
5335 functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5337 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5338 if (is_async_importer_active) async_importer->cancel();
5341 if (is_async_importer_active) {
5342 async_importer->syncRecv();
5344 functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5347 if (is_async_importer_active)
5348 async_importer->syncExchange(y_buffers[current_y]);
5349 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance))
break;
5351 functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5356 if (is_norm_manager_active) {
5357 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5358 if (sweep + 1 == max_num_sweeps) {
5359 norm_manager.ireduce(sweep,
true);
5360 norm_manager.checkDone(sweep + 1, tolerance,
true);
5362 norm_manager.ireduce(sweep);
5367 current_y = 1 - current_y;
5369 if (current_y == 1) {
5371 Kokkos::deep_copy(YY, y_doublebuf);
5375 if (is_norm_manager_active) norm_manager.finalize();
5382 template <
typename MatrixType>
5384 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5385 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5386 const bool overlap_communication_and_computation,
5388 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
5389 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
5390 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
5392 const BlockHelperDetails::PartInterface<MatrixType> &interf,
5395 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work,
5400 const int max_num_sweeps,
5401 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5402 const int check_tol_every) {
5403 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5404 int blocksize = btdm.d_inv.extent(1);
5406 #define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) \
5408 sweep = applyFusedBlockJacobi_Impl<MatrixType, B>( \
5409 tpetra_importer, async_importer, overlap_communication_and_computation, \
5410 X, Y, W, interf, btdm, amd, work, \
5411 norm_manager, damping_factor, is_y_zero, \
5412 max_num_sweeps, tol, check_tol_every); \
5415 switch (blocksize) {
5416 case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(3);
5417 case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(5);
5418 case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(7);
5419 case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(9);
5420 case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
5421 case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
5422 case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
5423 case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
5424 case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
5425 default: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(0);
5427 #undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI
5432 template <
typename MatrixType>
5435 using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5436 using block_tridiags_type = BlockTridiags<MatrixType>;
5439 using async_import_type = AsyncableImport<MatrixType>;
5446 bool overlap_communication_and_computation;
5449 mutable typename impl_type::tpetra_multivector_type Z;
5450 mutable typename impl_type::impl_scalar_type_1d_view W;
5453 part_interface_type part_interface;
5454 block_tridiags_type block_tridiags;
5458 bool use_fused_jacobi;
5461 mutable typename impl_type::vector_type_1d_view work;
5463 mutable typename impl_type::impl_scalar_type_1d_view work_flat;
5464 mutable norm_manager_type norm_manager;
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:143
int applyFusedBlockJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType >> &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:5383
void performNumericPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tiny, bool use_fused_jacobi)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3752
void performSymbolicPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &g, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, BlockHelperDetails::AmD< MatrixType > &amd, const bool overlap_communication_and_computation, const Teuchos::RCP< AsyncableImport< MatrixType >> &async_importer, bool useSeqMethod, bool use_fused_jacobi)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1933
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
size_t size_type
Definition: Ifpack2_BlockHelper.hpp:274
Kokkos::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockHelper.hpp:286
BlockHelperDetails::PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::Array< Teuchos::Array< typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type >> &partitions, const typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type n_subparts_per_part_in)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1102
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:895
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
BlockTridiags< MatrixType > createBlockTridiags(const BlockHelperDetails::PartInterface< MatrixType > &interf)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1681
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockHelper.hpp:358
Definition: Ifpack2_BlockHelper.hpp:389
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:99
void send(const Packet sendBuffer[], const Ordinal count, const int destRank, const int tag, const Comm< Ordinal > &comm)
Definition: Ifpack2_BlockHelper.hpp:211
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:2286
Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:171
RCP< CommRequest< Ordinal > > isend(const ArrayRCP< const Packet > &sendBuffer, const int destRank, const int tag, const Comm< Ordinal > &comm)
#define TEUCHOS_ASSERT(assertion_test)
Definition: Ifpack2_BlockHelper.hpp:236
Definition: Ifpack2_BlockHelper.hpp:270
int applyInverseJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType >> &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Z, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::vector_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:5024
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1613
Definition: Ifpack2_BlockComputeResidualVector.hpp:23
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3802