#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#include <Kokkos_ArithTraits.hpp>
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_SetIdentity_Decl.hpp>
#include <KokkosBatched_SetIdentity_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif
#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3

#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI

#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE

#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
#endif

#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
namespace BlockTriDiContainerDetails {

namespace KB = KokkosBatched;
template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
                                          MemoryTraitsType::is_random_access |
                                          flag>;
template <typename ViewType>
using Unmanaged = Kokkos::View<typename ViewType::data_type,
                               typename ViewType::array_layout,
                               typename ViewType::device_type,
                               MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
template <typename ViewType>
using Atomic = Kokkos::View<typename ViewType::data_type,
                            typename ViewType::array_layout,
                            typename ViewType::device_type,
                            MemoryTraits<typename ViewType::memory_traits, Kokkos::Atomic> >;
template <typename ViewType>
using Const = Kokkos::View<typename ViewType::const_data_type,
                           typename ViewType::array_layout,
                           typename ViewType::device_type,
                           typename ViewType::memory_traits>;
template <typename ViewType>
using ConstUnmanaged = Const<Unmanaged<ViewType> >;

template <typename ViewType>
using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
template <typename ViewType>
using Scratch = Kokkos::View<typename ViewType::data_type,
                             typename ViewType::array_layout,
                             typename ViewType::execution_space::scratch_memory_space,
                             MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;

#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
  { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
#else
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
#endif
template<typename MatrixType>
Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();

  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap()   , blocksize)));

  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::rcp(new tpetra_import_type(src, tgt));
}
template<typename MatrixType>
struct AsyncableImport {
public:
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
#if !defined(HAVE_IFPACK2_MPI)
  typedef int MPI_Request;
  typedef int MPI_Comm;
#endif

  using scalar_type = typename impl_type::scalar_type;
  static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }
  static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }
  static int waitany(int count, MPI_Request* reqs, int* index) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
    return 0;
#endif
  }

  static int waitall(int count, MPI_Request* reqs) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
#else
    return 0;
#endif
  }
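  // These thin wrappers keep the raw MPI calls in one place. When the caller
  // passes ireq == NULL, a temporary request is used and freed immediately
  // (fire-and-forget); otherwise the request is handed back so it can be
  // completed later with waitany()/waitall().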
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;

  using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;

  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
#if defined(KOKKOS_ENABLE_CUDA)
  using impl_scalar_type_1d_view =
    typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
#  if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
                              Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
#  elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
                              Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
#  else // no experimental macros are defined
                              typename impl_type::impl_scalar_type_1d_view,
#  endif
                              typename impl_type::impl_scalar_type_1d_view>::type;
#else
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
#endif
  using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type*,Kokkos::HostSpace>;
  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
#ifdef HAVE_IFPACK2_MPI
  MPI_Comm comm;
#endif

  impl_scalar_type_2d_view_tpetra remote_multivector;
  local_ordinal_type blocksize;
  template<typename T>
  struct SendRecvPair {
    T send;
    T recv;
  };

  SendRecvPair<int_1d_view_host> pids;                     // send/recv process ranks
  SendRecvPair<std::vector<MPI_Request> > reqs;            // outstanding requests
  SendRecvPair<size_type_1d_view> offset;                  // offsets into lids and buffer
  SendRecvPair<size_type_1d_view_host> offset_host;        // host copies of the offsets
  SendRecvPair<local_ordinal_type_1d_view> lids;           // local ids to pack/unpack
  SendRecvPair<impl_scalar_type_1d_view> buffer;           // contiguous data buffers
  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host; // host staging buffers

  local_ordinal_type_1d_view dm2cm;                        // owned-DOF permutation between domain- and column-map orderings
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  using exec_instance_1d_std_vector = std::vector<execution_space>;
  exec_instance_1d_std_vector exec_instances;
#endif
  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                       const size_type_1d_view &offs) {
    // wrap lens into an unmanaged host view and copy it to the device
    Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
    const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);

    // exclusive prefix sum of the lengths
    const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
    const local_ordinal_type lens_size = lens_device.extent(0);
    Kokkos::parallel_scan
      ("AsyncableImport::RangePolicy::setOffsetValues",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
        if (final)
          offs(i) = update;
        update += (i < lens_size ? lens_device[i] : 0);
      });
  }
  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
                           const size_type_1d_view_host &offs) {
    Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
    const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);

    // exclusive prefix sum of the lengths, on host
    offs(0) = 0;
    for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
      offs(i) = offs(i-1) + lens[i-1];
    }
  }
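  // Illustrative example of the offset layout: for message lengths
  // lens = {3, 1, 2}, the exclusive prefix sum gives offs = {0, 3, 4, 6}, so
  // message i occupies [offs(i), offs(i+1)) in the local-id list and, scaled
  // by blocksize*num_vectors, in the data buffer.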
  void createMpiRequests(const tpetra_import_type &import) {
    Tpetra::Distributor &distributor = import.getDistributor();

    const auto pids_from = distributor.getProcsFrom();
    memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));

    const auto pids_to = distributor.getProcsTo();
    memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));

    reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
    reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
    // construct offsets from the distributor's message lengths; the first
    // (device) variant is kept for reference, the host variant below is used
#if 0
    const auto lengths_to = distributor.getLengthsTo();
    const auto lengths_from = distributor.getLengthsFrom();

    setOffsetValues(lengths_to, offset.send);
    offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);

    setOffsetValues(lengths_from, offset.recv);
    offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
#else
    const auto lengths_to = distributor.getLengthsTo();
    offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValuesHost(lengths_to, offset_host.send);
    setOffsetValuesHost(lengths_from, offset_host.recv);
#endif
  }
  void createSendRecvIDs(const tpetra_import_type &import) {
    // for recv, the local ids come directly from the remote lid list
    const auto remote_lids = import.getRemoteLIDs();
    const local_ordinal_type_1d_view_host
      remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
    Kokkos::deep_copy(lids.recv, remote_lids_view_host);

    // for send, group the export local ids by destination rank, in the order of pids.send
    auto epids = import.getExportPIDs();
    auto elids = import.getExportLIDs();
    auto lids_send_host = Kokkos::create_mirror_view(lids.send);

    for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
      const auto pid_send_value = pids.send[i];
      for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
        if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
      TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
    }
    Kokkos::deep_copy(lids.send, lids_send_host);
  }
  void createExecutionSpaceInstances() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
    exec_instances =
      Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
#endif
  }
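  // partition_space with eight unit weights yields eight execution space
  // instances (streams); the exec-space-instance communication variants below
  // index them round-robin via exec_instances[i%8] so that packing and
  // host/device copies for different messages can overlap.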
  // tags for the direction of copy(): multivector -> buffer, or buffer -> multivector
  struct ToBuffer {};
  struct ToMultiVector {};
  AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
                  const Teuchos::RCP<const tpetra_map_type> &tgt_map,
                  const local_ordinal_type blocksize_,
                  const local_ordinal_type_1d_view dm2cm_) {
    blocksize = blocksize_;
    dm2cm = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
    comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
#endif
    const tpetra_import_type import(src_map, tgt_map);

    createMpiRequests(import);
    createSendRecvIDs(import);
    createExecutionSpaceInstances();
  }
  void createDataBuffer(const local_ordinal_type &num_vectors) {
    const size_type extent_0 = lids.recv.extent(0)*blocksize;
    const size_type extent_1 = num_vectors;
    if (remote_multivector.extent(0) == extent_0 &&
        remote_multivector.extent(1) == extent_1) {
      // buffers are already sized for this number of vectors; nothing to do
      return;
    }

    const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
    const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;

    if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
      buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
      buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
    }
  }
#ifdef HAVE_IFPACK2_MPI
    waitall(reqs.recv.size(), reqs.recv.data());
    waitall(reqs.send.size(), reqs.send.data());
#endif
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  template<typename PackTag>
  void copy(const local_ordinal_type_1d_view &lids_,
            const impl_scalar_type_1d_view &buffer_,
            const local_ordinal_type ibeg_,
            const local_ordinal_type iend_,
            const impl_scalar_type_2d_view_tpetra &multivector_,
            const local_ordinal_type blocksize_,
            const execution_space &exec_instance_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize*ibeg_;

    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    local_ordinal_type vector_size(0);
    if      (blocksize_ <= 4)  vector_size = 4;
    else if (blocksize_ <= 8)  vector_size = 8;
    else if (blocksize_ <= 16) vector_size = 16;
    else                       vector_size = 32;
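    // vector_size rounds the block size up to the next power of two, capped at
    // 32; it is passed as the TeamPolicy vector length, so the
    // ThreadVectorRange loop over one block row maps onto vector lanes.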
    const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
    const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
    Kokkos::parallel_for
      (Kokkos::Experimental::require(policy, work_item_property),
       KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
        const local_ordinal_type i = member.league_rank();
        Kokkos::parallel_for
          (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
            auto aptr = abase + blocksize_*(i + idiff*j);
            auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
            if (std::is_same<PackTag,ToBuffer>::value)
              Kokkos::parallel_for
                (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                  aptr[k] = bptr[k];
                });
            else
              Kokkos::parallel_for
                (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                  bptr[k] = aptr[k];
                });
          });
      });
  }
  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize*num_vectors;

    // 0. post the receives
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
      else {
        irecv(comm,
              reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
    }

    execution_space().fence();
    // 1. pack the send buffer on the per-message execution space instances
    for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
      if (i<8) exec_instances[i%8].fence();
      copy<ToBuffer>(lids.send, buffer.send,
                     offset_host.send(i), offset_host.send(i+1),
                     mv, blocksize,
                     exec_instances[i%8]);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = mv.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;

        Kokkos::deep_copy(exec_instances[i%8],
                          Kokkos::subview(buffer_host.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                            offset_host.send(i)*mv_blocksize,
                                            offset_host.send(i+1)*mv_blocksize)),
                          Kokkos::subview(buffer.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                            offset_host.send(i)*mv_blocksize,
                                            offset_host.send(i+1)*mv_blocksize)));
      }
    }
    // 2. post the sends once the corresponding instance has finished packing
    for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
      if (i<8) exec_instances[i%8].fence();
      if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
      else {
        isend(comm,
              reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
    }

    // 3. probe the incoming messages
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
  void syncRecvVar1() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // 0. wait on the receives and unpack each message as it completes
    for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
      local_ordinal_type idx = i;

      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);

      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;

        Kokkos::deep_copy(
          Kokkos::subview(buffer.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)),
          Kokkos::subview(buffer_host.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)));
      }

      // unpack the message into the remote multivector
      copy<ToMultiVector>(lids.recv, buffer.recv,
                          offset_host.recv(idx), offset_host.recv(idx+1),
                          remote_multivector, blocksize,
                          exec_instances[idx%8]);
    }

    // 1. wait until all sends are complete
    waitall(reqs.send.size(), reqs.send.data());
#endif // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
#endif // defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
  template<typename PackTag>
  void copy(const local_ordinal_type_1d_view &lids_,
            const impl_scalar_type_1d_view &buffer_,
            const local_ordinal_type &ibeg_,
            const local_ordinal_type &iend_,
            const impl_scalar_type_2d_view_tpetra &multivector_,
            const local_ordinal_type blocksize_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize*ibeg_;
    if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      local_ordinal_type vector_size(0);
      if      (blocksize_ <= 4)  vector_size = 4;
      else if (blocksize_ <= 8)  vector_size = 8;
      else if (blocksize_ <= 16) vector_size = 16;
      else                       vector_size = 32;
      const team_policy_type policy(idiff, 1, vector_size);
      Kokkos::parallel_for
        ("AsyncableImport::TeamPolicy::copy",
         policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
          const local_ordinal_type i = member.league_rank();
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
              auto aptr = abase + blocksize_*(i + idiff*j);
              auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
              if (std::is_same<PackTag,ToBuffer>::value)
                Kokkos::parallel_for
                  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                    aptr[k] = bptr[k];
                  });
              else
                Kokkos::parallel_for
                  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                    bptr[k] = aptr[k];
                  });
            });
        });
    } else {
      const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
      Kokkos::parallel_for
        ("AsyncableImport::RangePolicy::copy",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
          const local_ordinal_type i = ij%idiff;
          const local_ordinal_type j = ij/idiff;
          auto aptr = abase + blocksize_*(i + idiff*j);
          auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
          auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
          auto to   = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
          memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
        });
    }
  }
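  // In the range-policy branch above, the copy direction is fixed once per
  // (row, vector) pair: for PackTag == ToBuffer the multivector entries are
  // memcpy'd into the contiguous buffer, otherwise the buffer is copied back
  // into the multivector; each memcpy moves one block of blocksize_ scalars.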
  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize*num_vectors;

    // receive async
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
      else {
        irecv(comm,
              reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
    }

    // send async
    for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
      copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
                     mv, blocksize);
      if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
      else {
        Kokkos::deep_copy(
          Kokkos::subview(buffer_host.send,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.send(i)*mv_blocksize,
                            offset_host.send(i+1)*mv_blocksize)),
          Kokkos::subview(buffer.send,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.send(i)*mv_blocksize,
                            offset_host.send(i+1)*mv_blocksize)));
        isend(comm,
              reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
    }

    // probe the incoming messages
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
  void syncRecvVar0() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // receive async
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      local_ordinal_type idx = i;
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;
        Kokkos::deep_copy(
          Kokkos::subview(buffer.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)),
          Kokkos::subview(buffer_host.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)));
      }
      copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
                          remote_multivector, blocksize);
    }
    // wait until all sends are complete
    waitall(reqs.send.size(), reqs.send.data());
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    asyncSendRecvVar1(mv);
#else
    asyncSendRecvVar0(mv);
#endif
#else
    asyncSendRecvVar0(mv);
#endif
  }

  void syncRecv() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    syncRecvVar1();
#else
    syncRecvVar0();
#endif
#else
    syncRecvVar0();
#endif
  }
  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
    asyncSendRecv(mv);
    syncRecv();
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
};
template <typename ViewType1, typename ViewType2>
struct are_same_struct {
  ViewType1 keys1;
  ViewType2 keys2;

  are_same_struct(ViewType1 keys1_, ViewType2 keys2_) : keys1(keys1_), keys2(keys2_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(int i, unsigned int& count) const {
    if (keys1(i) != keys2(i)) count++;
  }
};

template <typename ViewType1, typename ViewType2>
bool are_same (ViewType1 keys1, ViewType2 keys2) {
  unsigned int are_same_ = 0;

  Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
                          are_same_struct(keys1, keys2),
                          are_same_);
  return are_same_ == 0;
}
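// are_same() counts mismatching entries of the two index views in parallel and
// returns true only when there are none. Below it detects the common case
// where the owned part of the column map is ordered exactly like the domain
// map, so no owned-DOF permutation (dm2cm) is needed.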
template<typename MatrixType>
Teuchos::RCP<AsyncableImport<MatrixType> >
createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
  using global_indices_array_device_type = Kokkos::View<const global_ordinal_type*, typename tpetra_map_type::device_type>;
  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();

  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto domain_map = g.getDomainMap();
  const auto column_map = g.getColMap();

  std::vector<global_ordinal_type> gids;

  Kokkos::Subview<global_indices_array_device_type, std::pair<int,int>> column_map_global_iD_last;

  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
  {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);

    global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
    global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();

    if(are_same(domain_map_global_iD, column_map_global_iD)) {
      // the owned part of the column map matches the domain map ordering
      separate_remotes = true;
      need_owned_permutation = false;

      column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
        std::pair<int,int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
    }
    else {
      for (size_t i=0;i<column_map->getLocalNumElements();++i) {
        const global_ordinal_type gid = column_map->getGlobalElement(i);
        if (!domain_map->isNodeGlobalElement(gid)) {
          found_first = true;
          gids.push_back(gid);
        } else if (found_first) {
          separate_remotes = false;
          break;
        }
        if (!found_first && !need_owned_permutation &&
            domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
          // the owned parts of the domain and column maps are ordered
          // differently, so a permutation is needed
          need_owned_permutation = true;
        }
      }
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }
  if (separate_remotes) {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
    const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
    const auto parsimonious_col_map
      = need_owned_permutation ?
      Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())):
      Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
    if (parsimonious_col_map->getGlobalNumElements() > 0) {
      // make the importer only if needed
      local_ordinal_type_1d_view dm2cm;
      if (need_owned_permutation) {
        dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
        const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
        for (size_t i=0;i<domain_map->getLocalNumElements();++i)
          dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
        Kokkos::deep_copy(dm2cm, dm2cm_host);
      }
      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
      return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
    }
  }
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::null;
}
template<typename local_ordinal_type>
local_ordinal_type costTRSM(const local_ordinal_type block_size) {
  return block_size*block_size;
}

template<typename local_ordinal_type>
local_ordinal_type costGEMV(const local_ordinal_type block_size) {
  return 2*block_size*block_size;
}

template<typename local_ordinal_type>
local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
  return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length-1) * costGEMV(block_size);
}
template<typename local_ordinal_type>
local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
                                  const local_ordinal_type num_teams,
                                  const local_ordinal_type line_length,
                                  const local_ordinal_type block_size,
                                  const local_ordinal_type n_subparts_per_part) {
  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part-1) * 2) / n_subparts_per_part);
  if (subline_length < 1) {
    // the requested number of sub parts does not fit in the line
    return INT_MAX;
  }

  const local_ordinal_type p_n_lines = ceil(double(num_parts)/num_teams);
  const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part)*num_parts/num_teams);
  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part-1)*num_parts/num_teams);

  const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
  const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part-1)*2,block_size);
  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length,block_size);
  const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);

  if (n_subparts_per_part == 1) {
    return p_costApplyAinv;
  }
  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
}
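// A rough reading of the cost model above: p_costApplyAinv is the cost of the
// independent sub-line tridiag solves, p_costApplyE and p_costApplyC are the
// GEMV-based couplings between the sub-lines and the Schur system, and
// p_costApplyS is the solve of the Schur tridiag built from the 2-row
// connections; with a single sub part per part only the plain tridiag solve
// remains.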
template<typename local_ordinal_type>
local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
                                         const local_ordinal_type num_teams,
                                         const local_ordinal_type line_length,
                                         const local_ordinal_type block_size) {
  local_ordinal_type n_subparts_per_part_0 = 1;
  local_ordinal_type flop_0 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0);
  local_ordinal_type flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0+1);
  while (flop_0 > flop_1) {
    flop_0 = flop_1;
    flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, (++n_subparts_per_part_0)+1);
  }
  return n_subparts_per_part_0;
}
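// Illustrative behavior of the search above: starting from one sub part per
// part, the estimate of costSolveSchur is compared for k and k+1 sub parts and
// k is increased while the estimate keeps decreasing, i.e. the first local
// minimum of the cost model is returned.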
template<typename ArgActiveExecutionMemorySpace>
struct SolveTridiagsDefaultModeAndAlgo;
template<typename MatrixType>
BlockHelperDetails::PartInterface<MatrixType>
createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                    const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                    const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type> > &partitions,
                    const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
  using size_type = typename impl_type::size_type;

  auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);

  const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
  constexpr int vector_length = impl_type::vector_length;
  constexpr int internal_vector_length = impl_type::internal_vector_length;

  const auto comm = A->getRowMap()->getComm();
  BlockHelperDetails::PartInterface<MatrixType> interf;

  const bool jacobi = partitions.size() == 0;
  const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();

  typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
  std::vector<size_idx_pair_type> partsz(nparts);

  if (!jacobi) {
    for (local_ordinal_type i=0;i<nparts;++i)
      partsz[i] = size_idx_pair_type(partitions[i].size(), i);
    std::sort(partsz.begin(), partsz.end(),
              [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
                return x.first > y.first;
              });
  }
  local_ordinal_type n_subparts_per_part;
  if (n_subparts_per_part_in == -1) {
    // -1 means the number of sub parts is chosen automatically
    using execution_space = typename impl_type::execution_space;

    const int line_length = partsz[0].first;

    const local_ordinal_type team_size =
      SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
      recommended_team_size(blocksize, vector_length, internal_vector_length);

    const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));

    n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
#endif
  }
  else {
    n_subparts_per_part = n_subparts_per_part_in;
  }
  // total number of sub lines, and sub lines plus Schur complement blocks
  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1);

#if defined(BLOCKTRIDICONTAINER_DEBUG)
  local_ordinal_type nrows = 0;
  if (jacobi)
    nrows = nparts;
  else
    for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();

  TEUCHOS_TEST_FOR_EXCEPT_MSG
    (nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
     << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
#endif
  // permutation of the part indices and the maximum part/sub-part sizes
  std::vector<local_ordinal_type> p;
  if (jacobi) {
    interf.max_partsz = 1;
    interf.max_subpartsz = 0;
    interf.n_subparts_per_part = 1;
    interf.nparts = nparts;
  }
  else {
    // reorder the parts from largest to smallest
    p.resize(nparts);
    for (local_ordinal_type i=0;i<nparts;++i)
      p[i] = partsz[i].second;

    interf.max_partsz = partsz[0].first;

    constexpr local_ordinal_type connection_length = 2;
    const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
    const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

    interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
    interf.n_subparts_per_part = n_subparts_per_part;
    interf.nparts = nparts;
  }
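  // Worked example of the split above: with max_partsz = 10,
  // n_subparts_per_part = 2 and connection_length = 2, sub_line_length =
  // (10 - 2) / 2 = 4 and last_sub_line_length = 10 - (2 + 4) = 4, so
  // max_subpartsz = 4; the two 4-row sub lines are separated by a 2-row
  // connection that is later handled through the Schur complement.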
  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);

  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);
  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);

  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);

  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);

  interf.row_contiguous = true;

  part2rowidx0(0) = 0;
  part2packrowidx0(0) = 0;
  local_ordinal_type pack_nrows = 0;
  local_ordinal_type pack_nrows_sub = 0;
  if (jacobi) {
    IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
    // in Jacobi mode every part is a single row
    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      constexpr local_ordinal_type ipnrows = 1;
      part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
      if (ip % vector_length == 0) pack_nrows = ipnrows;
      part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
      const local_ordinal_type offset = partptr(ip);
      for (local_ordinal_type i=0;i<ipnrows;++i) {
        const auto lcl_row = ip;
        TEUCHOS_TEST_FOR_EXCEPT_MSG
          (lcl_row < 0 || lcl_row >= A_n_lclrows,
           BlockHelperDetails::get_msg_prefix(comm)
           << "partitions[" << p[ip] << "]["
           << i << "] = " << lcl_row
           << " but input matrix implies limits of [0, " << A_n_lclrows-1
           << "].");
        lclrow(offset+i) = lcl_row;
        rowidx2part(offset+i) = ip;
        if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
          interf.row_contiguous = false;
      }
      partptr(ip+1) = offset + ipnrows;
    }
    part2rowidx0_sub(0) = 0;
    partptr_sub(0, 0) = 0;

    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      constexpr local_ordinal_type ipnrows = 1;
      const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);

      TEUCHOS_TEST_FOR_EXCEPTION
        (full_line_length != ipnrows, std::logic_error,
         "In the part " << ip );

      constexpr local_ordinal_type connection_length = 2;

      if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
        TEUCHOS_TEST_FOR_EXCEPTION
          (true, std::logic_error,
           "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");

      const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
      const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

      if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
      for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
        const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
        const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
        if (local_sub_ip != n_subparts_per_part-1) {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
          }
          else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
          }

          partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
          partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
          partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
          part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
          printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
#endif
        }
        else {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
          }
          else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
          }

          partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
#endif
        }
      }
    }
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
    std::cout << "partptr_sub = " << std::endl;
    for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
      for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
        std::cout << partptr_sub(i,j) << " ";
      }
      std::cout << std::endl;
    }
    std::cout << "partptr_sub end" << std::endl;
#endif
    {
      local_ordinal_type npacks = ceil(float(nparts)/vector_length);

      local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
      for (local_ordinal_type ip=0;ip<ip_max;++ip) {
        part2packrowidx0_sub(ip, 0) = 0;
      }
      for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
        if (ipack != 0) {
          local_ordinal_type ip_min = ipack*vector_length;
          ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
          for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
            part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
          }
        }

        for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
          local_ordinal_type ip_min = ipack*vector_length;
          ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;

          const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);

          constexpr local_ordinal_type connection_length = 2;

          const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
          const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

          if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
          if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
          if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;

          part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;

          for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
            part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
          }
        }
      }

      Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }
  else {
    IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      const auto* part = &partitions[p[ip]];
      const local_ordinal_type ipnrows = part->size();
      TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
      TEUCHOS_TEST_FOR_EXCEPT_MSG
        (ipnrows == 0,
         BlockHelperDetails::get_msg_prefix(comm)
         << "partition " << p[ip]
         << " is empty, which is not allowed.");
      part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
      // Since parts are ordered in decreasing size, the size of the first
      // part in a pack is the size for all parts in the pack.
      if (ip % vector_length == 0) pack_nrows = ipnrows;
      part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
      const local_ordinal_type offset = partptr(ip);
      for (local_ordinal_type i=0;i<ipnrows;++i) {
        const auto lcl_row = (*part)[i];
        TEUCHOS_TEST_FOR_EXCEPT_MSG
          (lcl_row < 0 || lcl_row >= A_n_lclrows,
           BlockHelperDetails::get_msg_prefix(comm)
           << "partitions[" << p[ip] << "]["
           << i << "] = " << lcl_row
           << " but input matrix implies limits of [0, " << A_n_lclrows-1
           << "].");
        lclrow(offset+i) = lcl_row;
        rowidx2part(offset+i) = ip;
        if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
          interf.row_contiguous = false;
      }
      partptr(ip+1) = offset + ipnrows;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
      printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
      printf("partptr(%d+1) = %d\n", ip, partptr(ip+1));
#endif
    }
    part2rowidx0_sub(0) = 0;
    partptr_sub(0, 0) = 0;

    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      const auto* part = &partitions[p[ip]];
      const local_ordinal_type ipnrows = part->size();
      const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);

      TEUCHOS_TEST_FOR_EXCEPTION
        (full_line_length != ipnrows, std::logic_error,
         "In the part " << ip );

      constexpr local_ordinal_type connection_length = 2;

      if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
        TEUCHOS_TEST_FOR_EXCEPTION
          (true, std::logic_error,
           "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");

      const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
      const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

      if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
      for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
        const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
        const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
        if (local_sub_ip != n_subparts_per_part-1) {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
          }
          else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
          }

          partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
          partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
          partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
          part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
          printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
#endif
        }
        else {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
          }
          else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
          }

          partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
#endif
        }
      }
    }
    {
      local_ordinal_type npacks = ceil(float(nparts)/vector_length);

      local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
      for (local_ordinal_type ip=0;ip<ip_max;++ip) {
        part2packrowidx0_sub(ip, 0) = 0;
      }
      for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
        if (ipack != 0) {
          local_ordinal_type ip_min = ipack*vector_length;
          ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
          for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
            part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
          }
        }

        for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
          local_ordinal_type ip_min = ipack*vector_length;
          ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;

          const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);

          constexpr local_ordinal_type connection_length = 2;

          const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
          const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

          if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
          if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
          if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;

          part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;

          for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
            part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
          }
        }
      }

      Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
  if (lclrow(0) != 0) interf.row_contiguous = false;
#endif
  Kokkos::deep_copy(interf.partptr, partptr);
  Kokkos::deep_copy(interf.lclrow, lclrow);

  Kokkos::deep_copy(interf.partptr_sub, partptr_sub);

  interf.part2rowidx0 = interf.partptr;
  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);

  interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
  { // Fill packptr.
    IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
    local_ordinal_type npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);

    npacks = 0;
    for (local_ordinal_type ip=1;ip<=nparts;++ip)
      if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
        ++npacks;

    interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
    const auto packptr = Kokkos::create_mirror_view(interf.packptr);
    packptr(0) = 0;
    for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
      if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
        packptr(k++) = ip;

    Kokkos::deep_copy(interf.packptr, packptr);
    local_ordinal_type npacks_per_subpart = ceil(float(nparts)/vector_length);
    npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);

    interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart*n_subparts_per_part);
    interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart,n_subparts_per_part-1);

    const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
    const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
    for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part-1;++local_sub_ip) {
      for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
        packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
        packindices_schur(local_pack_ip,local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
      }
    }

    for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
      packindices_sub((n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip;
    }
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
    std::cout << "packindices_sub = " << std::endl;
    for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
      std::cout << packindices_sub(i) << " ";
    }
    std::cout << std::endl;
    std::cout << "packindices_sub end" << std::endl;

    std::cout << "packindices_schur = " << std::endl;
    for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
      for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
        std::cout << packindices_schur(i,j) << " ";
      }
      std::cout << std::endl;
    }
    std::cout << "packindices_schur end" << std::endl;
#endif
    Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
    Kokkos::deep_copy(interf.packindices_schur, packindices_schur);

    const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
    for (local_ordinal_type k=0;k<npacks + 1;++k)
      packptr_sub(k) = packptr(k%npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);

    Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)

  return interf;
}
template <typename MatrixType>
struct BlockTridiags {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_2d_view = typename impl_type::size_type_2d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using vector_type_4d_view = typename impl_type::vector_type_4d_view;

  // index pointers into the block storage, per part and sub part
  size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;

  // local column indices in A from which the tridiag blocks are extracted
  local_ordinal_type_1d_view A_colindsub;

  // packed block values of the tridiags
  vector_type_3d_view values;
  // packed block values of the Schur complements
  vector_type_3d_view values_schur;
  // the E blocks coupling the sub lines to the Schur system
  vector_type_4d_view e_values;

  bool is_diagonal_only;
  // Given a block index within a row-major tridiag, return its block row.
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
  // Given a block row of a row-major tridiag, return the index of its first block.
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
  // Number of blocks in a tridiag with nrows block rows.
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
  // Number of blocks stored for a Schur-complement segment with nrows block rows.
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type NumBlocksSchur (const idx_type& nrows) { return nrows > 0 ? 3*nrows + 2 : 0; }
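  // Example of the indexing helpers: a tridiag with nrows = 4 block rows
  // stores NumBlocks(4) = 10 blocks (one diagonal block per row plus two
  // off-diagonal blocks per interior interface); RowToIndex(2) = 5 is the
  // first block of row 2 and IndexToRow(5) = 2 recovers the row.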
};

template<typename MatrixType>
BlockTridiags<MatrixType>
createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
  IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using size_type_2d_view = typename impl_type::size_type_2d_view;

  constexpr int vector_length = impl_type::vector_length;

  BlockTridiags<MatrixType> btdm;
  const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);

  // construct the flat index pointers into the tridiag block storage
  btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
  const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part );
  Kokkos::parallel_scan
    ("createBlockTridiags::RangePolicy::flat_td_ptr",
     policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
      const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part);
      const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);

      if (final)
        btdm.flat_td_ptr(partidx, local_subpartidx) = update;

      if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) {
        const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0);
        if (local_subpartidx % 2 == 0)
          update += btdm.NumBlocks(nrows);
        else
          update += btdm.NumBlocksSchur(nrows);
      }
    });

  const auto nblocks = Kokkos::create_mirror_view_and_copy
    (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1));
  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
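  // The nblocks value above is the scan total, i.e. the flat number of blocks
  // over all tridiags; it equals ntridiags exactly when every tridiag consists
  // of a single diagonal block, which is what is_diagonal_only records.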
  // construct the packed index pointers
  if (vector_length == 1) {
    btdm.pack_td_ptr = btdm.flat_td_ptr;
  }
  else {
    local_ordinal_type npacks_per_subpart = 0;
    const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
    Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
    for (local_ordinal_type ip=1;ip<=interf.nparts;++ip)
      if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
        ++npacks_per_subpart;
    btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
    const Kokkos::RangePolicy<execution_space> policy(0,npacks_per_subpart);

    Kokkos::parallel_for
      ("createBlockTridiags::RangePolicy::pack_td_ptr",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
        for (local_ordinal_type j = 0; j < 2*interf.n_subparts_per_part; ++j) {
          const local_ordinal_type pack_id = ( j == 2*interf.n_subparts_per_part-1 ) ? i+(j-1)*npacks_per_subpart : i+j*npacks_per_subpart;
          const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id+1) - interf.packptr_sub(pack_id);

          const local_ordinal_type parti = interf.packptr_sub(pack_id);
          const local_ordinal_type partidx = parti%interf.nparts;

          for (local_ordinal_type pti=0;pti<nparts_in_pack;++pti) {
            btdm.pack_td_ptr(partidx+pti, j) = btdm.flat_td_ptr(i, j);
          }
        }
      });
  }
  btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);

  const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
  constexpr local_ordinal_type connection_length = 2;

  host_pack_td_ptr_schur(0,0) = 0;
  for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
    if (i % vector_length == 0) {
      if (i != 0)
        host_pack_td_ptr_schur(i,0) = host_pack_td_ptr_schur(i-1,host_pack_td_ptr_schur.extent(1)-1);
      for (local_ordinal_type j = 0; j < interf.n_subparts_per_part-1; ++j) {
        host_pack_td_ptr_schur(i,j+1) = host_pack_td_ptr_schur(i,j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part-2 ? 1 : 0);
      }
    }
    else {
      for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
        host_pack_td_ptr_schur(i,j) = host_pack_td_ptr_schur(i-1,j);
      }
    }
  }

  Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
  const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
  std::cout << "flat_td_ptr = " << std::endl;
  for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
    for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
      std::cout << host_flat_td_ptr(i,j) << " ";
    }
    std::cout << std::endl;
  }
  std::cout << "flat_td_ptr end" << std::endl;

  const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
  std::cout << "pack_td_ptr = " << std::endl;
  for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
    for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
      std::cout << host_pack_td_ptr(i,j) << " ";
    }
    std::cout << std::endl;
  }
  std::cout << "pack_td_ptr end" << std::endl;

  std::cout << "pack_td_ptr_schur = " << std::endl;
  for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
    for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
      std::cout << host_pack_td_ptr_schur(i,j) << " ";
    }
    std::cout << std::endl;
  }
  std::cout << "pack_td_ptr_schur end" << std::endl;
#endif

  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return btdm;
}
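
// Minimal example for intuition (the actual partitioning comes from the
// PartInterface): with nparts = 1 and n_subparts_per_part = 1, the scan above
// visits two (part, subpart) slots; flat_td_ptr(0,0) = 0 and
// flat_td_ptr(0,1) = NumBlocks(nrows) = 3*nrows - 2, which is exactly the
// total block count that the is_diagonal_only test reads back.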
template<typename MatrixType>
void
setTridiagsToIdentity
  (const BlockTridiags<MatrixType>& btdm,
   const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
{
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type_2d_view = typename impl_type::size_type_2d_view;

  const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
  const local_ordinal_type blocksize = btdm.values.extent(1);

  const int vector_length = impl_type::vector_length;
  const int internal_vector_length = impl_type::internal_vector_length;

  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using internal_vector_type = typename impl_type::internal_vector_type;
  using internal_vector_type_4d_view =
    typename impl_type::internal_vector_type_4d_view;

  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
  const internal_vector_type_4d_view values
    (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
     btdm.values.extent(0),
     btdm.values.extent(1),
     btdm.values.extent(2),
     vector_length/internal_vector_length);
  const local_ordinal_type vector_loop_size = values.extent(3);
#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
  local_ordinal_type total_team_size(0);
  if      (blocksize <=  5) total_team_size =  32;
  else if (blocksize <=  9) total_team_size =  64;
  else if (blocksize <= 12) total_team_size =  96;
  else if (blocksize <= 16) total_team_size = 128;
  else if (blocksize <= 20) total_team_size = 160;
  else                      total_team_size = 160;
  const local_ordinal_type team_size = total_team_size/vector_loop_size;
  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
#elif defined(KOKKOS_ENABLE_HIP)
  local_ordinal_type total_team_size(0);
  if      (blocksize <=  5) total_team_size =  32;
  else if (blocksize <=  9) total_team_size =  64;
  else if (blocksize <= 12) total_team_size =  96;
  else if (blocksize <= 16) total_team_size = 128;
  else if (blocksize <= 20) total_team_size = 160;
  else                      total_team_size = 160;
  const local_ordinal_type team_size = total_team_size/vector_loop_size;
  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
#elif defined(KOKKOS_ENABLE_SYCL)
  local_ordinal_type total_team_size(0);
  if      (blocksize <=  5) total_team_size =  32;
  else if (blocksize <=  9) total_team_size =  64;
  else if (blocksize <= 12) total_team_size =  96;
  else if (blocksize <= 16) total_team_size = 128;
  else if (blocksize <= 20) total_team_size = 160;
  else                      total_team_size = 160;
  const local_ordinal_type team_size = total_team_size/vector_loop_size;
  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
#else
  // host backends: a team of one with no vector parallelism.
  const team_policy_type policy(packptr.extent(0)-1, 1, 1);
#endif
  Kokkos::parallel_for
    ("setTridiagsToIdentity::TeamPolicy",
     policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
      const local_ordinal_type k = member.league_rank();
      const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0);
      const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1);

      const local_ordinal_type diff = iend - ibeg;
      const local_ordinal_type icount = diff/3 + (diff%3 > 0);
      const btdm_scalar_type one(1);
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
          const local_ordinal_type i = ibeg + ii*3;
          for (local_ordinal_type j=0;j<blocksize;++j) {
            values(i,j,j,v) = one;
          }
        });
      });
    });
}
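
// Note on the indexing above (derived from the block layout): the code relies
// on successive diagonal blocks of a packed line being stored 3 apart
// ([diag, super] for the first row, [sub, diag, super] afterwards), so
// icount = ceil((iend - ibeg)/3) counts exactly the diagonal blocks that are
// set to the identity.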
template<typename MatrixType>
void
performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                     const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
                     const BlockHelperDetails::PartInterface<MatrixType> &interf,
                     BlockTridiags<MatrixType> &btdm,
                     BlockHelperDetails::AmD<MatrixType> &amd,
                     const bool overlap_communication_and_computation,
                     const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                     bool useSeqMethod) {
  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);

  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using host_execution_space = typename impl_type::host_execution_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using size_type = typename impl_type::size_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

  constexpr int vector_length = impl_type::vector_length;
  const auto comm = A->getRowMap()->getComm();

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();

  const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows()/g->getLocalNumRows();

  // mirroring to host
  const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr);
  const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow);
  const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part);
  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
  const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr);

  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);

  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());
  // find column to row map on host
  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
  {
    const auto rowmap = g->getRowMap();
    const auto colmap = g->getColMap();
    const auto dommap = g->getDomainMap();
    TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));

#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
    const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
    Kokkos::parallel_for
      ("performSymbolicPhase::RangePolicy::col2row",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
        const global_ordinal_type gid = rowmap->getGlobalElement(lr);
        TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
        if (dommap->isNodeGlobalElement(gid)) {
          const local_ordinal_type lc = colmap->getLocalElement(gid);
# if defined(BLOCKTRIDICONTAINER_DEBUG)
          TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
                                      BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
                                      << " gives an invalid local column.");
# endif
          col2row[lr] = lc;
        }
      });
#endif
  }

  // construct the D and R graphs in A = D + R.
  const auto local_graph = g->getLocalGraphHost();
  const auto local_graph_rowptr = local_graph.row_map;
  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
  const auto local_graph_colidx = local_graph.entries;
  // map the local row id to the sequential row index within the parts.
  Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
  {
    const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
    Kokkos::parallel_for
      ("performSymbolicPhase::RangePolicy::lclrow2idx",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
        lclrow2idx[lclrow(i)] = i;
      });
  }

  // count (block) nnzs in D and R.
  typedef BlockHelperDetails::SumReducer<size_type,3,host_execution_space> sum_reducer_type;
  typename sum_reducer_type::value_type sum_reducer_value;
  {
    const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
    Kokkos::parallel_reduce
      (policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
        // LID -> index.
        const local_ordinal_type ri0 = lclrow2idx[lr];
        const local_ordinal_type pi0 = rowidx2part(ri0);
        for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
          const local_ordinal_type lc = local_graph_colidx(j);
          const local_ordinal_type lc2r = col2row[lc];
          bool incr_R = false;
          do { // breakable
            if (lc2r == (local_ordinal_type) -1) {
              incr_R = true;
              break;
            }
            const local_ordinal_type ri = lclrow2idx[lc2r];
            const local_ordinal_type pi = rowidx2part(ri);
            if (pi != pi0) {
              incr_R = true;
              break;
            }
            // Test for being in the tridiag, done in part-local index space.
            if (ri0 + 1 >= ri && ri0 <= ri + 1)
              ++update.v[0]; // D block
            else
              incr_R = true;
          } while (0);
          if (incr_R) {
            if (lc < nrows) ++update.v[1]; // R, owned
            else ++update.v[2];            // R, remote
          }
        }
      }, sum_reducer_type(sum_reducer_value));
  }

  size_type D_nnz = sum_reducer_value.v[0];
  size_type R_nnz_owned = sum_reducer_value.v[1];
  size_type R_nnz_remote = sum_reducer_value.v[2];

  if (!overlap_communication_and_computation) {
    R_nnz_owned += R_nnz_remote;
    R_nnz_remote = 0;
  }
  // construct the D graph.
  {
    const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);

    btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
    const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);

#if defined(BLOCKTRIDICONTAINER_DEBUG)
    Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif

    const local_ordinal_type nparts = partptr.extent(0) - 1;

    {
      const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
      Kokkos::parallel_for
        ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
          const local_ordinal_type part_ri0 = part2rowidx0(pi0);
          local_ordinal_type offset = 0;
          for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
            const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
            offset = 1;
            const local_ordinal_type lr0 = lclrow(ri0);
            const size_type j0 = local_graph_rowptr(lr0);
            for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
              const local_ordinal_type lc = local_graph_colidx(j);
              const local_ordinal_type lc2r = col2row[lc];
              if (lc2r == (local_ordinal_type) -1) continue;
              const local_ordinal_type ri = lclrow2idx[lc2r];
              const local_ordinal_type pi = rowidx2part(ri);
              if (pi != pi0) continue;
              if (ri + 1 < ri0 || ri > ri0 + 1) continue;
              const local_ordinal_type row_entry = j - j0;
              D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry;
            }
          }
        });
    }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
    for (size_t i=0;i<D_A_colindsub.extent(0);++i)
      TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif
    Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);

    // allocate the packed tridiagonal (and Schur) block storage.
    {
      const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1);
      const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
      btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);

      if (interf.n_subparts_per_part > 1) {
        const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1);
        const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
        btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
      }

      if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
    }
  }
  // construct the R graph.
  {
    amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
    amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);

    const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
    const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);

    amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
    amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);

    const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
    const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);

    {
      const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
      Kokkos::parallel_for
        ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
          const local_ordinal_type ri0 = lclrow2idx[lr];
          const local_ordinal_type pi0 = rowidx2part(ri0);
          const size_type j0 = local_graph_rowptr(lr);
          for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
            const local_ordinal_type lc = local_graph_colidx(j);
            const local_ordinal_type lc2r = col2row[lc];
            if (lc2r != (local_ordinal_type) -1) {
              const local_ordinal_type ri = lclrow2idx[lc2r];
              const local_ordinal_type pi = rowidx2part(ri);
              if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
                // the entry belongs to D; skip it here.
                continue;
              }
            }
            // only counts here; the exclusive scan and fill follow below.
            if (!overlap_communication_and_computation || lc < nrows) {
              ++R_rowptr(lr);
            } else {
              ++R_rowptr_remote(lr);
            }
          }
        });
    }

    // exclusive scan and fill of the R graph column index subsets.
    typedef BlockHelperDetails::ArrayValueType<size_type,2> update_type;
    {
      Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
      Kokkos::parallel_scan
        ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
                               update_type &update,
                               const bool &final) {
          update_type val;
          val.v[0] = R_rowptr(lr);
          if (overlap_communication_and_computation)
            val.v[1] = R_rowptr_remote(lr);

          if (final) {
            R_rowptr(lr) = update.v[0];
            if (overlap_communication_and_computation)
              R_rowptr_remote(lr) = update.v[1];

            if (lr < nrows) {
              const local_ordinal_type ri0 = lclrow2idx[lr];
              const local_ordinal_type pi0 = rowidx2part(ri0);

              size_type cnt_rowptr = R_rowptr(lr);
              size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0;

              const size_type j0 = local_graph_rowptr(lr);
              for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
                const local_ordinal_type lc = local_graph_colidx(j);
                const local_ordinal_type lc2r = col2row[lc];
                if (lc2r != (local_ordinal_type) -1) {
                  const local_ordinal_type ri = lclrow2idx[lc2r];
                  const local_ordinal_type pi = rowidx2part(ri);
                  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
                    continue;
                }
                const local_ordinal_type row_entry = j - j0;
                if (!overlap_communication_and_computation || lc < nrows)
                  R_A_colindsub(cnt_rowptr++) = row_entry;
                else
                  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
              }
            }
          }
          update += val;
        });
    }

    Kokkos::deep_copy(amd.rowptr, R_rowptr);
    Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
    if (overlap_communication_and_computation) {
      Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
      Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
    }
    // view the values of A used by the R part.
    if (hasBlockCrsMatrix)
      amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst());
    else {
      amd.tpetra_values = (const_cast<crs_matrix_type*>(A_crs.get()))->getLocalValuesDevice (Tpetra::Access::ReadWrite);
    }
  }

  // allocate the coupling (E) blocks used by the Schur-complement solve.
  if (interf.n_subparts_per_part > 1)
    btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);

  // precompute offsets of A and x entries to speed up the residual computation.
  if(BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix)
  {
    bool is_async_importer_active = !async_importer.is_null();
    local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
    bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
    BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
  }

  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
}
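
// Summary of the symbolic phase (for orientation; see the loops above):
// btdm.A_colindsub records, for every tridiagonal (D) block, which entry of
// the corresponding row of A it comes from, while amd.{rowptr,A_colindsub}
// (and the *_remote variants when communication is overlapped) describe the
// remainder R in A = D + R that is applied during the residual computation.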
template<typename ArgActiveExecutionMemorySpace>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;

template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
  typedef KB::Mode::Serial mode_type;
#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
  typedef KB::Algo::Level3::CompactMKL algo_type;
#else
  typedef KB::Algo::Level3::Blocked algo_type;
#endif
  static int recommended_team_size(const int /* blksize */,
                                   const int /* vector_length */,
                                   const int /* internal_vector_length */) {
    return 1;
  }
};
#if defined(KOKKOS_ENABLE_CUDA)
static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
                                                             const int vector_length,
                                                             const int internal_vector_length) {
  const int vector_size = vector_length/internal_vector_length;
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return 2*total_team_size/vector_size;
}

template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
  }
};

template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
  }
};
#endif
#if defined(KOKKOS_ENABLE_HIP)
static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
                                                            const int vector_length,
                                                            const int internal_vector_length) {
  const int vector_size = vector_length/internal_vector_length;
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return 2*total_team_size/vector_size;
}

template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
  }
};

template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
  }
};
#endif
#if defined(KOKKOS_ENABLE_SYCL)
static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
                                                             const int vector_length,
                                                             const int internal_vector_length) {
  const int vector_size = vector_length/internal_vector_length;
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return 2*total_team_size/vector_size;
}

template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
  }
};

template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
  }
};
#endif
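
// Worked example of the heuristic above (illustrative only): for blksize = 9,
// vector_length = 8 and internal_vector_length = 4, vector_size = 2 and
// total_team_size = 32, so recommended_team_size returns 2*32/2 = 32 threads
// per team; the same table is reused for the CUDA, HIP and SYCL backends.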
template<typename impl_type, typename WWViewType>
KOKKOS_INLINE_FUNCTION
void
solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
                 const typename impl_type::local_ordinal_type &/* blocksize */,
                 const typename impl_type::local_ordinal_type &i0,
                 const typename impl_type::local_ordinal_type &r0,
                 const typename impl_type::local_ordinal_type &nrows,
                 const typename impl_type::local_ordinal_type &v,
                 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
                 const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
                 const WWViewType &WW,
                 const bool skip_first_pass = false) {
  using execution_space = typename impl_type::execution_space;
  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
  using member_type = typename team_policy_type::member_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;

  typedef SolveTridiagsDefaultModeAndAlgo
    <typename execution_space::memory_space> default_mode_and_algo_type;

  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;

  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;

  // constants
  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
  // subview pattern
  auto A  = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
  auto X2 = X1;

  local_ordinal_type i = i0, r = r0;

  if (nrows > 1) {
    // solve Lx = x
    if (skip_first_pass) {
      i += (nrows-2) * 3;
      r += (nrows-2);
      A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
      X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
      A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
      KB::Trsm<member_type,
               KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, X2);
      X1.assign_data( X2.data() );
      i += 3;
    }
    else {
      KB::Trsm<member_type,
               KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, X1);
      for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
        A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
        X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
        member.team_barrier();
        KB::Gemm<member_type,
                 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                 default_mode_type,default_algo_type>
          ::invoke(member, -one, A, X1, one, X2);
        A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
        KB::Trsm<member_type,
                 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                 default_mode_type,default_algo_type>
          ::invoke(member, one, A, X2);
        X1.assign_data( X2.data() );
      }
    }

    // solve Ux = x
    KB::Trsm<member_type,
             KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
             default_mode_type,default_algo_type>
      ::invoke(member, one, A, X1);
    for (local_ordinal_type tr=nrows;tr>1;--tr) {
      i -= 3;
      A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
      X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
      member.team_barrier();
      KB::Gemm<member_type,
               KB::Trans::NoTranspose,KB::Trans::NoTranspose,
               default_mode_type,default_algo_type>
        ::invoke(member, -one, A, X1, one, X2);

      A.assign_data( &D_internal_vector_values(i,0,0,v) );
      KB::Trsm<member_type,
               KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, X2);
      X1.assign_data( X2.data() );
    }
  } else {
    // single block row: the diagonal block was already inverted, so just apply it.
    auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
    KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
      ::invoke(member, X1, W);
    member.team_barrier();
    KB::Gemm<member_type,
             KB::Trans::NoTranspose,KB::Trans::NoTranspose,
             default_mode_type,default_algo_type>
      ::invoke(member, one, A, W, zero, X1);
  }
}
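
// The routine above performs the standard block-tridiagonal two-sweep solve on
// one packed sub-line: a forward sweep with the unit-lower factors
// (X_r := L_r^{-1} (X_r - C_r X_{r-1})) followed by a backward sweep with the
// upper factors (X_r := U_r^{-1} (X_r - B_r X_{r+1})), where C_r and B_r are
// the sub- and super-diagonal blocks stored at offsets i+2 and i+1.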
template<typename impl_type, typename WWViewType, typename XViewType>
KOKKOS_INLINE_FUNCTION
void
solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
                     const typename impl_type::local_ordinal_type &blocksize,
                     const typename impl_type::local_ordinal_type &i0,
                     const typename impl_type::local_ordinal_type &r0,
                     const typename impl_type::local_ordinal_type &nrows,
                     const typename impl_type::local_ordinal_type &v,
                     const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
                     const XViewType &X_internal_vector_values,
                     const WWViewType &WW) {
  using execution_space = typename impl_type::execution_space;
  using local_ordinal_type = typename impl_type::local_ordinal_type;

  typedef SolveTridiagsDefaultModeAndAlgo
    <typename execution_space::memory_space> default_mode_and_algo_type;

  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;

  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;

  // base pointers
  auto A = D_internal_vector_values.data();
  auto X = X_internal_vector_values.data();

  // constants
  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();

  // layout-dependent strides
  const local_ordinal_type astep = D_internal_vector_values.stride_0();
  const local_ordinal_type as0 = D_internal_vector_values.stride_1();
  const local_ordinal_type as1 = D_internal_vector_values.stride_2();
  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
  const local_ordinal_type xs0 = X_internal_vector_values.stride_1();
  // move to the starting point of this line's tridiag and rhs, vector lane v
  A += i0*astep + v;
  X += r0*xstep + v;

  if (nrows > 1) {
    // solve Lx = x
    KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
      (default_mode_type,default_algo_type,
       member,
       KB::Diag::Unit,
       blocksize,blocksize,
       one,
       A, as0, as1,
       X, xs0);

    for (local_ordinal_type tr=1;tr<nrows;++tr) {
      member.team_barrier();
      KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member,
         blocksize, blocksize,
         -one,
         A+2*astep, as0, as1,
         X, xs0,
         one,
         X+1*xstep, xs0);
      KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member,
         KB::Diag::Unit,
         blocksize,blocksize,
         one,
         A+3*astep, as0, as1,
         X+1*xstep, xs0);
      A += 3*astep;
      X += 1*xstep;
    }

    // solve Ux = x
    KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
      (default_mode_type,default_algo_type,
       member,
       KB::Diag::NonUnit,
       blocksize, blocksize,
       one,
       A, as0, as1,
       X, xs0);

    for (local_ordinal_type tr=nrows;tr>1;--tr) {
      A -= 3*astep;
      member.team_barrier();
      KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member,
         blocksize, blocksize,
         -one,
         A+1*astep, as0, as1,
         X, xs0,
         one,
         X-1*xstep, xs0);
      KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member,
         KB::Diag::NonUnit,
         blocksize, blocksize,
         one,
         A, as0, as1,
         X-1*xstep,xs0);
      X -= 1*xstep;
    }
  } else { // nrows == 1
    const local_ordinal_type ws0 = WW.stride_0();
    auto W = WW.data() + v;
    KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
      (default_mode_type,
       member, blocksize, X, xs0, W, ws0);
    member.team_barrier();
    KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
      (default_mode_type,default_algo_type,
       member,
       blocksize, blocksize,
       one,
       A, as0, as1,
       W, ws0,
       zero,
       X, xs0);
  }
}
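
// solveSingleVectorNew mirrors the multivector path above but works on raw
// pointers with explicit strides: astep advances one block within the packed
// values, so relative to the current diagonal block A the super-diagonal block
// sits at A+1*astep and the next row's sub-diagonal and diagonal blocks at
// A+2*astep and A+3*astep, avoiding subview overhead for a single right-hand side.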
template<typename local_ordinal_type, typename ViewType>
void writeBTDValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
  std::ofstream myfile;
  myfile.open (fileName);

  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
  local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;

  const local_ordinal_type block_size = scalar_values.extent(1);

  const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size;
  const local_ordinal_type n_rows = n_rows_per_part*n_parts;

  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);

  myfile << "%%MatrixMarket matrix coordinate real general"<< std::endl;
  myfile << "%%nnz = " << nnz;
  myfile << " block size = " << block_size;
  myfile << " number of blocks = " << n_blocks;
  myfile << " number of parts = " << n_parts;
  myfile << " number of blocks per part = " << n_blocks_per_part;
  myfile << " number of rows = " << n_rows ;
  myfile << " number of cols = " << n_rows;
  myfile << " number of packs = " << n_packs << std::endl;

  myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;

  local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
  for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
    for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
      current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
      for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
        current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
        if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
          break;
        if (i_block_in_part % 3 == 0) {
          current_row_offset = i_block_in_part/3 * block_size;
          current_col_offset = i_block_in_part/3 * block_size;
        }
        else if (i_block_in_part % 3 == 1) {
          current_row_offset = (i_block_in_part-1)/3 * block_size;
          current_col_offset = ((i_block_in_part-1)/3+1) * block_size;
        }
        else if (i_block_in_part % 3 == 2) {
          current_row_offset = ((i_block_in_part-2)/3+1) * block_size;
          current_col_offset = (i_block_in_part-2)/3 * block_size;
        }
        current_row_offset += current_part_idx * n_rows_per_part;
        current_col_offset += current_part_idx * n_rows_per_part;
        for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
          for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
            current_row = current_row_offset + i_in_block + 1;
            current_col = current_col_offset + j_in_block + 1;
            myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
          }
        }
      }
    }
  }

  myfile.close();
#endif
}
template<typename local_ordinal_type, typename ViewType>
void write4DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
  std::ofstream myfile;
  myfile.open (fileName);

  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;

  const local_ordinal_type block_size = scalar_values.extent(1);
  const local_ordinal_type n_cols = scalar_values.extent(2);

  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
  const local_ordinal_type n_rows = n_rows_per_part*n_parts;

  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);

  myfile << "%%MatrixMarket matrix array real general"<< std::endl;
  myfile << "%%block size = " << block_size;
  myfile << " number of blocks = " << n_blocks;
  myfile << " number of parts = " << n_parts;
  myfile << " number of blocks per part = " << n_blocks_per_part;
  myfile << " number of rows = " << n_rows ;
  myfile << " number of cols = " << n_cols;
  myfile << " number of packs = " << n_packs << std::endl;

  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;

  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
  (void) current_row_offset;
  (void) current_part_idx;
  for (local_ordinal_type j_in_block=0;j_in_block<n_cols;++j_in_block) {
    for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
      for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
        current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
        for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
          current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;

          if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
            break;
          for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
            myfile << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
          }
        }
      }
    }
  }

  myfile.close();
#endif
}
template<typename local_ordinal_type, typename ViewType>
void write5DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
  std::ofstream myfile;
  myfile.open (fileName);

  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
  const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack;
  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;

  const local_ordinal_type block_size = scalar_values.extent(2);
  const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
  const local_ordinal_type n_cols = n_blocks_cols * block_size;

  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
  const local_ordinal_type n_rows = n_rows_per_part*n_parts;

  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);

  myfile << "%%MatrixMarket matrix array real general"<< std::endl;
  myfile << "%%block size = " << block_size;
  myfile << " number of blocks = " << n_blocks;
  myfile << " number of parts = " << n_parts;
  myfile << " number of blocks per part = " << n_blocks_per_part;
  myfile << " number of rows = " << n_rows ;
  myfile << " number of cols = " << n_cols;
  myfile << " number of packs = " << n_packs << std::endl;

  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;

  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
  (void) current_row_offset;
  (void) current_part_idx;
  for (local_ordinal_type i_block_col=0;i_block_col<n_blocks_cols;++i_block_col) {
    for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
      for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
        for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
          current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
          for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
            current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;

            if (current_block_idx >= (local_ordinal_type) scalar_values.extent(1))
              break;
            for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
              myfile << scalar_values(i_block_col,current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
            }
          }
        }
      }
    }
  }

  myfile.close();
#endif
}
template<typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
KOKKOS_INLINE_FUNCTION
void
copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
  // copy view2 into view1 using the whole team
  Kokkos::Experimental::local_deep_copy(member, view1, view2);
}
template<typename MatrixType, int ScratchLevel>
struct ExtractAndFactorizeTridiags {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  // spaces
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;
  // scalar and ordinal types
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  // matrix and graph types
  using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
  using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
  // views
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
  using size_type_2d_view = typename impl_type::size_type_2d_view;
  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
  // vectorized block types
  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;

  using internal_vector_type = typename impl_type::internal_vector_type;
  static constexpr int vector_length = impl_type::vector_length;
  static constexpr int internal_vector_length = impl_type::internal_vector_length;
  static_assert(vector_length >= internal_vector_length,
                "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
  static_assert(vector_length % internal_vector_length == 0,
                "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");

  // team policy
  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
  using member_type = typename team_policy_type::member_type;
  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
  const local_ordinal_type max_partsz;
  // block crs graph and values of A
  using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
  ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
  ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
  ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
  // block tridiags
  const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
  const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
  const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
  const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
  const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
  // shared information
  const local_ordinal_type blocksize, blocksize_square;
  // diagonal safeguard
  const magnitude_type tiny;
  const local_ordinal_type vector_loop_size;

  bool hasBlockCrsMatrix;
  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
                              const BlockHelperDetails::PartInterface<MatrixType> &interf_,
                              const Teuchos::RCP<const row_matrix_type> &A_,
                              const Teuchos::RCP<const crs_graph_type> &G_,
                              const magnitude_type& tiny_) :
    // interface
    partptr(interf_.partptr),
    lclrow(interf_.lclrow),
    packptr(interf_.packptr),
    packindices_sub(interf_.packindices_sub),
    packptr_sub(interf_.packptr_sub),
    partptr_sub(interf_.partptr_sub),
    part2packrowidx0_sub(interf_.part2packrowidx0_sub),
    packindices_schur(interf_.packindices_schur),
    max_partsz(interf_.max_partsz),
    // block tridiags
    pack_td_ptr(btdm_.pack_td_ptr),
    flat_td_ptr(btdm_.flat_td_ptr),
    pack_td_ptr_schur(btdm_.pack_td_ptr_schur),
    A_colindsub(btdm_.A_colindsub),
    internal_vector_values((internal_vector_type*)btdm_.values.data(),
                           btdm_.values.extent(0),
                           btdm_.values.extent(1),
                           btdm_.values.extent(2),
                           vector_length/internal_vector_length),
    internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(),
                                 btdm_.values_schur.extent(0),
                                 btdm_.values_schur.extent(1),
                                 btdm_.values_schur.extent(2),
                                 vector_length/internal_vector_length),
    e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(),
                             btdm_.e_values.extent(0),
                             btdm_.e_values.extent(1),
                             btdm_.e_values.extent(2),
                             btdm_.e_values.extent(3),
                             vector_length/internal_vector_length),
    scalar_values((btdm_scalar_type*)btdm_.values.data(),
                  btdm_.values.extent(0),
                  btdm_.values.extent(1),
                  btdm_.values.extent(2),
                  vector_length),
    scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(),
                        btdm_.values_schur.extent(0),
                        btdm_.values_schur.extent(1),
                        btdm_.values_schur.extent(2),
                        vector_length),
    e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(),
                    btdm_.e_values.extent(0),
                    btdm_.e_values.extent(1),
                    btdm_.e_values.extent(2),
                    btdm_.e_values.extent(3),
                    vector_length),
    blocksize(btdm_.values.extent(1)),
    blocksize_square(blocksize*blocksize),
    // diagonal weight to avoid zero pivots
    tiny(tiny_),
    vector_loop_size(vector_length/internal_vector_length) {
    using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
    using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

    auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
    auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);

    hasBlockCrsMatrix = ! A_bcrs.is_null ();

    A_block_rowptr = G_->getLocalGraphDevice().row_map;
    if (hasBlockCrsMatrix) {
      A_values = const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst();
    }
    else {
      A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
      A_values = A_crs->getLocalValuesDevice (Tpetra::Access::ReadOnly);
    }
  }
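
  // Note: internal_vector_values/scalar_values (and the *_schur / e_* variants)
  // are unmanaged reinterpretations of the same btdm_ storage: one vector_type
  // of length vector_length is viewed either as vector_length/internal_vector_length
  // internal vectors or as vector_length scalars, so no extra memory is allocated.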
  KOKKOS_INLINE_FUNCTION
  void
  extract(local_ordinal_type partidx,
          local_ordinal_type local_subpartidx,
          local_ordinal_type npacks) const {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
#endif
    using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
    const size_type kps = pack_td_ptr(partidx, local_subpartidx);
    local_ordinal_type kfs[vector_length] = {};
    local_ordinal_type ri0[vector_length] = {};
    local_ordinal_type nrows[vector_length] = {};

    for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
      kfs[vi] = flat_td_ptr(partidx,local_subpartidx);
      ri0[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,0);
      nrows[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,1) - ri0[vi];
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
      printf("kfs[%d] = %d;\n", vi, kfs[vi]);
      printf("ri0[%d] = %d;\n", vi, ri0[vi]);
      printf("nrows[%d] = %d;\n", vi, nrows[vi]);
#endif
    }
    local_ordinal_type tr_min = 0;
    local_ordinal_type tr_max = nrows[0];
    if (local_subpartidx % 2 == 1) {
      tr_min -= 1;
      tr_max += 1;
    }
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
#endif
    for (local_ordinal_type tr=tr_min,j=0;tr<tr_max;++tr) {
      for (local_ordinal_type e=0;e<3;++e) {
        if (hasBlockCrsMatrix) {
          const impl_scalar_type* block[vector_length] = {};
          for (local_ordinal_type vi=0;vi<npacks;++vi) {
            const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
            block[vi] = &A_values(Aj*blocksize_square);
          }
          const size_type pi = kps + j;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
#endif
          for (local_ordinal_type ii=0;ii<blocksize;++ii) {
            for (local_ordinal_type jj=0;jj<blocksize;++jj) {
              const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
              auto& v = internal_vector_values(pi, ii, jj, 0);
              for (local_ordinal_type vi=0;vi<npacks;++vi) {
                v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
              }
            }
          }
        }
        else {
          const size_type pi = kps + j;
          for (local_ordinal_type vi=0;vi<npacks;++vi) {
            const size_type Aj_c = A_colindsub(kfs[vi] + j);
            for (local_ordinal_type ii=0;ii<blocksize;++ii) {
              auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr)*blocksize + ii);
              for (local_ordinal_type jj=0;jj<blocksize;++jj) {
                scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c*blocksize + jj);
              }
            }
          }
        }
        ++j;
        if (nrows[0] == 1) break;
        if (local_subpartidx % 2 == 0) {
          if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
          for (local_ordinal_type vi=1;vi<npacks;++vi) {
            if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
              npacks = vi;
              break;
            }
          }
        }
        else {
          if (e == 0 && (tr == -1 || tr == nrows[0])) break;
          for (local_ordinal_type vi=1;vi<npacks;++vi) {
            if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
              npacks = vi;
              break;
            }
          }
        }
      }
    }
  }
  KOKKOS_INLINE_FUNCTION
  void
  extract(const member_type &member,
          const local_ordinal_type &partidxbeg,
          local_ordinal_type local_subpartidx,
          const local_ordinal_type &npacks,
          const local_ordinal_type &vbeg) const {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
#endif
    using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
    local_ordinal_type kfs_vals[internal_vector_length] = {};
    local_ordinal_type ri0_vals[internal_vector_length] = {};
    local_ordinal_type nrows_vals[internal_vector_length] = {};

    const size_type kps = pack_td_ptr(partidxbeg,local_subpartidx);
    for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
      kfs_vals[vi] = flat_td_ptr(partidxbeg+vi,local_subpartidx);
      ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,0);
      nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,1) - ri0_vals[vi];
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
      printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
      printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
      printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
#endif
    }

    local_ordinal_type j_vals[internal_vector_length] = {};

    local_ordinal_type tr_min = 0;
    local_ordinal_type tr_max = nrows_vals[0];
    if (local_subpartidx % 2 == 1) {
      tr_min -= 1;
      tr_max += 1;
    }
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
#endif
    for (local_ordinal_type tr=tr_min;tr<tr_max;++tr) {
      for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
        const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
        if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows+1)) {
          auto &j = j_vals[vi];
          const local_ordinal_type kfs = kfs_vals[vi];
          const local_ordinal_type ri0 = ri0_vals[vi];
          local_ordinal_type lbeg, lend;
          if (local_subpartidx % 2 == 0) {
            lbeg = (tr == tr_min ? 1 : 0);
            lend = (tr == nrows - 1 ? 2 : 3);
          }
          else {
            lbeg = 0;
            lend = 3;
            if (tr == tr_min) {
              lbeg = 1;
              lend = 2;
            }
            else if (tr == nrows) {
              lbeg = 0;
              lend = 1;
            }
          }
          if (hasBlockCrsMatrix) {
            for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
              const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
              const impl_scalar_type* block = &A_values(Aj*blocksize_square);
              const size_type pi = kps + j;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
              printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
#endif
              Kokkos::parallel_for
                (Kokkos::TeamThreadRange(member,blocksize),
                 [&](const local_ordinal_type &ii) {
                  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
                    scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
                  }
                });
            }
          }
          else {
            for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
              const size_type Aj_c = A_colindsub(kfs + j);
              const size_type pi = kps + j;
              Kokkos::parallel_for
                (Kokkos::TeamThreadRange(member,blocksize),
                 [&](const local_ordinal_type &ii) {
                  auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr)*blocksize + ii);
                  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
                    scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c*blocksize + jj);
                  }
                });
            }
          }
        }
      }
    }
  }
  template<typename AAViewType,
           typename WWViewType>
  KOKKOS_INLINE_FUNCTION
  void
  factorize_subline(const member_type &member,
                    const local_ordinal_type &i0,
                    const local_ordinal_type &nrows,
                    const local_ordinal_type &v,
                    const AAViewType &AA,
                    const WWViewType &WW) const {
    typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
      <typename execution_space::memory_space> default_mode_and_algo_type;

    typedef typename default_mode_and_algo_type::mode_type default_mode_type;
    typedef typename default_mode_and_algo_type::algo_type default_algo_type;

    // constant
    const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
#endif

    // subview pattern
    auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
    KB::LU<member_type,
           default_mode_type,KB::Algo::LU::Unblocked>
      ::invoke(member, A , tiny);

    if (nrows > 1) {
      auto B = A;
      auto C = A;

      local_ordinal_type i = i0;
      for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("tr = %d, i = %d;\n", tr, i);
#endif
        B.assign_data( &AA(i+1,0,0,v) );
        KB::Trsm<member_type,
                 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                 default_mode_type,default_algo_type>
          ::invoke(member, one, A, B);
        C.assign_data( &AA(i+2,0,0,v) );
        KB::Trsm<member_type,
                 KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                 default_mode_type,default_algo_type>
          ::invoke(member, one, A, C);
        A.assign_data( &AA(i+3,0,0,v) );

        member.team_barrier();
        KB::Gemm<member_type,
                 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                 default_mode_type,default_algo_type>
          ::invoke(member, -one, C, B, one, A);
        KB::LU<member_type,
               default_mode_type,KB::Algo::LU::Unblocked>
          ::invoke(member, A, tiny);
      }
    }

    // invert the last diagonal block in place via two triangular solves on an identity.
    auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
    KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
      ::invoke(member, A, W);
    KB::SetIdentity<member_type,default_mode_type>
      ::invoke(member, A);
    member.team_barrier();
    KB::Trsm<member_type,
             KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
             default_mode_type,default_algo_type>
      ::invoke(member, one, W, A);
    KB::Trsm<member_type,
             KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
             default_mode_type,default_algo_type>
      ::invoke(member, one, W, A);
  }
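
  // factorize_subline implements the classic block-tridiagonal LU recurrence on
  // one packed sub-line: with A_r the diagonal, B_r the super- and C_r the
  // sub-diagonal block, it computes B_r := L_r^{-1} B_r, C_r := C_r U_r^{-1},
  // A_{r+1} := LU(A_{r+1} - C_r B_r), and finally replaces the last diagonal
  // block by its explicit inverse (used by the single-block solve path).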
  struct ExtractAndFactorizeSubLineTag {};
  struct ExtractBCDTag {};
  struct ComputeETag {};
  struct ComputeSchurTag {};
  struct FactorizeSchurTag {};
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
    const local_ordinal_type packidx = packindices_sub(member.league_rank());

    const local_ordinal_type subpartidx = packptr_sub(packidx);
    const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
    const local_ordinal_type local_subpartidx = subpartidx/n_parts;
    const local_ordinal_type partidx = subpartidx%n_parts;

    const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
    const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
    const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);

    internal_vector_scratch_type_3d_view
      WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
    printf("vector_loop_size = %d\n", vector_loop_size);
#endif

    if (vector_loop_size == 1) {
      extract(partidx, local_subpartidx, npacks);
      factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
    }
    else {
      Kokkos::parallel_for
        (Kokkos::ThreadVectorRange(member, vector_loop_size),
         [&](const local_ordinal_type &v) {
          const local_ordinal_type vbeg = v*internal_vector_length;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
#endif
          if (vbeg < npacks)
            extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);

          // extraction must complete on all vector lanes before factorization uses the data.
          member.team_barrier();
          factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
        });
    }
  }
3240 KOKKOS_INLINE_FUNCTION
3242 operator() (
const ExtractBCDTag &,
const member_type &member)
const {
3244 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3245 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3246 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3248 const local_ordinal_type subpartidx = packptr_sub(packidx);
3249 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3250 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3251 const local_ordinal_type partidx = subpartidx%n_parts;
3253 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
        if (vector_loop_size == 1) {
          extract(partidx, local_subpartidx, npacks);
        } else {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size),
             [&](const local_ordinal_type &v) {
              const local_ordinal_type vbeg = v*internal_vector_length;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
              const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
              printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
#endif
              if (vbeg < npacks)
                extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
            });
        }

        member.team_barrier();
3276 const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3277 const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1;
3279 const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3280 const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n",
               packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
#endif

        copy3DView<local_ordinal_type>(member,
            Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
            Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));

        copy3DView<local_ordinal_type>(member,
            Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
            Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
      }
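      // ExtractBCDTag (above): extracts the packed values for this sub-part and
      // copies the two connector blocks adjacent to the sub-line boundary (rows
      // kps1/kps2 of the packed tridiagonal) into e_internal_vector_values, where
      // they are later consumed when building the Schur-complement system.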
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const ComputeETag &, const member_type &member) const {
3299 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3301 const local_ordinal_type subpartidx = packptr_sub(packidx);
3302 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3303 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3304 const local_ordinal_type partidx = subpartidx%n_parts;
3306 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3307 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3308 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3309 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3310 const local_ordinal_type num_vectors = blocksize;
3314 internal_vector_scratch_type_3d_view
3315 WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
        if (local_subpartidx == 0) {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>
                (member, blocksize, i0, r0, nrows, v, internal_vector_values,
                 Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
            });
        }
        else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>
                (member, blocksize, i0, r0, nrows, v, internal_vector_values,
                 Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
            });
        }
        else {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>
                (member, blocksize, i0, r0, nrows, v, internal_vector_values,
                 Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>
                (member, blocksize, i0, r0, nrows, v, internal_vector_values,
                 Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
            });
        }
      }
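      // ComputeETag (above): applies the factorized sub-line to the stored
      // connector blocks, i.e. roughly E = D_subline^{-1} * C, writing the result
      // into e_internal_vector_values via solveMultiVector. The first and last
      // sub-lines of a part touch only one neighbor, hence the three-way branch.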
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const ComputeSchurTag &, const member_type &member) const {
3341 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3342 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3343 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3345 const local_ordinal_type subpartidx = packptr_sub(packidx);
3346 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3347 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3348 const local_ordinal_type partidx = subpartidx%n_parts;
3351 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3357 const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
3358 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
3359 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
3361 for (local_ordinal_type i = 0; i < 4; ++i) {
3362 copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3363 Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3366 member.team_barrier();
3368 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3370 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1;
3371 const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2;
3373 const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3374 const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
        typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3379 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3380 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3384 for (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) {
3385 local_ordinal_type e_r, e_c, c_kps;
3387 if ( local_subpartidx_schur == 0 ) {
3393 else if ( i == 3 ) {
3398 else if ( i == 4 ) {
3413 else if ( i == 1 ) {
3418 else if ( i == 4 ) {
3423 else if ( i == 5 ) {
3433 auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v);
3434 auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3435 auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3436 KB::Gemm<member_type,
3437 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3438 default_mode_type,default_algo_type>
3439 ::invoke(member, -one, C, E, one, S);
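              // ComputeSchurTag (above, inner loop): after seeding the Schur
              // blocks S from the interface rows of the original tridiagonal,
              // each entry is updated as S -= C * E, where C is a connector
              // block of the original matrix and E the corresponding block
              // produced under ComputeETag; the (e_r, e_c, c_kps) branches on i
              // and local_subpartidx_schur select which connector pair couples.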
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const FactorizeSchurTag &, const member_type &member) const {
3447 const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3449 const local_ordinal_type subpartidx = packptr_sub(packidx);
3451 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3452 const local_ordinal_type partidx = subpartidx%n_parts;
3454 const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0);
3455 const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);
3457 internal_vector_scratch_type_3d_view
3458 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n",
               member.league_rank(), i0, nrows, vector_loop_size);
#endif

        if (vector_loop_size == 1) {
          factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
        } else {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size),
             [&](const local_ordinal_type &v) {
              factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
            });
        }
      }
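      // FactorizeSchurTag (above): runs the same factorize_subline kernel on the
      // assembled Schur system (internal_vector_values_schur), so the Schur
      // complement of each part ends up block-LU factorized like the sub-lines.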
    void run() {
      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
      const local_ordinal_type team_size =
        ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
        recommended_team_size(blocksize, vector_length, internal_vector_length);
      const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
        shmem_size(blocksize, blocksize, vector_loop_size);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
      printf("Start ExtractAndFactorizeSubLineTag\n");
#endif
      {
        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
        Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeSubLineTag>
          policy(packindices_sub.extent(0), team_size, vector_loop_size);

        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
        writeBTDValuesToFile(n_parts, scalar_values, "before.mm");

        policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
        Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
                             policy, *this);
        execution_space().fence();

        writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
      }
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
      printf("End ExtractAndFactorizeSubLineTag\n");
#endif

      if (packindices_schur.extent(1) > 0) {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("Start ExtractBCDTag\n");
#endif
        Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
        Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());

        write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");

        {
          IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
          Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
            policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);

          policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
          Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
                               policy, *this);
          execution_space().fence();
        }

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("End ExtractBCDTag\n");
#endif

        writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("Start ComputeETag\n");
#endif
        write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
        {
          IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
          Kokkos::TeamPolicy<execution_space,ComputeETag>
            policy(packindices_sub.extent(0), team_size, vector_loop_size);

          policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
          Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
                               policy, *this);
          execution_space().fence();
        }
        write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("End ComputeETag\n");
#endif

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("Start ComputeSchurTag\n");
#endif
        {
          IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
          writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
          Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
            policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);

          Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
                               policy, *this);
          writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
          execution_space().fence();
        }
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("End ComputeSchurTag\n");
#endif

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("Start FactorizeSchurTag\n");
#endif
        {
          IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
          Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
            policy(packindices_schur.extent(0), team_size, vector_loop_size);
          policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
          Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
                               policy, *this);
          execution_space().fence();
          writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
        }
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("End FactorizeSchurTag\n");
#endif
      }

      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
    }
  };
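// Numeric-phase pipeline implemented by ExtractAndFactorizeTridiags::run():
// (1) extract and factorize the interior sub-lines, (2) extract the connector
// blocks (ExtractBCDTag), (3) compute E = D^{-1} C (ComputeETag), (4) assemble
// the Schur complement (ComputeSchurTag), and (5) factorize it
// (FactorizeSchurTag). Each step is launched through a tagged team policy, e.g.
// (sketch, not the literal code above):
//
//   Kokkos::TeamPolicy<execution_space, ComputeSchurTag> policy(league, team, vec);
//   policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
//   Kokkos::parallel_for("label", policy, *this);  // dispatches operator()(ComputeSchurTag, member)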
template<typename MatrixType>
                    const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3601 const BlockHelperDetails::PartInterface<MatrixType> &interf,
3603 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny) {
  using execution_space = typename impl_type::execution_space;
3606 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3607 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3611 int blocksize = btdm.values.extent(1);
3614 int scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3615 int max_scratch = team_policy_type::scratch_size_max(0);
  if (scratch_required < max_scratch) {
    // Enough per-team (level-0) scratch: factorize using shared memory.
    ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
    function.run();
  }
  else {
    // Otherwise fall back to level-1 scratch.
    ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
    function.run();
  }
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
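// The level-0/level-1 choice above compares the per-team scratch needed for one
// blocksize x blocksize x (vector_length/internal_vector_length) work block
// against Kokkos' per-team shared-memory limit; only when it does not fit is the
// slower level-1 (global-memory) scratch used.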
template<typename MatrixType>
struct MultiVectorConverter {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
  static constexpr int vector_length = impl_type::vector_length;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3654 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3655 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3656 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3657 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3658 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3659 const local_ordinal_type blocksize;
3660 const local_ordinal_type num_vectors;
3663 vector_type_3d_view packed_multivector;
3664 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
  template<typename TagType>
  KOKKOS_INLINE_FUNCTION
  void copy_multivectors(const local_ordinal_type &j,
                         const local_ordinal_type &vi,
                         const local_ordinal_type &pri,
                         const local_ordinal_type &ri0) const {
    for (local_ordinal_type col=0;col<num_vectors;++col)
      for (local_ordinal_type i=0;i<blocksize;++i)
        packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
  }
  MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
                       const vector_type_3d_view &pmv)
    : partptr(interf.partptr),
      packptr(interf.packptr),
      part2packrowidx0(interf.part2packrowidx0),
      part2rowidx0(interf.part2rowidx0),
      lclrow(interf.lclrow),
      blocksize(pmv.extent(1)),
      num_vectors(pmv.extent(2)),
      packed_multivector(pmv) {}
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const local_ordinal_type &packidx) const {
3694 local_ordinal_type partidx = packptr(packidx);
3695 local_ordinal_type npacks = packptr(packidx+1) - partidx;
3696 const local_ordinal_type pri0 = part2packrowidx0(partidx);
3698 local_ordinal_type ri0[vector_length] = {};
3699 local_ordinal_type nrows[vector_length] = {};
3700 for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
3701 ri0[v] = part2rowidx0(partidx);
3702 nrows[v] = part2rowidx0(partidx+1) - ri0[v];
    for (local_ordinal_type j=0;j<nrows[0];++j) {
      local_ordinal_type cnt = 1;
      for (;cnt<npacks && j!= nrows[cnt];++cnt);
      npacks = cnt;
      const local_ordinal_type pri = pri0 + j;
      for (local_ordinal_type col=0;col<num_vectors;++col)
        for (local_ordinal_type i=0;i<blocksize;++i)
          for (local_ordinal_type v=0;v<npacks;++v)
            packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
    }
  }
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const member_type &member) const {
3719 const local_ordinal_type packidx = member.league_rank();
3720 const local_ordinal_type partidx_begin = packptr(packidx);
3721 const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
3722 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
    Kokkos::parallel_for
      (Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
        const local_ordinal_type partidx = partidx_begin + v;
        const local_ordinal_type ri0 = part2rowidx0(partidx);
        const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;

        if (nrows == 1) {
          const local_ordinal_type pri = pri0;
          for (local_ordinal_type col=0;col<num_vectors;++col) {
            Kokkos::parallel_for
              (Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
                packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
              });
          }
        } else {
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
              const local_ordinal_type pri = pri0 + j;
              for (local_ordinal_type col=0;col<num_vectors;++col)
                for (local_ordinal_type i=0;i<blocksize;++i)
                  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
            });
        }
      });
  }
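  // MultiVectorConverter (above operators): repacks a flat Tpetra multivector,
  // indexed as (blocksize*local_row + i, col), into the SIMD-packed 3-D layout
  // packed_multivector(packed_row, i, col)[lane], with one vector lane per part
  // in a pack. The RangePolicy form handles one pack per index; the team form
  // assigns one pack per team and one part per ThreadVectorRange lane.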
  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);

    scalar_multivector = scalar_multivector_;
    if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
      const local_ordinal_type vl = vector_length;
      const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
      Kokkos::parallel_for("MultiVectorConverter::TeamPolicy", policy, *this);
    } else {
      const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
      Kokkos::parallel_for("MultiVectorConverter::RangePolicy", policy, *this);
    }
3761 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3762 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
template<>
struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
  typedef KB::Mode::Serial mode_type;
  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
#else
  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
#endif
3779 static int recommended_team_size(
const int ,
3786 #if defined(KOKKOS_ENABLE_CUDA)
static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
                                                       const int vector_length,
                                                       const int internal_vector_length) {
3790 const int vector_size = vector_length/internal_vector_length;
3791 int total_team_size(0);
3792 if (blksize <= 5) total_team_size = 32;
3793 else if (blksize <= 9) total_team_size = 32;
3794 else if (blksize <= 12) total_team_size = 96;
3795 else if (blksize <= 16) total_team_size = 128;
3796 else if (blksize <= 20) total_team_size = 160;
3797 else total_team_size = 160;
3798 return total_team_size/vector_size;
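// The table above is an empirical heuristic: total_team_size is chosen from the
// block size and then divided by vector_size = vector_length/internal_vector_length.
// For example, blksize = 16 with vector_length = 8 and internal_vector_length = 1
// gives vector_size = 8, total_team_size = 128, and a returned team size of 16.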
3802 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3803 typedef KB::Mode::Team mode_type;
3804 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3805 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3806 static int recommended_team_size(
const int blksize,
3807 const int vector_length,
3808 const int internal_vector_length) {
3809 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3813 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3814 typedef KB::Mode::Team mode_type;
3815 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3816 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3817 static int recommended_team_size(
const int blksize,
3818 const int vector_length,
3819 const int internal_vector_length) {
3820 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3825 #if defined(KOKKOS_ENABLE_HIP)
3826 static inline int SolveTridiagsRecommendedHIPTeamSize(
const int blksize,
3827 const int vector_length,
3828 const int internal_vector_length) {
3829 const int vector_size = vector_length/internal_vector_length;
3830 int total_team_size(0);
3831 if (blksize <= 5) total_team_size = 32;
3832 else if (blksize <= 9) total_team_size = 32;
3833 else if (blksize <= 12) total_team_size = 96;
3834 else if (blksize <= 16) total_team_size = 128;
3835 else if (blksize <= 20) total_team_size = 160;
3836 else total_team_size = 160;
3837 return total_team_size/vector_size;
3841 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3842 typedef KB::Mode::Team mode_type;
3843 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3844 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3845 static int recommended_team_size(
const int blksize,
3846 const int vector_length,
3847 const int internal_vector_length) {
3848 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3852 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3853 typedef KB::Mode::Team mode_type;
3854 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3855 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3856 static int recommended_team_size(
const int blksize,
3857 const int vector_length,
3858 const int internal_vector_length) {
3859 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3864 #if defined(KOKKOS_ENABLE_SYCL)
3865 static inline int SolveTridiagsRecommendedSYCLTeamSize(
const int blksize,
3866 const int vector_length,
3867 const int internal_vector_length) {
3868 const int vector_size = vector_length/internal_vector_length;
3869 int total_team_size(0);
3870 if (blksize <= 5) total_team_size = 32;
3871 else if (blksize <= 9) total_team_size = 32;
3872 else if (blksize <= 12) total_team_size = 96;
3873 else if (blksize <= 16) total_team_size = 128;
3874 else if (blksize <= 20) total_team_size = 160;
3875 else total_team_size = 160;
3876 return total_team_size/vector_size;
3880 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
3881 typedef KB::Mode::Team mode_type;
3882 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3883 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3884 static int recommended_team_size(
const int blksize,
3885 const int vector_length,
3886 const int internal_vector_length) {
3887 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3891 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
3892 typedef KB::Mode::Team mode_type;
3893 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3894 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3895 static int recommended_team_size(
const int blksize,
3896 const int vector_length,
3897 const int internal_vector_length) {
3898 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
template<typename MatrixType>
3907 struct SolveTridiags {
    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
    using execution_space = typename impl_type::execution_space;

    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using impl_scalar_type = typename impl_type::impl_scalar_type;
    using magnitude_type = typename impl_type::magnitude_type;
    using btdm_scalar_type = typename impl_type::btdm_scalar_type;
    using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;

    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
    using size_type_2d_view = typename impl_type::size_type_2d_view;

    using vector_type_3d_view = typename impl_type::vector_type_3d_view;
    using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
    using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
    using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;

    using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;

    using internal_vector_type = typename impl_type::internal_vector_type;
    static constexpr int vector_length = impl_type::vector_length;
    static constexpr int internal_vector_length = impl_type::internal_vector_length;

    using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
    using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    using member_type = typename team_policy_type::member_type;
3944 local_ordinal_type n_subparts_per_part;
3945 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3946 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3947 const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
3948 const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
3949 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3950 const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
3951 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3952 const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
3954 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
3955 const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
3958 const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
3961 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
3962 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
3963 const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
3965 internal_vector_type_4d_view X_internal_vector_values_schur;
3967 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
3968 const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
3971 const local_ordinal_type vector_loop_size;
3974 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
3975 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
3976 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3978 Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3980 const impl_scalar_type df;
3981 const bool compute_diff;
3984 SolveTridiags(
const BlockHelperDetails::PartInterface<MatrixType> &interf,
3985 const BlockTridiags<MatrixType> &btdm,
3986 const vector_type_3d_view &pmv,
3987 const impl_scalar_type damping_factor,
3988 const bool is_norm_manager_active)
3991 n_subparts_per_part(interf.n_subparts_per_part),
3992 partptr(interf.partptr),
3993 packptr(interf.packptr),
3994 packindices_sub(interf.packindices_sub),
3995 packindices_schur(interf.packindices_schur),
3996 part2packrowidx0(interf.part2packrowidx0),
3997 part2packrowidx0_sub(interf.part2packrowidx0_sub),
3998 lclrow(interf.lclrow),
3999 packptr_sub(interf.packptr_sub),
4000 partptr_sub(interf.partptr_sub),
4001 pack_td_ptr_schur(btdm.pack_td_ptr_schur),
4003 pack_td_ptr(btdm.pack_td_ptr),
4004 D_internal_vector_values((internal_vector_type*)btdm.values.data(),
4005 btdm.values.extent(0),
4006 btdm.values.extent(1),
4007 btdm.values.extent(2),
4008 vector_length/internal_vector_length),
4009 X_internal_vector_values((internal_vector_type*)pmv.data(),
4013 vector_length/internal_vector_length),
4014 X_internal_scalar_values((btdm_scalar_type*)pmv.data(),
4020 2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0),
4023 vector_length/internal_vector_length),
4024 D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(),
4025 btdm.values_schur.extent(0),
4026 btdm.values_schur.extent(1),
4027 btdm.values_schur.extent(2),
4028 vector_length/internal_vector_length),
4029 e_internal_vector_values((internal_vector_type*)btdm.e_values.data(),
4030 btdm.e_values.extent(0),
4031 btdm.e_values.extent(1),
4032 btdm.e_values.extent(2),
4033 btdm.e_values.extent(3),
4034 vector_length/internal_vector_length),
4035 vector_loop_size(vector_length/internal_vector_length),
4036 Y_scalar_multivector(),
4039 compute_diff(is_norm_manager_active)
    KOKKOS_INLINE_FUNCTION
    void
    copyToFlatMultiVector(const member_type &member,
                          const local_ordinal_type partidxbeg,
                          const local_ordinal_type npacks,
                          const local_ordinal_type pri0,
                          const local_ordinal_type v,
                          const local_ordinal_type blocksize,
                          const local_ordinal_type num_vectors) const {
4054 const local_ordinal_type vbeg = v*internal_vector_length;
4055 if (vbeg < npacks) {
4056 local_ordinal_type ri0_vals[internal_vector_length] = {};
4057 local_ordinal_type nrows_vals[internal_vector_length] = {};
4058 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4059 const local_ordinal_type partidx = partidxbeg+vv;
4060 ri0_vals[vi] = partptr(partidx);
4061 nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
4064 impl_scalar_type z_partial_sum(0);
4065 if (nrows_vals[0] == 1) {
4066 const local_ordinal_type j=0, pri=pri0;
4068 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4069 const local_ordinal_type ri0 = ri0_vals[vi];
4070 const local_ordinal_type nrows = nrows_vals[vi];
4072 Kokkos::parallel_for
4073 (Kokkos::TeamThreadRange(member, blocksize),
4074 [&](
const local_ordinal_type &i) {
4075 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4076 for (local_ordinal_type col=0;col<num_vectors;++col) {
4077 impl_scalar_type &y = Y_scalar_multivector(row,col);
4078 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4082 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4083 z_partial_sum += yd_abs*yd_abs;
4091 Kokkos::parallel_for
4092 (Kokkos::TeamThreadRange(member, nrows_vals[0]),
4093 [&](
const local_ordinal_type &j) {
4094 const local_ordinal_type pri = pri0 + j;
4095 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4096 const local_ordinal_type ri0 = ri0_vals[vi];
4097 const local_ordinal_type nrows = nrows_vals[vi];
4099 for (local_ordinal_type col=0;col<num_vectors;++col) {
4100 for (local_ordinal_type i=0;i<blocksize;++i) {
4101 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4102 impl_scalar_type &y = Y_scalar_multivector(row,col);
4103 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4107 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4108 z_partial_sum += yd_abs*yd_abs;
4117 Z_scalar_vector(member.league_rank()) += z_partial_sum;
    template<typename WWViewType>
    KOKKOS_INLINE_FUNCTION
    void
    solveSingleVector(const member_type &member,
                      const local_ordinal_type &blocksize,
                      const local_ordinal_type &i0,
                      const local_ordinal_type &r0,
                      const local_ordinal_type &nrows,
                      const local_ordinal_type &v,
                      const WWViewType &WW) const {
      typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4138 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4139 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4142 auto A = D_internal_vector_values.data();
4143 auto X = X_internal_vector_values.data();
4146 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4147 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4151 const local_ordinal_type astep = D_internal_vector_values.stride_0();
4152 const local_ordinal_type as0 = D_internal_vector_values.stride_1();
4153 const local_ordinal_type as1 = D_internal_vector_values.stride_2();
4154 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
4155 const local_ordinal_type xs0 = X_internal_vector_values.stride_1();
4164 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4165 (default_mode_type,default_algo_type,
4168 blocksize,blocksize,
4173 for (local_ordinal_type tr=1;tr<nrows;++tr) {
4174 member.team_barrier();
4175 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4176 (default_mode_type,default_algo_type,
4178 blocksize, blocksize,
4180 A+2*astep, as0, as1,
4184 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4185 (default_mode_type,default_algo_type,
4188 blocksize,blocksize,
4190 A+3*astep, as0, as1,
4198 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4199 (default_mode_type,default_algo_type,
4202 blocksize, blocksize,
4207 for (local_ordinal_type tr=nrows;tr>1;--tr) {
4209 member.team_barrier();
4210 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4211 (default_mode_type,default_algo_type,
4213 blocksize, blocksize,
4215 A+1*astep, as0, as1,
4219 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4220 (default_mode_type,default_algo_type,
4223 blocksize, blocksize,
4232 const local_ordinal_type ws0 = WW.stride_0();
4233 auto W = WW.data() + v;
4234 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
4236 member, blocksize, X, xs0, W, ws0);
4237 member.team_barrier();
4238 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4239 (default_mode_type,default_algo_type,
4241 blocksize, blocksize,
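      // solveSingleVector (above): forward sweep with the factored blocks -- a
      // block Trsv(L) on the first row, then for each subsequent row a Gemv with
      // the lower connector (A+2*astep) followed by Trsv(L) on the next diagonal
      // block -- and a matching backward sweep with the upper blocks. For a
      // single-row line, the copy-to-W plus Gemv path multiplies by the
      // explicitly inverted diagonal block computed during factorization.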
    template<typename WWViewType>
    KOKKOS_INLINE_FUNCTION
    void
    solveMultiVector(const member_type &member,
                     const local_ordinal_type & /* blocksize */,
                     const local_ordinal_type &i0,
                     const local_ordinal_type &r0,
                     const local_ordinal_type &nrows,
                     const local_ordinal_type &v,
                     const WWViewType &WW) const {
      typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4264 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4265 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4268 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4269 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4272 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4273 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4276 local_ordinal_type i = i0, r = r0;
4281 KB::Trsm<member_type,
4282 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4283 default_mode_type,default_algo_type>
4284 ::invoke(member, one, A, X1);
4285 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
4286 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
4287 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
4288 member.team_barrier();
4289 KB::Gemm<member_type,
4290 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4291 default_mode_type,default_algo_type>
4292 ::invoke(member, -one, A, X1, one, X2);
4293 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
4294 KB::Trsm<member_type,
4295 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4296 default_mode_type,default_algo_type>
4297 ::invoke(member, one, A, X2);
4298 X1.assign_data( X2.data() );
4302 KB::Trsm<member_type,
4303 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4304 default_mode_type,default_algo_type>
4305 ::invoke(member, one, A, X1);
4306 for (local_ordinal_type tr=nrows;tr>1;--tr) {
4308 A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
4309 X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
4310 member.team_barrier();
4311 KB::Gemm<member_type,
4312 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4313 default_mode_type,default_algo_type>
4314 ::invoke(member, -one, A, X1, one, X2);
4316 A.assign_data( &D_internal_vector_values(i,0,0,v) );
4317 KB::Trsm<member_type,
4318 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4319 default_mode_type,default_algo_type>
4320 ::invoke(member, one, A, X2);
4321 X1.assign_data( X2.data() );
4325 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4326 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
4327 ::invoke(member, X1, W);
4328 member.team_barrier();
4329 KB::Gemm<member_type,
4330 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4331 default_mode_type,default_algo_type>
4332 ::invoke(member, one, A, W, zero, X1);
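      // solveMultiVector (above): the same two sweeps as solveSingleVector but on
      // all right-hand-side columns at once, using Trsm/Gemm on
      // (blocksize x num_vectors) panels; the trailing Copy + Gemm applies the
      // explicitly inverted diagonal block for single-row lines, which skip the
      // sweeps entirely.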
    template<int B> struct SingleVectorTag {};
    template<int B> struct MultiVectorTag {};

    template<int B> struct SingleVectorSubLineTag {};
    template<int B> struct MultiVectorSubLineTag {};
    template<int B> struct SingleVectorApplyCTag {};
    template<int B> struct MultiVectorApplyCTag {};
    template<int B> struct SingleVectorSchurTag {};
    template<int B> struct MultiVectorSchurTag {};
    template<int B> struct SingleVectorApplyETag {};
    template<int B> struct MultiVectorApplyETag {};
    template<int B> struct SingleVectorCopyToFlatTag {};
    template<int B> struct SingleZeroingTag {};
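    // Each tag selects one phase of the split-line solve in the operator()
    // overloads below; the integer template parameter B is a compile-time block
    // size (B == 0 means the block size is only known at run time), which lets
    // run() instantiate specialized kernels for common sizes (see the switch at
    // the end of run()).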
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorTag<B> &, const member_type &member) const {
4354 const local_ordinal_type packidx = member.league_rank();
4355 const local_ordinal_type partidx = packptr(packidx);
4356 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4357 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4358 const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4359 const local_ordinal_type r0 = part2packrowidx0(partidx);
4360 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4361 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4362 const local_ordinal_type num_vectors = 1;
4363 internal_vector_scratch_type_3d_view
4364 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4365 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4366 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4368 Kokkos::parallel_for
4369 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4370 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4371 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const MultiVectorTag<B> &, const member_type &member) const {
4379 const local_ordinal_type packidx = member.league_rank();
4380 const local_ordinal_type partidx = packptr(packidx);
4381 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4382 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4383 const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4384 const local_ordinal_type r0 = part2packrowidx0(partidx);
4385 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4386 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4387 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4389 internal_vector_scratch_type_3d_view
4390 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4391 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4392 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4394 Kokkos::parallel_for
4395 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4396 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4397 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorSubLineTag<B> &, const member_type &member) const {
4406 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4408 const local_ordinal_type subpartidx = packptr_sub(packidx);
4409 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4410 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4411 const local_ordinal_type partidx = subpartidx%n_parts;
4413 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4414 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4415 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4416 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4417 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4423 internal_vector_scratch_type_3d_view
4424 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4426 Kokkos::parallel_for
4427 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4428 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorApplyCTag<B> &, const member_type &member) const {
4438 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4440 const local_ordinal_type subpartidx = packptr_sub(packidx);
4441 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4442 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4443 const local_ordinal_type partidx = subpartidx%n_parts;
4444 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4447 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4448 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4449 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4451 internal_vector_scratch_type_3d_view
4452 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4456 const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
4457 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
4458 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
4463 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4465 const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0;
4466 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1;
      typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4471 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4472 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4474 if (local_subpartidx == 0) {
4475 Kokkos::parallel_for
4476 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4477 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4478 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4479 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4481 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4482 (default_mode_type,default_algo_type,
4484 blocksize, blocksize,
4486 C.data(), C.stride_0(), C.stride_1(),
4487 v_1.data(), v_1.stride_0(),
4489 v_2.data(), v_2.stride_0());
4492 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4493 Kokkos::parallel_for
4494 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4495 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4496 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4497 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4499 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4500 (default_mode_type,default_algo_type,
4502 blocksize, blocksize,
4504 C.data(), C.stride_0(), C.stride_1(),
4505 v_1.data(), v_1.stride_0(),
4507 v_2.data(), v_2.stride_0());
4511 Kokkos::parallel_for
4512 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4514 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4515 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4516 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4518 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4519 (default_mode_type,default_algo_type,
4521 blocksize, blocksize,
4523 C.data(), C.stride_0(), C.stride_1(),
4524 v_1.data(), v_1.stride_0(),
4526 v_2.data(), v_2.stride_0());
4529 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4530 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4531 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4533 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4534 (default_mode_type,default_algo_type,
4536 blocksize, blocksize,
4538 C.data(), C.stride_0(), C.stride_1(),
4539 v_1.data(), v_1.stride_0(),
4541 v_2.data(), v_2.stride_0());
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorSchurTag<B> &, const member_type &member) const {
4551 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4553 const local_ordinal_type partidx = packptr_sub(packidx);
4555 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4557 const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0);
4558 const local_ordinal_type nrows = 2*(n_subparts_per_part-1);
4560 const local_ordinal_type r0_schur = nrows * member.league_rank();
4562 internal_vector_scratch_type_3d_view
4563 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4565 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4566 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4567 for (local_ordinal_type i = 0; i < 2; ++i) {
4568 copy3DView<local_ordinal_type>(member,
4569 Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4570 Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4574 Kokkos::parallel_for
4575 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4576 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4579 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4580 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4581 for (local_ordinal_type i = 0; i < 2; ++i) {
4582 copy3DView<local_ordinal_type>(member,
4583 Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4584 Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorApplyETag<B> &, const member_type &member) const {
4593 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4595 const local_ordinal_type subpartidx = packptr_sub(packidx);
4596 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4597 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4598 const local_ordinal_type partidx = subpartidx%n_parts;
4599 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4601 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4602 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4604 internal_vector_scratch_type_3d_view
4605 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4609 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
      typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4614 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4615 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4617 if (local_subpartidx == 0) {
4618 Kokkos::parallel_for
4619 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4621 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4623 for (local_ordinal_type row = 0; row < nrows; ++row) {
4624 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4625 auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4627 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4628 (default_mode_type,default_algo_type,
4630 blocksize, blocksize,
4632 E.data(), E.stride_0(), E.stride_1(),
4633 v_2.data(), v_2.stride_0(),
4635 v_1.data(), v_1.stride_0());
4639 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4640 Kokkos::parallel_for
4641 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4642 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4644 for (local_ordinal_type row = 0; row < nrows; ++row) {
4645 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4646 auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4648 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4649 (default_mode_type,default_algo_type,
4651 blocksize, blocksize,
4653 E.data(), E.stride_0(), E.stride_1(),
4654 v_2.data(), v_2.stride_0(),
4656 v_1.data(), v_1.stride_0());
4661 Kokkos::parallel_for
4662 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4664 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4666 for (local_ordinal_type row = 0; row < nrows; ++row) {
4667 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4668 auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4670 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4671 (default_mode_type,default_algo_type,
4673 blocksize, blocksize,
4675 E.data(), E.stride_0(), E.stride_1(),
4676 v_2.data(), v_2.stride_0(),
4678 v_1.data(), v_1.stride_0());
4682 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4684 for (local_ordinal_type row = 0; row < nrows; ++row) {
4685 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4686 auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4688 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4689 (default_mode_type,default_algo_type,
4691 blocksize, blocksize,
4693 E.data(), E.stride_0(), E.stride_1(),
4694 v_2.data(), v_2.stride_0(),
4696 v_1.data(), v_1.stride_0());
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorCopyToFlatTag<B> &, const member_type &member) const {
4707 const local_ordinal_type packidx = member.league_rank();
4708 const local_ordinal_type partidx = packptr(packidx);
4709 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4710 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4711 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4712 const local_ordinal_type num_vectors = 1;
4714 Kokkos::parallel_for
4715 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4716 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleZeroingTag<B> &, const member_type &member) const {
4724 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4725 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
    void run(const impl_scalar_type_2d_view_tpetra &Y,
             const impl_scalar_type_1d_view &Z) {
      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
      IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4735 this->Y_scalar_multivector = Y;
4736 this->Z_scalar_vector = Z;
4738 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4739 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4741 const local_ordinal_type team_size =
4742 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4743 recommended_team_size(blocksize, vector_length, internal_vector_length);
4744 const int per_team_scratch = internal_vector_scratch_type_3d_view
4745 ::shmem_size(blocksize, num_vectors, vector_loop_size);
4747 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4748 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4749 if (num_vectors == 1) { \
4750 const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4751 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4752 Kokkos::parallel_for \
4753 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4754 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4756 const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4757 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4758 Kokkos::parallel_for \
4759 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4760 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
        if (num_vectors == 1) {                                         \
          if (packindices_schur.extent(1) <= 0) {                       \
            Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> >     \
              policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
            policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
            Kokkos::parallel_for                                        \
              ("SolveTridiags::TeamPolicy::run<SingleVector>",          \
               policy, *this);                                          \
          }                                                             \
          else {                                                        \
            {                                                           \
              Kokkos::TeamPolicy<execution_space,SingleZeroingTag<B> >  \
                policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
              Kokkos::parallel_for                                      \
                ("SolveTridiags::TeamPolicy::run<SingleZeroingTag>",    \
                 policy, *this);                                        \
            }                                                           \
            {                                                           \
              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
              Kokkos::TeamPolicy<execution_space,SingleVectorSubLineTag<B> > \
                policy(packindices_sub.extent(0), team_size, vector_loop_size); \
              policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
              Kokkos::parallel_for                                      \
                ("SolveTridiags::TeamPolicy::run<SingleVector>",        \
                 policy, *this);                                        \
              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
              IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)          \
            }                                                           \
            {                                                           \
              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
              Kokkos::TeamPolicy<execution_space,SingleVectorApplyCTag<B> > \
                policy(packindices_sub.extent(0), team_size, vector_loop_size); \
              policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
              Kokkos::parallel_for                                      \
                ("SolveTridiags::TeamPolicy::run<SingleVector>",        \
                 policy, *this);                                        \
              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
              IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)          \
            }                                                           \
            {                                                           \
              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
              Kokkos::TeamPolicy<execution_space,SingleVectorSchurTag<B> > \
                policy(packindices_schur.extent(0), team_size, vector_loop_size); \
              policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
              Kokkos::parallel_for                                      \
                ("SolveTridiags::TeamPolicy::run<SingleVector>",        \
                 policy, *this);                                        \
              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
              IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)          \
            }                                                           \
            {                                                           \
              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
              Kokkos::TeamPolicy<execution_space,SingleVectorApplyETag<B> > \
                policy(packindices_sub.extent(0), team_size, vector_loop_size); \
              policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
              Kokkos::parallel_for                                      \
                ("SolveTridiags::TeamPolicy::run<SingleVector>",        \
                 policy, *this);                                        \
              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
              IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)          \
            }                                                           \
            {                                                           \
              Kokkos::TeamPolicy<execution_space,SingleVectorCopyToFlatTag<B> > \
                policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
              Kokkos::parallel_for                                      \
                ("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
                 policy, *this);                                        \
            }                                                           \
          }                                                             \
        } else {                                                        \
          Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >        \
            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
          policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
          Kokkos::parallel_for                                          \
            ("SolveTridiags::TeamPolicy::run<MultiVector>",             \
             policy, *this);                                            \
        } break
#endif
        switch (blocksize) {
        case  3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
        case  5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
        case  6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 6);
        case  7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
        case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
        case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
        case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
        case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
        case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
        default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
        }
#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS

        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
      }
    };
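  ///
  /// Top-level apply interface: performs up to max_num_sweeps block-Jacobi
  /// sweeps, each consisting of a residual computation (optionally overlapped
  /// with the halo exchange) followed by the packed block tridiagonal solves,
  /// with optional norm-based early termination.  Returns the number of
  /// sweeps performed.
  ///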
  template<typename MatrixType>
  int
  applyInverseJacobi(// importer
                     const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                     const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                     const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
                     const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                     const bool overlap_communication_and_computation,
                     // tpetra interface
                     const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,  // input
                     typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,        // output
                     typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z,        // temporary (seq method only)
                     typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,       // temporary (norms)
                     // local object interface
                     const BlockHelperDetails::PartInterface<MatrixType> &interf,
                     const BlockTridiags<MatrixType> &btdm,
                     const BlockHelperDetails::AmD<MatrixType> &amd,
                     typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work,
                     BlockHelperDetails::NormManager<MatrixType> &norm_manager,
                     // preconditioner parameters
                     const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
                     bool is_y_zero,
                     const int max_num_sweeps,
                     const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
                     const int check_tol_every) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);

    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
    using node_memory_space = typename impl_type::node_memory_space;
    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using size_type = typename impl_type::size_type;
    using impl_scalar_type = typename impl_type::impl_scalar_type;
    using magnitude_type = typename impl_type::magnitude_type;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using vector_type_1d_view = typename impl_type::vector_type_1d_view;
    using vector_type_3d_view = typename impl_type::vector_type_3d_view;
    using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
    using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;

    // exactly one of the Tpetra importer and the async importer may be active
    TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
                                "Neither Tpetra importer nor Async importer is null.");
    // the maximum number of sweeps must be positive
    TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
                                "Maximum number of sweeps must be >= 1.");
    // const parameters
    const bool is_seq_method_requested = !tpetra_importer.is_null();
    const bool is_async_importer_active = !async_importer.is_null();
    const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
    const magnitude_type tolerance = tol*tol;
    const local_ordinal_type blocksize = btdm.values.extent(1);
    const local_ordinal_type num_vectors = Y.getNumVectors();
    const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;

    const impl_scalar_type zero(0.0);

    TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
                                "The seq method for applyInverseJacobi, " <<
                                "which in any case is for developer use only, " <<
                                "does not support norm-based termination.");
    const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
      Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
    TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
                               std::invalid_argument,
                               "The seq method for applyInverseJacobi, " <<
                               "which in any case is for developer use only, " <<
                               "only supports memory spaces accessible from host.");

    // if more workspace is needed, resize it
    const size_type work_span_required = num_blockrows*num_vectors*blocksize;
    if (work.span() < work_span_required)
      work = vector_type_1d_view("vector workspace 1d view", work_span_required);

    // construct W (per-pack norm workspace)
    const local_ordinal_type W_size = interf.packptr.extent(0)-1;
    if (local_ordinal_type(W.extent(0)) < W_size)
      W = impl_scalar_type_1d_view("W", W_size);
    typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
    {
      if (is_seq_method_requested) {
        if (Z.getNumVectors() != Y.getNumVectors())
          Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
      } else {
        if (is_async_importer_active) {
          // create the communication buffers and keep a view of the remote part
          async_importer->createDataBuffer(num_vectors);
          remote_multivector = async_importer->getRemoteMultiVectorLocalView();
        }
      }
    }

    // wrap the workspace with a 3d view: (block row, point in block, vector)
    vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
    const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
    const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
    const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
    if (is_y_zero) Kokkos::deep_copy(YY, zero);

    // converts between the Tpetra multivector layout and the packed pmv layout
    MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
    SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
                                             damping_factor, is_norm_manager_active);

    const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
    // residual evaluation r = b - R y needs the graph of A
    auto A_crs  = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
    auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);

    bool hasBlockCrsMatrix = ! A_bcrs.is_null ();

    const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());

    BlockHelperDetails::ComputeResidualVector<MatrixType>
      compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
                              is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
                              hasBlockCrsMatrix);

    // norm manager check frequency
    if (is_norm_manager_active)
      norm_manager.setCheckFrequency(check_tol_every);
    // iterate
    int sweep = 0;
    for (;sweep<max_num_sweeps;++sweep) {
      if (is_y_zero) {
        // pmv := x(lclrow)
        multivector_converter.run(XX);
      } else {
        if (is_seq_method_requested) {
          // SEQ METHOD IS FOR TESTING ONLY

          // y := x - R y
          Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
          compute_residual_vector.run(YY, XX, ZZ);

          // pmv := y(lclrow)
          multivector_converter.run(YY);
        } else {
          // fused residual computation and packing into pmv
          if (overlap_communication_and_computation || !is_async_importer_active) {
            if (is_async_importer_active) async_importer->asyncSendRecv(YY);
            // owned part of the residual while the halo exchange is in flight
            compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
            if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
              if (is_async_importer_active) async_importer->cancel();
              break;
            }
            if (is_async_importer_active) {
              async_importer->syncRecv();
              // remote part of the residual with the received values
              compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
            }
          } else {
            if (is_async_importer_active)
              async_importer->syncExchange(YY);
            if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
            compute_residual_vector.run(pmv, XX, YY, remote_multivector);
          }
        }
      }

      // pmv := inv(D) pmv, and apply the damped update to y
      solve_tridiags.run(YY, W);

      if (is_norm_manager_active) {
        // W holds the per-pack contributions to the norm of the update
        BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
        if (sweep + 1 == max_num_sweeps) {
          norm_manager.ireduce(sweep, true);
          norm_manager.checkDone(sweep + 1, tolerance, true);
        } else {
          norm_manager.ireduce(sweep);
        }
      }

      is_y_zero = false;
    }

    // sqrt the norms for the caller's use
    if (is_norm_manager_active) norm_manager.finalize();
    return sweep;
  }
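  ///
  /// Aggregate of the objects a BlockTriDiContainer keeps between the
  /// symbolic/numeric setup and apply: the matrix and importers, the part
  /// interface and packed block tridiagonals, and the mutable workspaces
  /// reused across applyInverseJacobi calls.
  ///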
  template<typename MatrixType>
  struct ImplObject {
    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
    using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
    using block_tridiags_type = BlockTridiags<MatrixType>;
    using amd_type = BlockHelperDetails::AmD<MatrixType>;
    using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
    using async_import_type = AsyncableImport<MatrixType>;
    // distributed objects
    Teuchos::RCP<const typename impl_type::tpetra_row_matrix_type> A;
    Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
    Teuchos::RCP<async_import_type> async_importer;
    bool overlap_communication_and_computation;
    // temporary copy of Y (mutable) and per-pack norm workspace
    mutable typename impl_type::tpetra_multivector_type Z;
    mutable typename impl_type::impl_scalar_type_1d_view W;
    // local objects
    part_interface_type part_interface;
    block_tridiags_type block_tridiags; // D
    amd_type a_minus_d;                 // R = A - D
    mutable typename impl_type::vector_type_1d_view work; // right-hand-side workspace
    mutable norm_manager_type norm_manager;
  };